Example #1
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, '_id': x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(self.client, docs, index='test-index', doc_type='answers', refresh=True):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
        self.assertEquals({"answer": 42}, self.client.get(index='test-index', doc_type='answers', id=42)['_source'])
    def test_rejected_documents_are_retried_at_most_max_retries_times(self):
        failing_client = FailingBulkClient(
            self.client, fail_at=(1, 2), fail_with=TransportError(429, "Rejected!", {})
        )

        docs = [
            {"_index": "i", "_type": "_doc", "_id": 47, "f": "v"},
            {"_index": "i", "_type": "_doc", "_id": 45, "f": "v"},
            {"_index": "i", "_type": "_doc", "_id": 42, "f": "v"},
        ]
        results = list(
            helpers.streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
                max_retries=1,
                initial_backoff=0,
            )
        )
        self.assertEquals(3, len(results))
        self.assertEquals([False, True, True], [r[0] for r in results])
        self.client.indices.refresh(index="i")
        res = self.client.search(index="i")
        self.assertEquals({"value": 2, "relation": "eq"}, res["hits"]["total"])
        self.assertEquals(4, failing_client._called)
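The tests above rely on a FailingBulkClient helper that fails selected bulk() calls with a given TransportError. The exact helper lives in the elasticsearch-py test suite; a minimal sketch of such a wrapper might look roughly like this:

from elasticsearch import TransportError

class FailingBulkClient(object):
    # Sketch of a client wrapper that raises on selected bulk() calls.
    def __init__(self, client, fail_at=(2,), fail_with=TransportError(599, "Error!", {})):
        self.client = client
        self.transport = client.transport  # the bulk helpers read the serializer from here
        self._fail_at = fail_at
        self._fail_with = fail_with
        self._called = 0

    def bulk(self, *args, **kwargs):
        self._called += 1
        if self._called in self._fail_at:
            raise self._fail_with
        return self.client.bulk(*args, **kwargs)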
Example #3
def load_data(client, data, index='booklist'):

    create_index(client, index)

    list_name = '1001books'

    # we let the streaming bulk continuously process the books as they come
    # in - since the `parse_list` function is a generator this will avoid
    # loading the whole list into memory
    for ok, result in streaming_bulk(
            client,
            parse_list(data, list_name),
            index=index,
            doc_type='book',
            chunk_size=50  # keep the batch sizes small for appearances only
    ):
        action, result = result.popitem()
        doc_id = '/%s/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print('Successfully indexed %s' % doc_id)
    def test_transport_error_can_be_caught(self):
        failing_client = FailingBulkClient(self.client)
        docs = [
            {'_index': 'i', '_type': 't', '_id': 47, 'f': 'v'},
            {'_index': 'i', '_type': 't', '_id': 45, 'f': 'v'},
            {'_index': 'i', '_type': 't', '_id': 42, 'f': 'v'},
        ]

        results = list(helpers.streaming_bulk(failing_client, docs, raise_on_exception=False, raise_on_error=False, chunk_size=1))
        self.assertEquals(3, len(results))
        self.assertEquals([True, False, True], [r[0] for r in results])

        exc = results[1][1]['index'].pop('exception')
        self.assertIsInstance(exc, TransportError)
        self.assertEquals(599, exc.status_code)
        self.assertEquals(
            {
                'index': {
                    '_index': 'i',
                    '_type': 't',
                    '_id': 45,
                    'data': {'f': 'v'},
                    'error': "TransportError(599, 'Error!')",
                    'status': 599
                }
            },
            results[1][1]
        )
Example #5
    def _index_loop(self):
        try:
            while not self.should_stop:
                msgs = []
                actions = self._actions(250, msgs)

                stream = helpers.streaming_bulk(
                    self.es_client,
                    actions,
                    max_chunk_bytes=self.MAX_CHUNK_BYTES,
                    raise_on_error=False,
                )

                start = time.time()
                for (ok, resp), msg in zip(stream, msgs):
                    if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
                        raise ValueError(ok, resp, msg)
                    assert len(resp.values()) == 1
                    _id = list(resp.values())[0]['_id']
                    assert msg.payload['ids'] == [util.IDObfuscator.decode_id(_id)], '{} {}'.format(msg.payload, util.IDObfuscator.decode_id(_id))
                    msg.ack()
                if len(msgs):
                    logger.info('%r: Indexed %d documents in %.02fs', self, len(msgs), time.time() - start)
                else:
                    logger.debug('%r: Received no messages for %.02fs', self, time.time() - start)
        except Exception as e:
            client.captureException()
            logger.exception('%r: _index_loop encountered an unexpected error', self)
            self.stop()
    def record_events(self, events):

        def _build_bulk_index(event_list):
            for ev in event_list:
                traits = {t.name: t.value for t in ev.traits}
                yield {'_op_type': 'create',
                       '_index': '%s_%s' % (self.index_name,
                                            ev.generated.date().isoformat()),
                       '_type': ev.event_type,
                       '_id': ev.message_id,
                       '_source': {'timestamp': ev.generated.isoformat(),
                                   'traits': traits,
                                   'raw': ev.raw}}

        error = None
        for ok, result in helpers.streaming_bulk(
                self.conn, _build_bulk_index(events)):
            if not ok:
                __, result = result.popitem()
                if result['status'] == 409:
                    LOG.info(_LI('Duplicate event detected, skipping it: %s'),
                             result)
                else:
                    LOG.exception(_LE('Failed to record event: %s'), result)
                    error = storage.StorageUnknownWriteError(result)

        if self._refresh_on_write:
            self.conn.indices.refresh(index='%s_*' % self.index_name)
            while self.conn.cluster.pending_tasks(local=True)['tasks']:
                pass
        if error:
            raise error
    def _index_all_blogitems(self):
        iterator = BlogItem.objects.all()
        category_names = dict((x.id, x.name) for x in Category.objects.all())
        categories = defaultdict(list)
        for e in BlogItem.categories.through.objects.all():
            categories[e.blogitem_id].append(category_names[e.category_id])

        es = connections.get_connection()
        report_every = 100
        count = 0
        doc_type_name = _get_doc_type_name(BlogItem)
        t0 = time.time()
        for success, doc in streaming_bulk(
            es,
            (m.to_search(all_categories=categories).to_dict(True) for m in iterator),
            index=settings.ES_BLOG_ITEM_INDEX,
            doc_type=doc_type_name,
        ):
            if not success:
                print("NOT SUCCESS!", doc)
            count += 1
            if not count % report_every:
                print(count)
        t1 = time.time()

        self.out("DONE Indexing {} blogitems in {} seconds".format(count, t1 - t0))
Example #8
def flush_to_es():
    """
    Flushes a stream of messages to elasticsearch using bulk flushing.
    Uses a generator to pull messages off the queue and passes this as an
     iterable to the streaming_bulk method.  streaming_bulk is also a generator
     that yields message data used for acking from the queue after they
     are flushed.
    :param bulk_size: the number of messages to flush at once to elasticsearch
    :param bulk_timeout:
    :return: length of time to wait for a message from queue
    """

    while True:

        try:
            es_client = es_handler.connection
            ack_list = list()
            actions = get_queue_stream(ack_list)
            bulker = es_helpers.streaming_bulk(
                es_client, actions, chunk_size=BULK_SIZE)
            _LOG.error("Post flush")

            for response in bulker:
                msg = ack_list.pop(0)
                msg_ok = response[0]

                if msg_ok:
                    msg.ack()

        except Exception as ex:
            _LOG.exception(ex)
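The `get_queue_stream` generator referenced above is not shown; a hypothetical sketch of the pattern it implements (pull a message, remember it for acking, yield a bulk action) might look like this, where MESSAGE_QUEUE and the message's payload attribute are assumptions:

import queue

# Hypothetical stand-in for the module-level queue used by flush_to_es above.
MESSAGE_QUEUE = queue.Queue()

def get_queue_stream(ack_list):
    # Sketch: drain messages from the queue and yield one bulk action per message,
    # keeping each message so it can be acked after streaming_bulk reports on it.
    while True:
        try:
            msg = MESSAGE_QUEUE.get(timeout=5)
        except queue.Empty:
            return
        ack_list.append(msg)
        yield {
            "_index": "messages",     # hypothetical index name
            "_op_type": "index",
            "_source": msg.payload,   # assumes each message carries a dict payload
        }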
Example #9
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='doc',
            chunk_size=50 # keep the batch sizes small for appearances only
        ):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
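`parse_commits` and `create_git_index` come from the elasticsearch-py git example; a simplified sketch of the kind of actions `parse_commits` yields (field names here are illustrative) could be:

def parse_commits(head, name):
    # Simplified sketch: walk the history from `head` and yield one document per commit.
    for commit in head.traverse():
        yield {
            '_id': commit.hexsha,
            'repository': name,
            'committed_date': commit.committed_datetime,
            'committer': {'name': commit.committer.name, 'email': commit.committer.email},
            'description': commit.message,
        }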
Example #10
def load_report(client, report_path):
    client.create(
        index='report',
        doc_type='header',
        body={
            "mappings": {
                "header": {
                    "_timestamp": {
                        "enabled": True,
                        "type": "date",
                        "format": "yyyy-MM-dd HH:mm:ss",
                        "store": True,
                        "path": "timestamp"
                    },
                    "properties": header_properties
                }
            }
        },
        ignore=409  # 409 - conflict
    )
    for ok, result in streaming_bulk(
            client,
            parse_report(report_path),
            index="report",
            doc_type="header"):
        action, result = result.popitem()
        doc_id = '/report/%s' % (result['_id'])
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
        client.indices.refresh(index='report')
    def test_actions_remain_unchanged(self):
        actions = [{"_id": 1}, {"_id": 2}]
        for ok, item in helpers.streaming_bulk(
            self.client, actions, index="test-index"
        ):
            self.assertTrue(ok)
        self.assertEquals([{"_id": 1}, {"_id": 2}], actions)
    def record_events(self, events):

        def _build_bulk_index(event_list):
            for ev in event_list:
                traits = {t.name: t.value for t in ev.traits}
                yield {'_op_type': 'create',
                       '_index': '%s_%s' % (self.index_name,
                                            ev.generated.date().isoformat()),
                       '_type': ev.event_type,
                       '_id': ev.message_id,
                       '_source': {'timestamp': ev.generated.isoformat(),
                                   'traits': traits,
                                   'raw': ev.raw}}

        problem_events = []
        for ok, result in helpers.streaming_bulk(
                self.conn, _build_bulk_index(events)):
            if not ok:
                __, result = result.popitem()
                if result['status'] == 409:
                    problem_events.append((models.Event.DUPLICATE,
                                           result['_id']))
                else:
                    problem_events.append((models.Event.UNKNOWN_PROBLEM,
                                           result['_id']))

        if self._refresh_on_write:
            self.conn.indices.refresh(index='%s_*' % self.index_name)
            while self.conn.cluster.pending_tasks(local=True)['tasks']:
                pass
        return problem_events
Example #13
    def index(self, annotation_ids=None):
        """
        Reindex annotations.

        :param annotation_ids: a list of ids to reindex, reindexes all when `None`.
        :type annotation_ids: collection

        :returns: a set of errored ids
        :rtype: set
        """
        if not annotation_ids:
            annotations = _all_annotations(session=self.session,
                                           windowsize=PG_WINDOW_SIZE)
        else:
            annotations = _filtered_annotations(session=self.session,
                                                ids=annotation_ids)

        # Report indexing status as we go
        annotations = _log_status(annotations)

        indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                             chunk_size=ES_CHUNK_SIZE,
                                             raise_on_error=False,
                                             expand_action_callback=self._prepare)
        errored = set()
        for ok, item in indexing:
            if not ok:
                status = item[self.op_type]

                was_doc_exists_err = 'document already exists' in status['error']
                if self.op_type == 'create' and was_doc_exists_err:
                    continue

                errored.add(status['_id'])
        return errored
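Hypothetical usage of the indexer above, reindexing a few annotations by id and reporting the ones that errored:

# Hypothetical usage; `indexer` is an instance of the class defined above.
errored = indexer.index(annotation_ids=['ann-1', 'ann-2', 'ann-3'])
if errored:
    print('Failed to index %d annotations: %s' % (len(errored), sorted(errored)))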
def main():
    es = Elasticsearch([{'host' : sys.argv[1], 'port' : sys.argv[2]}])
    index = sys.argv[3]
    filenames = os.listdir('.')
    for f in filenames:
        if f.endswith('.csv'):
            for a, b in streaming_bulk(es, get_docs(f, index)):
                print(a, b)
Example #15
    def index(self, points):
        for p in points:
            p['_index'] = self.config['indexer']['idx_name']
            p['_type'] = 'policy-metric'
        results = helpers.streaming_bulk(self.client, points)
        for status, r in results:
            if not status:
                log.debug("index err result %s", r)
Example #16
def django_import():
    # es.indices.delete(index='traffic', ignore=404)
    # TrafficReport.init()
    i = 0
    for ok, info in streaming_bulk(es, get_provoz(), doc_type="traffic_report", index="traffic"):
        i += 1
        if i % 1000 == 0:
            print(i, "documents done")
Example #17
def index_model(self, model_name, ids, es_url=None, es_index=None):
    # TODO This method should not have to exist anymore
    es_client = Elasticsearch(es_url or settings.ELASTICSEARCH['URL'], retry_on_timeout=True, timeout=settings.ELASTICSEARCH['TIMEOUT'])
    action_gen = indexing.ElasticsearchActionGenerator([settings.ELASTICSEARCH['INDEX']], [indexing.FakeMessage(model_name, ids)])
    stream = helpers.streaming_bulk(es_client, (x for x in action_gen if x), max_chunk_bytes=10 * 1024 ** 2, raise_on_error=False)

    for ok, resp in stream:
        if not ok and not (resp.get('delete') and resp['delete']['status'] == 404):
            raise ValueError(resp)
Example #18
def log_to_elasticsearch(data, params, client, index='test', doc_type='test', chunk_size=10):
    try:
        data_gen = ({"_index": index, "_type": doc_type, "_op_type": "index", "_id": d['id'], "source": d} for d in data)
        for a, b in streaming_bulk(client, data_gen, chunk_size=chunk_size):
            pass

        return True
    except Exception:
        return False
    def streaming_bulk():
        results = list(helpers.streaming_bulk(
            failing_client,
            [{"a": 42}, {"a": 39}],
            raise_on_exception=True,
            max_retries=3,
            initial_backoff=0
        ))
        return results
Example #20
    def index(self):
        response = helpers.streaming_bulk(
            self.es,
            self._actions,
            chunk_size=self._configuration["chunk_size"],
            raise_on_error=self._configuration["raise_on_error"],
            raise_on_exception=self._configuration["raise_on_exception"],
        )
        for ok, result in response:
            action, result = result.popitem()
            doc_id = '/commits/%s' % (result['_id'])
            if not ok:
                self.logger.error("Failed to insert %s %s %s", action, doc_id, result)
            else:
                self.logger.warning("Success %d", ok)
Example #21
    def index_documents(self):
        models = list(get_indexed_models())

        for model in models:
            self.save_mapping(model)

            model_instances = model.get_indexable().iterator()
            docs = (self.to_indexable_dict(d) for d in model_instances)
            for ok, info in streaming_bulk(self.es, docs):
                print("  Document with id %s indexed." % info['index']['_id'])
Example #22
def bulk_load(docs_to_index):
    conn = connections.get_connection()
    index = NameVariant._doc_type.index

    for response in streaming_bulk(
            conn,
            docs_to_index,
            index=index,
            doc_type=NameVariant._doc_type.name):
        pass
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }

                parent_id = self._get_parent_id(doc_type, doc)
                if parent_id is not None:
                    document_action["_parent"] = parent_id
                    document_action["_source"] = self._formatter.format_document(doc)

                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(
            self.client, docs, index="test-index", refresh=True
        ):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index="test-index")["count"])
        self.assertEquals(
            {"answer": 42}, self.client.get(index="test-index", id=42)["_source"]
        )
Example #25
    def index_all(cls, index_name, using=None, **kwargs):
        def actions_generator():
            for obj in cls.index_queryset().iterator():
                elastic_data = cls.from_django(obj).to_dict(include_meta=True)
                elastic_data['_index'] = index_name
                yield elastic_data

        client = connections.get_connection(using or cls._doc_type.using)
        cls.init(index_name)
        for ok, item in streaming_bulk(client, actions_generator(), chunk_size=100, **kwargs):
            yield ok, item
Example #26
    def reindex(self):
        conn = connections.get_connection()
        docs_to_index = [
            ElasticAddress(**p.to_dict())
            for p in self]

        for response in streaming_bulk(
                conn, ({'_index': getattr(d.meta, 'index', d._doc_type.index),
                        '_type': d._doc_type.name,
                        '_source': d.to_dict()} for d in docs_to_index)):
            pass
Example #27
    def index_all(self, docs):
        actions = map(self.make_index_action, docs)
        bulk_results = streaming_bulk(
            self.elastic,
            actions,
            raise_on_error=False,
            raise_on_exception=False,
        )
        for is_successful, response in bulk_results:
            if not is_successful:
                print("Error indexing a document: %s" % str(response))
Example #28
def bulk_load(questions):
    all_ok = True
    es_questions = (q.as_elasticsearch_dict() for q in questions)
    for ok, result in streaming_bulk(get_client(),
                                     es_questions,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
Example #29
    def _index_loop(self):
        try:
            while not self.should_stop:
                msgs = []
                actions = self._actions(250, msgs)
                tries = 0

                while not self.should_stop:
                    stream = helpers.streaming_bulk(
                        self.es_client,
                        actions,
                        max_chunk_bytes=self.MAX_CHUNK_BYTES,
                        raise_on_error=False,
                    )

                    start = time.time()
                    try:
                        for (ok, resp), msg in zip(stream, msgs):
                            if not ok and not (resp.get('delete')
                                               and resp['delete']['status']
                                               == 404):
                                raise ValueError(ok, resp, msg)
                            assert len(resp.values()) == 1
                            _id = list(resp.values())[0]['_id']
                            assert msg.payload['ids'] == [
                                util.IDObfuscator.decode_id(_id)
                            ], '{} {}'.format(msg.payload,
                                              util.IDObfuscator.decode_id(_id))
                            msg.ack()
                        if len(msgs):
                            logger.info('%r: Indexed %d documents in %.02fs',
                                        self, len(msgs),
                                        time.time() - start)
                        else:
                            logger.debug('%r: Received no messages for %.02fs',
                                         self,
                                         time.time() - start)
                        break
                    except ConnectionTimeout:
                        if tries >= self.TIMEOUT_RETRIES:
                            raise
                        tries += 1
                        logger.warning(
                            'Connection to elasticsearch timed out. Trying again after %s sec...',
                            self.TIMEOUT_INTERVAL)
                        time.sleep(self.TIMEOUT_INTERVAL)
                        continue
        except Exception as e:
            client.captureException()
            logger.exception('%r: _index_loop encountered an unexpected error',
                             self)
            self.should_stop = True
            raise SystemExit(1)
Example #30
 def _copy_data(self):
     ss_kw = {}
     # sort
     if self.source_sort:
         ss_kw['sort'] = self.source_sort
     scroll = self.source_es.search(index=self.source_index,
                                    scroll='1m',
                                    search_type='scan',
                                    size=self.bulk_size,
                                    version=True,
                                    timeout='60s',
                                    **ss_kw)
     sid = scroll['_scroll_id']
     total_size = scroll['hits']['total']
     hits_size = total_size
     dealt_size = 0
     print("docs: " + str(total_size))
     self.logger.info("docs: " + str(total_size))
     suffix = '%(percent)d%% - %(index)d [%(elapsed_td)s / %(eta_td)s]'
     bar = ShadyBar("clone", suffix=suffix, max=total_size)
     while (hits_size > 0):
         scroll = self.source_es.scroll(scroll_id=sid, scroll='1m')
         sid = scroll['_scroll_id']
         hits = scroll['hits']['hits']
         hits_size = len(hits)
         actions = self._bulk_hits(hits)
         if (len(actions) > 0):
             kw = {}
             kw['timeout'] = '60s'
             res = []
             try:
                 res = streaming_bulk(client=self.target_es,
                                      actions=actions,
                                      **kw)
             except BulkIndexError as err:
                 print(err)
                 pass
             okNum = 0
             for ok, re in res:
                 if not ok:
                     print(re)
                 else:
                     okNum += 1
             # refresh index
             if (okNum > 0):
                 self.target_es.indices.refresh(index=self.target_index)
         # dealt size
         dealt_size += hits_size
         bar.goto(dealt_size)
         self.logger.info("dealt: " + str(dealt_size) + " / " +
                          str(total_size))
     print('\nDone !')
     self.logger.info("Done ! \n\n")
Example #31
def make_es_index_snippets(es_client,
                           passages_dset,
                           index_name="english_wiki_kilt_snippets_100w"):
    index_config = {
        "settings": {
            "number_of_shards": 1,
            "analysis": {
                "analyzer": {
                    "stop_standard": {
                        "type": "standard",
                        " stopwords": "_english_"
                    }
                }
            },
        },
        "mappings": {
            "properties": {
                "article_title": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
                "section_title": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
                "passage_text": {
                    "type": "text",
                    "analyzer": "standard",
                    "similarity": "BM25"
                },
            }
        },
    }
    es_client.indices.create(index=index_name, body=index_config)
    number_of_docs = passages_dset.num_rows
    progress = tqdm(unit="docs", total=number_of_docs)
    successes = 0

    def passage_generator():
        for passage in passages_dset:
            yield passage

    # index the passages with the streaming bulk helper
    for ok, action in streaming_bulk(
            client=es_client,
            index=index_name,
            actions=passage_generator(),
    ):
        progress.update(1)
        successes += ok
    print("Indexed %d documents" % (successes, ))
Example #32
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        for ok, item in helpers.streaming_bulk(self.client,
                                               docs,
                                               index="test-index",
                                               refresh=True):
            self.assertTrue(ok)

        self.assertEquals(100, self.client.count(index="test-index")["count"])
        self.assertEquals({"answer": 42},
                          self.client.get(index="test-index",
                                          id=42)["_source"])
Example #33
    def load_data(self, filepath):
        """
        loads data from event to target
        :returns: `bool` of status result
        """

        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        inserts = 0
        updates = 0
        noops = 0
        fails = 0

        LOGGER.debug('Received file {}'.format(self.filepath))
        chunk_size = 80000

        # check for shapefile dependencies
        if self.check_shapefile_deps():

            # deactivate old forecasts for current storm name
            self.deactivate_old_forecasts()

            # generate geojson features
            package = self.generate_geojson_features()
            for ok, response in helpers.streaming_bulk(self.ES,
                                                       package,
                                                       chunk_size=chunk_size,
                                                       request_timeout=30):
                status = response['update']['result']

                if status == 'created':
                    inserts += 1
                elif status == 'updated':
                    updates += 1
                elif status == 'noop':
                    noops += 1
                else:
                    LOGGER.warning('Unhandled status code {}'.format(status))

            total = inserts + updates + noops + fails
            LOGGER.info('Inserted package of {} hurricane {} ({} inserts,'
                        ' {} updates, {} no-ops, {} rejects)'.format(
                            total, self.storm_variable, inserts, updates,
                            noops, fails))
            return True

        else:
            LOGGER.debug("All Shapefile dependencies not found. Ignoring "
                         "file...")
            return False
Example #34
    def reindex(self):
        conn = connections.get_connection()
        docs_to_index = [
            ElasticOwnership(**p.to_dict(include_address=True,
                             include_name_alternatives=True))
            for p in self]

        for response in streaming_bulk(
                conn, ({'_index': getattr(d.meta, 'index', d._doc_type.index),
                        '_type': d._doc_type.name,
                        '_source': d.to_dict()} for d in docs_to_index)):
            pass
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)

                if doc['created_at']:
                    print(doc['created_at'])

                doc_id = u(doc.pop("_id"))
                document_action = {
                    '_index': index,
                    '_type': doc_type,
                    '_id': doc_id,
                    '_source': self._formatter.format_document(doc)
                }
                document_meta = {
                    '_index': self.meta_index_name,
                    '_type': self.meta_type,
                    '_id': doc_id,
                    '_source': {
                        'ns': namespace,
                        '_ts': timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
    def add(self, docs):
        if not self.es.indices.exists(self.index_name):
            self.create_index()
        count = 0
        for result in streaming_bulk(
            self.es,
            docs,
            raise_on_error=True,
            index=self.index_name
        ):
            count += 1
        logger.info("Added %d docs", count)
    def submit_elastic_package(self, package, request_size=10000):
        """
        helper function to send an update request to Elasticsearch and
        log the status of the request. Returns True if the upload succeeded.

        :param package: Iterable of bulk API update actions.
        :param request_size: Number of documents to upload per request.

        :returns: `bool` of whether the operation was successful.
        """

        inserts = 0
        updates = 0
        noops = 0
        errors = []

        try:
            for ok, response in streaming_bulk(
                    self.Elasticsearch,
                    package,
                    chunk_size=request_size,
                    request_timeout=MSC_PYGEOAPI_ES_TIMEOUT,
                    raise_on_error=False,
            ):
                if not ok:
                    errors.append(response)
                else:
                    status = response['update']['result']

                    if status == 'created':
                        inserts += 1
                    elif status == 'updated':
                        updates += 1
                    elif status == 'noop':
                        noops += 1
                    else:
                        LOGGER.error('Unhandled status code {}'.format(status))
                        errors.append(response)
        except BulkIndexError as err:
            LOGGER.error('Unable to perform bulk insert due to: {}'.format(
                err.errors))
            return False

        total = inserts + updates + noops
        LOGGER.info('Inserted package of {} documents ({} inserts, {} updates,'
                    ' {} no-ops)'.format(total, inserts, updates, noops))

        if len(errors) > 0:
            LOGGER.warning('{} errors encountered in bulk insert: {}'.format(
                len(errors), errors))
            return False

        return True
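`package` is expected to be an iterable of bulk update actions; a hypothetical generator matching the `response['update']['result']` handling above might be:

def generate_update_actions(index_name, records):
    # Hypothetical sketch: upsert each record so every bulk response item is an 'update'.
    for record in records:
        yield {
            '_op_type': 'update',
            '_index': index_name,
            '_id': record['id'],   # assumes each record carries its own id
            'doc': record,
            'doc_as_upsert': True,
        }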
def index_messages(indexed_messages, messages):
    num_messages = len(messages)
    successes = 0
    for ok, action in streaming_bulk(client=client,
                                     index=index_name,
                                     actions=extract_es_messages(
                                         indexed_messages, messages)):
        successes += ok
    if (successes != num_messages):
        print('Warning!: only %d/%d messages were indexed' %
              (successes, num_messages))
    print('Processed ' + str(len(messages)) + ' messages')
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):

    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
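`is_es_version_7` is not shown above; a minimal sketch based on the cluster info API could be:

def is_es_version_7(es_client):
    # Sketch: read the server version reported by the cluster and treat 7.x and later as "7".
    version = es_client.info()["version"]["number"]
    return int(version.split(".")[0]) >= 7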
def bulk_elasticsearch(r_queue, w_lock, dbs, db_name):
    ES_LOGGER.info("Bulk Host: %s DB: %s Start" % (dbs["db_host"], db_name))
    es = Elasticsearch(dbs["es_colony"], retry_on_timeout=True, max_retries=3, timeout=3600)
    flag = True
    bulks = []
    data_lines_number = 0
    bulk_length = 0
    while flag:
        while not r_queue.empty():
            if bulk_length == 0:
                w_lock.acquire()
            data = r_queue.get()
            data_lines_number += 1
            bulk_length += 1
            if bulk_length >= BULK_LENGTH or r_queue.empty():
                w_lock.release()
            if isinstance(data, str) and data == "False":
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s" % (dbs["db_host"], db_name, bulk_length))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                except Exception as e:
                    ES_LOGGER.warning(e)
                flag = False
                break
            bulks.append({"_index": dbs["index"], "_type": dbs["doc_type"], "_source": data})
            if bulk_length >= BULK_LENGTH:
                try:
                    ES_LOGGER.info("Bulk Host: %s DB: %s Data: %s" % (dbs["db_host"], db_name, data_lines_number))
                    streaming_bulks = helpers.streaming_bulk(es, bulks, chunk_size=len(bulks))
                    for streaming_bulk in streaming_bulks:
                        if streaming_bulk[0]:
                            pass
                    bulks = []
                    bulk_length = 0
                except Exception as e:
                    ES_LOGGER.warning("Bulk Error! %s", e)
def upload(informat, name, order, data, elastic, index, typ, sql=False,
           verbose=True, with_id=False):
    """ Uploads the data to elastic and the database
        sql      if True,  the data will be stored in the SQL data base as well
                 as ElasticSearch
                 if False, the data will only be stored in ElasticSearch
        informat can either be xml  - lmf
                               json - a single json object or a list of objects
                               bulk - a list of json objects annotated with
                                      index and type information, as accepted
                                      by ElasticSearch
    """
    try:
        # The actual parsing
        data = parse_upload(informat, name, order, data, index, typ,
                            with_id=with_id)
    except Exception:
        print('Error while reading data from %s' % name)
        raise

    ok = 0
    if sql:
        # stream entries one by one to elastic, then update sql db
        # streaming_bulk will notify us at once when an entry fails
        sql_bulk = []
        for res in helpers.streaming_bulk(elastic, data):
            # res is a tuple, res[0]==True
            ansname = 'index' if with_id else 'create'
            _id = res[1].get(ansname).get('_id')
            source = data[ok].get('_source')
            if isinstance(source, dict):
                source = json.dumps(source)
            sql_bulk.append((_id, source, 'admin',
                             'entry automatically added or reloaded', name,
                             'imported'))
            ok += 1
        db_loaded, db_error = db.update_bulk(name, sql_bulk)
        if db_error:
            raise Exception(db_error)
        ok += db_loaded
    else:
        # upload all at once to elastic
        ok, err = helpers.bulk(elastic, data)
        if err:
            msg = "Error during upload. %s documents successfully uploaded. \
                   Message: %s.\n"
            raise Exception(msg % (ok, '\n'.join(err)))
    if not ok:
        print("Warning. 0 documents uploaded\n", file=sys.stderr)
        raise Exception("No data")
    if verbose:
        print("Ok. %s documents uploaded\n" % ok)
Example #42
    def _bulk(
        self,
        index: str,
        docs: Generator,
        chunk_size: int,
        max_chunk_bytes: int,
        queue_size: int,
        thread_count: int,
        refresh: bool,
        max_retries: int,
        initial_backoff: int,
        max_backoff: int,
        raise_on_exception: bool,
        raise_on_error: bool,
    ):
        """Bulk index, update, delete docs to Elasticsearch."""

        # when using multiple threads for poll_db we need to account for other
        # threads performing deletions
        ignore_status: Tuple[int] = (400, 404)

        if ELASTICSEARCH_STREAMING_BULK:
            for _ in helpers.streaming_bulk(
                    self.__es,
                    docs,
                    index=index,
                    chunk_size=chunk_size,
                    max_chunk_bytes=max_chunk_bytes,
                    max_retries=max_retries,
                    max_backoff=max_backoff,
                    initial_backoff=initial_backoff,
                    refresh=refresh,
                    raise_on_exception=raise_on_exception,
                    raise_on_error=raise_on_error,
            ):
                self.doc_count += 1
        else:
            # parallel bulk consumes more memory and is also more likely
            # to result in 429 errors.
            for _ in helpers.parallel_bulk(
                    self.__es,
                    docs,
                    thread_count=thread_count,
                    chunk_size=chunk_size,
                    max_chunk_bytes=max_chunk_bytes,
                    queue_size=queue_size,
                    refresh=refresh,
                    raise_on_exception=raise_on_exception,
                    raise_on_error=raise_on_error,
                    ignore_status=ignore_status,
            ):
                self.doc_count += 1
    def handle_command(self, doc, namespace, timestamp):
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support drop database.")
            # dbs = self.command_helper.map_db(db)
            # for _db in dbs:
            #     self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                # Elasticsearch 7 removed the type concept, so the MongoDB collection name is mapped to an ES index
                index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
                # self.elastic.indices.put_mapping(
                #     index=index, doc_type='_doc', body={"_source": {"enabled": True}}
                # )
                # self.elastic.indices.put_mapping(
                #     index=index, body={"_source" : {"enabled" : True}}
                # )

                # bypass table creation in MongoDB; in ES7, index creation is lazy
                warnings.warn(
                    "Bypassing creation of %s on index %s; in ES7 the index "
                    "will be created lazily." % (coll, db))

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s."
                              "The mapping definition will persist and must be"
                              "removed manually." % (coll, db))
                # Elasticsearch 7 removed the type concept, so the MongoDB collection name is mapped to an ES index
                index = '{db}_{tb}'.format(db=db.lower(), tb=coll)
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=index, doc_type='_doc')),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)
Example #44
    def bulk_index(self,
                   records_uuids,
                   request_timeout=None,
                   max_chunk_bytes=None):
        """Starts bulk indexing for specified records

        Args:
            records_uuids(list[str]): List of strings which are UUIDs of records
                to reindex
            request_timeout(int): Maximum time after which es will throw an exception

        Returns:
            dict: dict with success count and failure list
                (with uuids of failed records)

        """
        if not request_timeout:
            request_timeout = current_app.config[
                "INDEXER_BULK_REQUEST_TIMEOUT"]
        max_chunk_bytes = max_chunk_bytes or 100 * 1024 * 1024  # default ES setting
        result = streaming_bulk(
            es,
            self.bulk_iterator(records_uuids),
            request_timeout=request_timeout,
            raise_on_error=False,
            raise_on_exception=False,
            expand_action_callback=(_es7_expand_action),
            max_retries=5,  # Retries on Error 429
            initial_backoff=10,  # wait for initial_backoff * 2^retry_number,
            max_chunk_bytes=max_chunk_bytes,
        )

        failures = []
        for action_success, action_data in result:
            if not action_success:
                failures.append({
                    "status_code": action_data["index"]["status"],
                    "error_type": str(get_value(action_data, "index.error.type", "")),
                    "failure_reason": str(get_value(action_data, "index.error.reason", "")),
                })

        number_of_failures = len(failures)

        return {
            "uuids": records_uuids,
            "success_count": len(records_uuids) - number_of_failures,
            "failures_count": number_of_failures,
            "failures": failures,
        }
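Hypothetical usage of `bulk_index`, reindexing a batch of record UUIDs and inspecting any failures:

# Hypothetical usage; `indexer` is an instance of the class defined above.
result = indexer.bulk_index(records_uuids=['uuid-1', 'uuid-2'], request_timeout=30)
if result['failures_count']:
    for failure in result['failures']:
        print(failure['status_code'], failure['error_type'], failure['failure_reason'])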
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            if "Groups" in namespace:
                LOG.error("DEBUGG:: es bulk upsert groups _ids: %s" %
                          [x.get("_id") for x in docs])
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    },
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass
Example #46
def merge_events(index_alias, events: List[Dict]):
    connection = es.connection
    try:
        # Index
        payload_length = len(events)
        index_start_time = time.time()
        actions = build_actions(events)
        updated, errors = [], []
        success, failed = 0, 0
        for ok, item in streaming_bulk(connection,
                                       actions,
                                       index=index_alias,
                                       _source=True):
            if not ok:
                errors.append(item)
                failed += 1
            else:
                updated.append(item["update"]["get"]["_source"])
                success += 1
        index_spent = time.time() - index_start_time
        logger.debug(
            f"--- Indexed {payload_length} in {index_spent} seconds, "
            f"Index latency: {(index_spent / payload_length) * 1000}ms ---")
        # Finalize
        if not failed:
            fanout(updated)
            return {"success": success}, 201
        else:
            return {
                "success": success,
                "failed": failed,
                "errors": errors
            }, 400
    except es_exceptions.ConnectionError:
        return responses.search_backend_unavailable
    except es_exceptions.RequestError as e:
        logger.error(e.info)
        return f"Request error", 409
    except bulk_errors.BulkIndexError as e:
        ignorable_errors = ["max_bytes_length_exceeded_exception"]
        for error in e.errors:
            try:
                err = error["update"]["error"]["caused_by"]["type"]
                if err in ignorable_errors:
                    logger.warning(
                        f"Payload caused an error {err} and leek did not index it!"
                    )
                    return "Processed", 201
            except KeyError:
                pass
        logger.error(e.errors)
        return f"Bulk update error", 409
    def test_transport_error_can_be_caught(self):
        failing_client = FailingBulkClient(self.client)
        docs = [
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 47,
                "f": "v"
            },
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 45,
                "f": "v"
            },
            {
                "_index": "i",
                "_type": "_doc",
                "_id": 42,
                "f": "v"
            },
        ]

        results = list(
            helpers.streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
            ))
        self.assertEquals(3, len(results))
        self.assertEquals([True, False, True], [r[0] for r in results])

        exc = results[1][1]["index"].pop("exception")
        self.assertIsInstance(exc, TransportError)
        self.assertEquals(599, exc.status_code)
        self.assertEquals(
            {
                "index": {
                    "_index": "i",
                    "_type": "_doc",
                    "_id": 45,
                    "data": {
                        "f": "v"
                    },
                    "error": "TransportError(599, 'Error!')",
                    "status": 599,
                }
            },
            results[1][1],
        )
Example #48
def index_documents(path: str, name: str):
    """Use the streaming bulk API to index some documents"""
    # TODO: inject hostname
    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 49200}])
    for ok, result in streaming_bulk(es, file_iterable(path, name)):
        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (name, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)
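`file_iterable` is assumed to yield one document per line of the input files; a hypothetical sketch:

import json
import os

def file_iterable(path, name):
    # Hypothetical generator: read JSON-lines files under `path` and yield actions
    # targeted at the `name` index.
    for filename in os.listdir(path):
        if not filename.endswith('.jsonl'):
            continue
        with open(os.path.join(path, filename)) as fp:
            for line in fp:
                yield {'_index': name, '_source': json.loads(line)}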
Example #49
def bulk_load(movies):
    all_ok = True
    es_movies = (q.as_elasticsearch_dict() for q in movies)
    for ok, result in streaming_bulk(get_client(),
                                     es_movies,
                                     index=settings.ES_INDEX,
                                     raise_on_error=False):
        # this loop logs any error that occurs while loading a movie
        if not ok:
            all_ok = False
            action, result = result.popitem()
            logger.error(FAILED_TO_LOAD_ERROR.format(result['_id'], result))
    return all_ok
Example #50
    def bulk_operation(cls, index=None, client=None, **options):
        for ok, result in streaming_bulk(
                client or cls.client,
                cls._bulk_stream(**options),
                index=index or cls.document._default_index(),
                raise_on_error=False,
                yield_ok=False,
                chunk_size=cls.data_bulk_limit
        ):
            if not ok:
                action, result = result.popitem()
                doc_id = '/%s/_doc/%s' % (index, result['_id'])
                logger.warning('Failed to {} document {}: {}'.format(action, doc_id, result))
Example #51
    def index(self) -> bool:
        if not self.get_available_fields().count():
            self.task.info(self.task, "No hay series para indexar en este catálogo")
            return False

        index_ok = False
        for success, info in streaming_bulk(self.elastic, self.generate_actions()):
            if not success:
                self.task.info(self.task, 'Error indexing: {}'.format(info))
            else:
                index_ok = True

        return index_ok
def streaming_post_to_es(client, chunk, index_name, job_id=None, doc_type="transaction_mapping"):
    success, failed = 0, 0
    try:
        for ok, item in helpers.streaming_bulk(client, chunk, index=index_name, doc_type=doc_type):
            success = [success, success + 1][ok]
            failed = [failed + 1, failed][ok]

    except Exception as e:
        print("MASSIVE FAIL!!!\n\n{}\n\n{}".format(str(e)[:5000], "*" * 80))
        raise SystemExit(1)

    printf({"msg": "Success: {}, Fails: {}".format(success, failed), "job": job_id, "f": "ES Ingest"})
    return success, failed
Example #53
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
def es_index(es, index, gffdb, reader, doctype):
    checkindex(es, index)
    for ok, result in streaming_bulk(es,
                                     reader(gffdb),
                                     index=index,
                                     doc_type=doctype,
                                     chunk_size=chunksize):
        if not ok:
            action, result = result.popitem()
            doc_id = '/%s/commits/%s' % (args.index, result['_id'])
            print('Failed to %s document %s: %r' % (action, doc_id, result))
    es.indices.refresh(index=index)
    return
Example #55
    def index_fixture_data(self, source_filepath, doc_cls):
        added_ids = []
        for ok, result in streaming_bulk(self.connection,
                                         self.prepare_data(
                                             source_filepath, doc_cls),
                                         refresh=True):
            action, result = result.popitem()
            if not ok:
                raise Exception("Failed to {} document {}: {}".format(
                    action, result["_id"], result))
            else:
                added_ids.append(result["_id"])
        return added_ids
def load(client, path='variants.tsv', index='variants'):
    create_variants_index(client, index)
    for ok, result in streaming_bulk(client,
                                     gen_variants(path),
                                     index=index,
                                     doc_type='variant',
                                     chunk_size=100):

        action, result = result.popitem()
        doc_id = '/%s/doc/%s' % (index, result['_id'])
        if not ok:
            raise Exception('Failed to %s document %s: %r' %
                            (action, doc_id, result))
Example #57
def create_docs(client, items):
    success, failed = 0, 0
    for ok, result in streaming_bulk(client,
                                     items,
                                     index="judgment",
                                     doc_type="doc",
                                     max_retries=5,
                                     chunk_size=250):
        if not ok:
            failed += 1
        else:
            success += 1
    print(f"Created {success} indexes, with failed {failed}.")
Example #58
    def index(self):
        """send csv to ES index"""
        self.logger.info('Setting up Elasticsearch index...')
        elastic = Elasticsearch(host=self.host, port=self.port, timeout=10000)
        try:
            self.logger.info('Creating index %s...' % self.index_name)
            elastic.indices.create(self.index_name, self.mapping)
        except RequestError:
            self.logger.info('Index already exists, skipping...')

        self.logger.info('Indexing %s...' % self.file)
        act = (self.format(choices, cid=cid) for cid, choices in self.csv_generator())
        list(streaming_bulk(elastic, actions=act))
    def save_es_actions(self, datasets_updates):
        dataset_model = apps.get_model('datasets.Dataset')
        for dataset_id, data in datasets_updates.items():
            dataset_model.objects.filter(pk=dataset_id).update(**data)
            action = {
                '_op_type': 'update',
                '_index': settings.ELASTICSEARCH_INDEX_NAMES['datasets'],
                '_type': 'doc',
                '_id': dataset_id,
                'doc': data,
            }
            self.views_es_actions.setdefault('datasets', []).append(action)
        es_actions = []
        for view_actions in self.views_es_actions.values():
            es_actions.extend(view_actions)
        # streaming_bulk is lazy, so the generator has to be consumed for the
        # actions to actually be sent
        for _ in streaming_bulk(connections.get_connection(),
                                es_actions,
                                raise_on_error=False,
                                raise_on_exception=False,
                                max_retries=2):
            pass
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = u(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                document_action = {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass