Example #1
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s",
                               e)
                return

        prepped_docs = []

        for obj in iterable:
            prepped_data = index.full_prepare(obj)
            final_data = {}

            # Convert the data to make sure it's happy.
            for key, value in prepped_data.items():
                final_data[key] = self._from_python(value)
            final_data['_type'], final_data['_id'] = self.get_type_and_id(obj)

            del final_data['id']

            prepped_docs.append(final_data)

        bulk_index(self.conn, prepped_docs, index=self.index_name)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
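
Note: `bulk_index` as used above is the early elasticsearch-py helper name; current releases of the library expose the same functionality as `elasticsearch.helpers.bulk`, with the same (client, actions, **kwargs) shape. A minimal sketch under that assumption, with placeholder index and document names:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()  # assumes a node reachable on localhost:9200
actions = [
    {"_index": "my-index", "_id": i, "_source": {"answer": i}}
    for i in range(10)
]
# bulk() returns (number of successes, list of per-document errors)
success, errors = bulk(es, actions)
print(success, errors)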
Example #2
def index_collection_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = CollectionCount.objects.filter(collection__in=ids)

    if qs.exists():
        log.info('Indexing %s addon collection counts: %s'
                 % (qs.count(), qs[0].date))

    data = []
    try:
        for collection_count in qs:
            collection = collection_count.collection_id
            filters = dict(collection=collection,
                           date=collection_count.date)
            data.append(search.extract_addon_collection(
                collection_count,
                AddonCollectionCount.objects.filter(**filters),
                CollectionStats.objects.filter(**filters)))
        bulk_index(es, data, index=index,
                   doc_type=CollectionCount.get_mapping_type(),
                   refresh=True)
    except Exception as exc:
        index_collection_counts.retry(args=[ids], exc=exc)
        raise
Example #3
def index_collection_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = CollectionCount.objects.filter(collection__in=ids)
    if qs:
        log.info('Indexing %s addon collection counts: %s' %
                 (qs.count(), qs[0].date))
    data = []
    try:
        for collection_count in qs:
            collection = collection_count.collection_id
            filters = dict(collection=collection, date=collection_count.date)
            data.append(
                search.extract_addon_collection(
                    collection_count,
                    AddonCollectionCount.objects.filter(**filters),
                    CollectionStats.objects.filter(**filters)))
        bulk_index(es,
                   data,
                   index=index,
                   doc_type=CollectionCount.get_mapping_type(),
                   refresh=True)
    except Exception as exc:
        index_collection_counts.retry(args=[ids], exc=exc)
        raise
Example #4
    def bulk_index(self,
                   data,
                   index=None,
                   chunk_size=500,
                   parent=None,
                   routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            if routing:
                doc['_routing'] = routing
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #5
    def bulk_index(self, data, index=None, chunk_size=500, parent=None,
                   routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            if routing:
                doc['_routing'] = routing
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #6
    def run(self):
        """
        Run task, namely:

        * purge existing index, if requested (`purge_existing_index`),
        * create the index, if missing,
        * apply mappings, if given,
        * set refresh interval to -1 (disable) for performance reasons,
        * bulk index in batches of size `chunk_size` (2000),
        * set refresh interval to 1s,
        * refresh Elasticsearch,
        * create entry in marker index.
        """
        if self.purge_existing_index:
            self.delete_index()
        self.create_index()
        es = self._init_connection()
        if self.mapping:
            es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                   body=self.mapping)
        es.indices.put_settings({"index": {"refresh_interval": "-1"}},
                                index=self.index)

        bulk_index(es, self._docs(), chunk_size=self.chunk_size,
                   raise_on_error=self.raise_on_error)

        es.indices.put_settings({"index": {"refresh_interval": "1s"}},
                                index=self.index)
        es.indices.refresh()
        self.output().touch()
Example #8
    def run(self):
        """ Purge existing index, if requested (`purge_existing_index`).
        Create the index, if missing. Apply mappings, if given.
        Set refresh interval to -1 (disable) for performance reasons.
        Bulk index in batches of size `chunk_size` (2000).
        Set refresh interval to 1s. Refresh Elasticsearch.
        Create entry in marker index.
        """
        if self.purge_existing_index:
            self.delete_index()
        self.create_index()
        es = elasticsearch.Elasticsearch([{'host': self.host,
                                           'port': self.port}],
                                         timeout=self.timeout)
        if self.mapping:
            es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                   body=self.mapping)
        es.indices.put_settings({"index": {"refresh_interval": "-1"}},
                                index=self.index)

        bulk_index(es, self._docs(), chunk_size=self.chunk_size,
                   raise_on_error=self.raise_on_error)

        es.indices.put_settings({"index": {"refresh_interval": "1s"}},
                                index=self.index)
        es.indices.refresh()
        self.output().touch()
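
The run() variants above disable the index refresh interval before bulk indexing and re-enable it afterwards. A hedged sketch of the same toggle wrapped in try/finally, so the interval is restored even if indexing raises; `my-index` and the documents are placeholders:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk_index

es = Elasticsearch()
docs = [{"_index": "my-index", "_type": "doc", "_source": {"n": i}} for i in range(5)]

es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index="my-index")
try:
    bulk_index(es, docs, chunk_size=2000)
finally:
    # Restore refresh and make the new documents searchable.
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index="my-index")
    es.indices.refresh(index="my-index")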
Example #9
    def index(self, data):
        print('Start indexing batch of', len(data))
        bulk_index(es,
                   data,
                   index=self.ES_INDEX,
                   doc_type='place',
                   refresh=True)
        print('End indexing of current batch')
Example #10
    def bulk_index(cls, docs=ALL_DOCS):
        es = get_es()

        if docs is ALL_DOCS:
            docs = [cls.get_doctype().extract_doc(obj) for obj in cls.get_indexable()]

        if not docs:
            return

        bulk_index(es, docs, index=get_index_name(), doc_type=cls.get_doctype()._doc_type.name, raise_on_error=True)
Example #11
def index_all():
    es = get_es()

    resources = Resource.visible.all()
    documents = []

    for resource in resources:
        documents.append(extract_document(resource))

    bulk_index(es, documents, index=ELASTICSEARCH_INDEX_NAME, doc_type=ELASTICSEARCH_ENGINE_DOC_TYPE)

    es.indices.refresh(index=ELASTICSEARCH_INDEX_NAME)
Example #12
def update_index(model_items, model_name, action='index', bulk_size=100, num_docs=-1, start_date=None, end_date=None, refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed, or updated.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    logging.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                # Let's parse the start date and end date.
                if start_date or end_date:
                    if index_instance.updated_field is None:
                        raise ValueError('Cannot filter by date on model {}: no updated_field defined in {}\'s Meta class.'.format(model_name, index_instance.__class__.__name__))
                    if start_date:
                        model_items = model_items.filter(**{'{}__gte'.format(index_instance.updated_field): __str_to_tzdate__(start_date)})
                    if end_date:
                        model_items = model_items.filter(**{'{}__lte'.format(index_instance.updated_field): __str_to_tzdate__(end_date)})
                logging.info('Fetching number of documents to {} in {}.'.format(action, model.__name__))
                num_docs = model_items.count()
        else:
            logging.warning('Limiting the number of model_items to {} to {}.'.format(action, num_docs))

        logging.info('{} {} documents on index {}'.format(action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logging.info('{}: documents {} to {} of {} total on index {}.'.format(action.capitalize(), prev_step, next_step, num_docs, index_name))
            data = [index_instance.serialize_object(doc) for doc in model_items[prev_step:next_step] if index_instance.matches_indexing_condition(doc)] 
            for entry in data:
                # Tell elasticsearch-py what to do with the data internally
                entry["_op_type"] = action
            bulk_index(src.get_es_instance(), data, index=index_name, doc_type=model.__name__, raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
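
The `_op_type` key set in the loop above is how elasticsearch-py's bulk helpers distinguish per-document actions ('index', 'delete', ...). A short sketch under that convention, with placeholder index and id values:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk_index

es = Elasticsearch()
delete_actions = [
    {"_op_type": "delete", "_index": "my-index", "_type": "article", "_id": pk}
    for pk in (1, 2, 3)
]
bulk_index(es, delete_actions, raise_on_error=False)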
Example #13
    def update(self, index, iterable, commit=True):
        """Update an index with a collection.

        :param index: Index to be updated.
        :type index: Index
        :param iterable: Objects to update the index.
        :type iterable: iterable
        :param commit: Commit changes.
        :type commit: bool
        """
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        doc_type = get_model_ct(index.get_model())
        bulk_index(self.conn, prepped_docs, index=self.index_name, doc_type=doc_type)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
Example #14
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s",
                               e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        bulk_index(self.conn,
                   prepped_docs,
                   index=self.index_name,
                   doc_type='modelresult')

        if commit:
            self.conn.indices.refresh(index=self.index_name)
Example #15
    def index_all(self, index=None):
        # Determine which index we are indexing into
        _index = index if index is not None else self.index._index

        # Bulk Index our documents
        bulk_index(
            self.index.es,
            [{
                "_index": _index,
                "_type": self._type,
                "_id": self.extract_id(item),
                "_source": self.extract_document(item),
            } for item in self.get_indexable()],
        )
Example #16
def update_index(model_items, model_name, action='index', bulk_size=100, num_docs=-1, start_date=None, end_date=None, refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed/updated or deleted.
    If action is 'index', the model_items must be serializable objects. If action is 'delete', the model_items must be primary keys
corresponding to objects in the index.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    if action == 'delete' and not hasattr(model_items, '__iter__'):
        raise ValueError("If action is 'delete', model_items must be an iterable of primary keys.")

    logger.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                model_items = filter_model_items(index_instance, model_items, model_name, start_date, end_date)
                num_docs = model_items.count()

                if not model_items.ordered:
                    model_items = model_items.order_by('pk')
        else:
            logger.warning('Limiting the number of model_items to {} to {}.'.format(action, num_docs))

        logger.info('{} {} documents on index {}'.format(action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logger.info('{}: documents {} to {} of {} total on index {}.'.format(action.capitalize(), prev_step, next_step, num_docs, index_name))
            data = create_indexed_document(index_instance, model_items[prev_step:next_step], action)
            bulk_index(src.get_es_instance(), data, index=index_name, doc_type=model.__name__, raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
Example #17
def index_documents(docs, bulk=False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #18
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
            raise_on_error=False,
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #19
    def index_data(cls, documents, id_field='id'):
        """Indexes specified data

        Uses ``cls.index_name`` as the index to index into.  Uses
        ``cls.mapping_type_name`` as the doctype to index these
        documents as.

        :arg documents: List of documents as Python dicts
        :arg id_field: The field of the document that represents the id

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(), documents, index=cls.index_name,
                   doc_type=cls.mapping_type_name)
        cls.refresh()
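
The `dict(d, _id=d[id_field])` idiom above copies each document and injects the bulk `_id` metadata without mutating the caller's data; a quick self-contained check:

d = {"id": 7, "title": "Flash"}
assert dict(d, _id=d["id"]) == {"id": 7, "title": "Flash", "_id": 7}
assert "_id" not in d  # the original document is untouched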
Example #20
    def index_data(cls, documents, index, doctype, id_field='id'):
        """Bulk indexes given data.

        This does a refresh after the data is indexed.

        :arg documents: list of python dicts each a document to index
        :arg index: name of the index
        :arg doctype: mapping type name
        :arg id_field: the field the document id is stored in in the
            document

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(), documents, index=index, doc_type=doctype)
        cls.refresh(index)
Example #21
def index_download_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = DownloadCount.objects.filter(id__in=ids)
    if qs:
        log.info('Indexing %s downloads for %s.' % (qs.count(), qs[0].date))
    try:
        data = []
        for dl in qs:
            data.append(search.extract_download_count(dl))
        bulk_index(es, data, index=index,
                   doc_type=DownloadCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_download_counts.retry(args=[ids, index], exc=exc)
        raise
Example #22
    def test_errors_are_reported_correctly(self):
        self.client.indices.create(
            "i", {
                "mappings": {
                    "t": {
                        "properties": {
                            "a": {
                                "type": "integer"
                            }
                        }
                    }
                },
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                }
            })
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{
            "a": 42
        }, {
            "a": "c",
            '_id': 42
        }],
                                             index="i",
                                             doc_type="t")
        self.assertEquals(1, success)
        self.assertEquals(1, len(failed))
        error = failed[0]
        self.assertEquals('42', error['index']['_id'])
        self.assertEquals('t', error['index']['_type'])
        self.assertEquals('i', error['index']['_index'])
        self.assertIn('MapperParsingException', error['index']['error'])
Example #23
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index='test-index', doc_type='answers', refresh=True)

        self.assertEquals(100, len(success))
        self.assertFalse(failed)
        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
Example #24
def index_update_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = UpdateCount.objects.filter(id__in=ids)
    if qs:
        log.info('Indexing %s updates for %s.' % (qs.count(), qs[0].date))
    data = []
    try:
        for update in qs:
            data.append(search.extract_update_count(update))
        bulk_index(es, data, index=index,
                   doc_type=UpdateCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_update_counts.retry(args=[ids, index], exc=exc, **kw)
        raise
Example #25
    def test_stats_only_reports_numbers(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index='test-index', doc_type='answers', refresh=True, stats_only=True)

        self.assertEquals(100, success)
        self.assertEquals(0, failed)
        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
Example #26
    def test_errors_are_collected_properly(self):
        self.client.indices.create(
            "i", {
                "mappings": {
                    "t": {
                        "properties": {
                            "a": {
                                "type": "integer"
                            }
                        }
                    }
                },
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                }
            })
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{
            "a": 42
        }, {
            "a": "c"
        }],
                                             index="i",
                                             doc_type="t",
                                             stats_only=True)
        self.assertEquals(1, success)
        self.assertEquals(1, failed)
Example #27
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
            raise_on_error=False,
            chunk_size=300,  # TODO: Make configurable
            max_chunk_bytes=(30 * 1024 * 1024),  # TODO: Make configurable
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #28
    def bulk_index(cls, docs=ALL_DOCS):
        es = get_es()

        if docs is ALL_DOCS:
            docs = [
                cls.get_doctype().extract_doc(obj)
                for obj in cls.get_indexable()
            ]

        if not docs:
            return

        bulk_index(es,
                   docs,
                   index=get_index_name(),
                   doc_type=cls.get_doctype()._doc_type.name,
                   raise_on_error=True)
Example #29
def index_update_counts(ids, index=None, **kw):
    index = index or UpdateCountIndexer.get_index_alias()

    es = amo_search.get_es()
    qs = UpdateCount.objects.filter(id__in=ids)
    if qs.exists():
        log.info('Indexing %s updates for %s.' % (qs.count(), qs[0].date))
    data = []
    try:
        for obj in qs:
            data.append(UpdateCountIndexer.extract_document(obj))
        bulk_index(es, data, index=index,
                   doc_type=UpdateCountIndexer.get_doctype_name(),
                   refresh=True)
    except Exception as exc:
        index_update_counts.retry(args=[ids, index], exc=exc, **kw)
        raise
Example #30
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index="test-index", doc_type="answers", refresh=True)

        self.assertEquals(100, success)
        self.assertFalse(failed)
        self.assertEquals(100, self.client.count(index="test-index", doc_type="answers")["count"])
        self.assertEquals({"answer": 42}, self.client.get(index="test-index", doc_type="answers", id=42)["_source"])
Example #31
    def index_data(cls, documents, id_field='id'):
        """Indexes specified data

        Uses ``cls.index_name`` as the index to index into.  Uses
        ``cls.mapping_type_name`` as the doctype to index these
        documents as.

        :arg documents: List of documents as Python dicts
        :arg id_field: The field of the document that represents the id

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(),
                   documents,
                   index=cls.index_name,
                   doc_type=cls.mapping_type_name)
        cls.refresh()
Example #32
    def index_all(self, index=None):
        # Determine which index we are indexing into
        _index = index if index is not None else self.index._index

        # Bulk Index our documents
        bulk_index(
            self.index.es,
            [
                {
                    "_index": _index,
                    "_type": self._type,
                    "_id": self.extract_id(item),
                    "_source": self.extract_document(item),
                }
                for item in self.get_indexable()
            ],
        )
Example #34
def index_theme_user_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = ThemeUserCount.objects.filter(id__in=ids)

    if qs.exists():
        log.info('Indexing %s theme user counts for %s.'
                 % (qs.count(), qs[0].date))
    data = []

    try:
        for user_count in qs:
            data.append(search.extract_theme_user_count(user_count))
        bulk_index(es, data, index=index,
                   doc_type=ThemeUserCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_theme_user_counts.retry(args=[ids], exc=exc, **kw)
        raise
Example #35
    def handle(self, *args, **options):
        documents = [
            {'id': 1,
             'title': 'No model searchable 1',
             'text': 'some text'},
            {'id': 2,
             'title': 'No model searchable 2',
             'text': 'blah'},
            {'id': 3,
             'title': 'No model searchable 3',
             'text': 'something'},
            {'id': 4,
             'title': 'No model searchable 4',
             'text': 'qwerty'},
            {'id': 5,
             'title': 'No model searchable 5',
             'text': 'hjkl'}
        ]
        es = get_es(urls=settings.ES_URLS)
        bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
        es.indices.refresh(index=INDEX)
Example #36
    def bulk_add(self, documents):
        rv = helpers.bulk_index(
            client=self.es,
            docs=({
                '_id': doc_id,
                '_index': self.name,
                '_type': self.doc_type,
                '_source': data,
            } for doc_id, data in documents),
            raise_on_error=True,
        )
        self.es.indices.refresh(self.name)
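
The generator passed as `docs` above expands (doc_id, source) pairs into full bulk action dicts. The same expansion outside the class, with placeholder names:

documents = [('1', {'title': 'first'}), ('2', {'title': 'second'})]
docs = [{'_id': doc_id, '_index': 'my-index', '_type': 'doc', '_source': data}
        for doc_id, data in documents]
print(docs[0]['_id'])  # '1'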
Example #39
def insert_data(file_handler):
    es = elasticsearch.Elasticsearch()

    despesas = get_despesas(codecs.getreader('utf-8')(file_handler))

    # delete index before re-importing
    if es.indices.exists(INDEX):
        es.indices.delete(INDEX)

    es.indices.create(index=INDEX, body=MAPPING_DESPESA)

    bulk = []
    for line in despesas:
        body = json.loads(line)

        bulk.append({"_index": INDEX, "_type": TYPE, "_source": body})

        if len(bulk) % 2000 == 0:
            bulk_index(es, bulk)
            bulk = []
    if bulk:
        bulk_index(es, bulk)
Example #40
    def bulk_index(self, data, index=None, chunk_size=500, parent=None, routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {"_index": index, "_type": self._type, "_id": source["id"], "_source": source}
            if parent:
                doc["_parent"] = parent
            if routing:
                doc["_routing"] = routing
            docs.append(doc)

        # TODO: This doesn't work with the new ES setup.
        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #41
    def test_stats_only_reports_numbers(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client,
                                             docs,
                                             index='test-index',
                                             doc_type='answers',
                                             refresh=True,
                                             stats_only=True)

        self.assertEquals(100, success)
        self.assertEquals(0, failed)
        self.assertEquals(
            100,
            self.client.count(index='test-index', doc_type='answers')['count'])
Example #42
    def test_errors_are_collected_properly(self):
        self.client.indices.create(
            "i",
            {
                "mappings": {"t": {"properties": {"a": {"type": "integer"}}}},
                "settings": {"number_of_shards": 1, "number_of_replicas": 0},
            },
        )
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(
            self.client, [{"a": 42}, {"a": "c"}], index="i", doc_type="t", stats_only=True
        )
        self.assertEquals(1, success)
        self.assertEquals(1, failed)
Example #43
    def run_request(self, index=None, doc_type=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not doc_type:
            self.show_doc_type_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename(
                "Open: ", '', index, doc_type, self.run)
            return

        docs = []
        with open(filename, encoding='utf-8', mode='r') as csvfile:
            for doc in csv.DictReader(csvfile, delimiter=','):
                docs.append(doc)
                sublime.status_message("Read: {}".format(len(docs)))

        options = dict(
            index=index,
            doc_type=doc_type,
            stats_only=False,
            chunk_size=self.settings.chunk_size,
            expand_action_callback=expand_action
        )

        success, errors = bulk_index(
            self.client, change_doc_index(docs, index), **options)

        if errors:
            return dict(
                command=self.command_name,
                index=index,
                filename=filename,
                status="ERROR",
                errors=errors
            )

        return dict(
            command=self.command_name,
            index=index,
            doc_type=doc_type,
            filename=filename,
            status="SUCCESS",
            docs=success
        )
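
The `expand_action_callback` passed above is the hook the bulk helper calls once per raw document to produce an (action, source) pair. A hedged sketch of such a callback; the field names are illustrative, not taken from the plugin:

def expand_action(doc):
    doc = dict(doc)
    action = {'index': {'_id': doc.pop('id', None)}}
    return action, doc

print(expand_action({'id': '1', 'title': 'hello'}))
# -> ({'index': {'_id': '1'}}, {'title': 'hello'})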
Example #44
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, '_id': x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client,
                                             docs,
                                             index='test-index',
                                             doc_type='answers',
                                             refresh=True)

        self.assertEquals(100, success)
        self.assertFalse(failed)
        self.assertEquals(
            100,
            self.client.count(index='test-index', doc_type='answers')['count'])
        self.assertEquals({"answer": 42},
                          self.client.get(index='test-index',
                                          doc_type='answers',
                                          id=42)['_source'])
Example #45
    def test_errors_are_reported_correctly(self):
        self.client.indices.create(
            "i",
            {
                "mappings": {"t": {"properties": {"a": {"type": "integer"}}}},
                "settings": {"number_of_shards": 1, "number_of_replicas": 0},
            },
        )
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{"a": 42}, {"a": "c", "_id": 42}], index="i", doc_type="t")
        self.assertEquals(1, success)
        self.assertEquals(1, len(failed))
        error = failed[0]
        self.assertEquals("42", error["index"]["_id"])
        self.assertEquals("t", error["index"]["_type"])
        self.assertEquals("i", error["index"]["_index"])
        self.assertIn("MapperParsingException", error["index"]["error"])
Example #46
    def run_request(self, index=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename(
                "Open: ", self.settings.dump_file, index, self.run)
            return

        count = 0
        with open(filename, encoding='utf-8', mode='r') as f:
            for docs in readlines_chunks(f):
                options = dict(
                    index=index,
                    stats_only=False,
                    chunk_size=self.settings.chunk_size,
                    expand_action_callback=expand_action
                )

                success, errors = bulk_index(
                    self.client, change_doc_index(docs, index), **options)

                if errors:
                    return dict(
                        command=self.command_name,
                        index=index,
                        filename=filename,
                        status="ERROR",
                        errors=errors
                    )

                count += success
                sublime.status_message("Load: {}".format(count))

        return dict(
            command=self.command_name,
            index=index,
            filename=filename,
            status="SUCCESS",
            docs=count
        )
Example #47
    def run_request(self, index=None, doc_type=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not doc_type:
            self.show_doc_type_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename("Open: ", '', index, doc_type, self.run)
            return

        docs = []
        with open(filename, encoding='utf-8', mode='r') as csvfile:
            for doc in csv.DictReader(csvfile, delimiter=','):
                docs.append(doc)
                sublime.status_message("Read: {}".format(len(docs)))

        options = dict(index=index,
                       doc_type=doc_type,
                       stats_only=False,
                       chunk_size=self.settings.chunk_size,
                       expand_action_callback=expand_action)

        success, errors = bulk_index(self.client,
                                     change_doc_index(docs, index), **options)

        if errors:
            return dict(command=self.command_name,
                        index=index,
                        filename=filename,
                        status="ERROR",
                        errors=errors)

        return dict(command=self.command_name,
                    index=index,
                    doc_type=doc_type,
                    filename=filename,
                    status="SUCCESS",
                    docs=success)
Example #48
    def run_request(self, index=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename("Open: ", self.settings.dump_file, index,
                                     self.run)
            return

        count = 0
        with open(filename, encoding='utf-8', mode='r') as f:
            for docs in readlines_chunks(f):
                options = dict(index=index,
                               stats_only=False,
                               chunk_size=self.settings.chunk_size,
                               expand_action_callback=expand_action)

                success, errors = bulk_index(self.client,
                                             change_doc_index(docs, index),
                                             **options)

                if errors:
                    return dict(command=self.command_name,
                                index=index,
                                filename=filename,
                                status="ERROR",
                                errors=errors)

                count += success
                sublime.status_message("Load: {}".format(count))

        return dict(command=self.command_name,
                    index=index,
                    filename=filename,
                    status="SUCCESS",
                    docs=count)
Example #49
es.update(index=index_name,
          doc_type=type_name,
          id=2,
          body={
              "script": 'ctx._source.position += 1',
              "lang": "groovy"
          })

es.delete(index=index_name, doc_type=type_name, id=3)

from elasticsearch.helpers import bulk_index

bulk_index(es, [{
    "name": "Joe Tester",
    "parsedtext": "Joe Testere nice guy",
    "uuid": "11111",
    "position": 1,
    "date": datetime(2013, 12, 8),
    "_index": index_name,
    "_type": type_name,
    "_id": "1"
}, {
    "name": "Bill Baloney",
    "parsedtext": "Bill Testere nice guy",
    "uuid": "22222",
    "position": 2,
    "date": datetime(2013, 12, 8)
}])

es.indices.delete(index_name)
Example #50
def bulk_insert(body):
    bulk_index(_CLIENT, body)
Example #51
    def index(self, data):
        print('Start indexing batch of', len(data))
        bulk_index(es, data, index=self.ES_INDEX, doc_type='place', refresh=True)
        print('End indexing of current batch')
Example #52
     'product': ['Firefox', 'Firefox for mobile']},
    {'_id': 3,
     'title': 'Websites say cookies are blocked - Unblock them',
     'topics': ['cookies', 'privacy', 'websites'],
     'product': ['Firefox', 'Firefox for mobile', 'Boot2Gecko']},
    {'_id': 4,
     'title': 'Awesome Bar',
     'topics': ['tips', 'search', 'user interface'],
     'product': ['Firefox']},
    {'_id': 5,
     'title': 'Flash',
     'topics': ['flash'],
     'product': ['Firefox']}
    ]

bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
es.indices.refresh(index=INDEX)


# Now let's do some basic queries.

# Let's build a basic S that looks at our Elasticsearch cluster and
# the index and doctype we just indexed our documents in.
basic_s = S().es(urls=[URL]).indexes(INDEX).doctypes(DOCTYPE)

# How many documents are in our index?
print(basic_s.count())
# Prints:
# 5

# Print articles with 'cookie' in the title.
Example #53
    'title': 'Websites say cookies are blocked - Unblock them',
    'topics': ['cookies', 'privacy', 'websites'],
    'product': ['Firefox', 'Firefox for mobile', 'Boot2Gecko']
}, {
    '_id': 4,
    'title': 'Awesome Bar',
    'topics': ['tips', 'search', 'user interface'],
    'product': ['Firefox']
}, {
    '_id': 5,
    'title': 'Flash',
    'topics': ['flash'],
    'product': ['Firefox']
}]

bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
es.indices.refresh(index=INDEX)

# Now let's do some basic queries.

# Let's build a basic S that looks at our Elasticsearch cluster and
# the index and doctype we just indexed our documents in.
basic_s = S().es(urls=[URL]).indexes(INDEX).doctypes(DOCTYPE)

# How many documents are in our index?
print(basic_s.count())
# Prints:
# 5

# Print articles with 'cookie' in the title.
print([item['title'] for item in basic_s.query(title__match='cookie')])
Example #54
def update_index(model_items,
                 model_name,
                 action='index',
                 bulk_size=100,
                 num_docs=-1,
                 start_date=None,
                 end_date=None,
                 refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed/updated or deleted.
    If action is 'index', the model_items must be serializable objects. If action is 'delete', the model_items must be primary keys
corresponding to objects in the index.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    if action == 'delete' and not hasattr(model_items, '__iter__'):
        raise ValueError(
            "If action is 'delete', model_items must be an iterable of primary keys."
        )

    logging.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                model_items = filter_model_items(index_instance, model_items,
                                                 model_name, start_date,
                                                 end_date)
                num_docs = model_items.count()

                if not model_items.ordered:
                    model_items = model_items.order_by('pk')
        else:
            logging.warning(
                'Limiting the number of model_items to {} to {}.'.format(
                    action, num_docs))

        logging.info('{} {} documents on index {}'.format(
            action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logging.info(
                '{}: documents {} to {} of {} total on index {}.'.format(
                    action.capitalize(), prev_step, next_step, num_docs,
                    index_name))
            data = create_indexed_document(index_instance,
                                           model_items[prev_step:next_step],
                                           action)
            bulk_index(src.get_es_instance(),
                       data,
                       index=index_name,
                       doc_type=model.__name__,
                       raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
Example #55
index_name = "my_index"
type_name = "my_type"

from utils import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(index=index_name, doc_type=type_name, id=1,
         body={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1,
               "date": datetime(2013, 12, 8)})
es.index(index=index_name, doc_type=type_name + "2", id=1, body={"name": "data1", "value": "value1"}, parent=1)
es.index(index=index_name, doc_type=type_name, id=2,
         body={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2,
               "date": datetime(2013, 12, 8)})
es.index(index=index_name, doc_type=type_name + "2", id=2, body={"name": "data2", "value": "value2"}, parent=2)
es.index(index=index_name, doc_type=type_name, id=3, body={"name": "Bill Clinton", "parsedtext": """Bill is not
        nice guy""", "uuid": "33333", "position": 3, "date": datetime(2013, 12, 8)})

es.update(index=index_name, doc_type=type_name, id=2, body={"script": 'ctx._source.position += 1', "lang":"groovy"})

es.delete(index=index_name, doc_type=type_name, id=3)

from elasticsearch.helpers import bulk_index
bulk_index(es, [{"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1,
               "date": datetime(2013, 12, 8), "_index":index_name, "_type":type_name, "_id":"1"},
               {"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2,
               "date": datetime(2013, 12, 8)}
])

es.indices.delete(index_name)
Example #56
def index(data):
    print('Start indexing batch of', len(data))
    bulk_index(ES, data, index=INDEX, doc_type=DOC_TYPE, refresh=True)
    print('End indexing of current batch')