Example #1
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s",
                               e)
                return

        prepped_docs = []

        for obj in iterable:
            prepped_data = index.full_prepare(obj)
            final_data = {}

            # Convert the data to make sure it's happy.
            for key, value in prepped_data.items():
                final_data[key] = self._from_python(value)
            final_data['_type'], final_data['_id'] = self.get_type_and_id(obj)

            del final_data['id']

            prepped_docs.append(final_data)

        bulk_index(self.conn, prepped_docs, index=self.index_name)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
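
Note: `bulk_index` as used above is the early elasticsearch-py helper name; current releases of the library expose the same functionality as `elasticsearch.helpers.bulk`, with the same (client, actions, **kwargs) shape. A minimal sketch under that assumption, with placeholder index and document names:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()  # assumes a node reachable on localhost:9200
actions = [
    {"_index": "my-index", "_id": i, "_source": {"answer": i}}
    for i in range(10)
]
# bulk() returns (number of successes, list of per-document errors)
success, errors = bulk(es, actions)
print(success, errors)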
Example #2
def index_collection_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = CollectionCount.objects.filter(collection__in=ids)

    if qs.exists():
        log.info('Indexing %s addon collection counts: %s'
                 % (qs.count(), qs[0].date))

    data = []
    try:
        for collection_count in qs:
            collection = collection_count.collection_id
            filters = dict(collection=collection,
                           date=collection_count.date)
            data.append(search.extract_addon_collection(
                collection_count,
                AddonCollectionCount.objects.filter(**filters),
                CollectionStats.objects.filter(**filters)))
        bulk_index(es, data, index=index,
                   doc_type=CollectionCount.get_mapping_type(),
                   refresh=True)
    except Exception as exc:
        index_collection_counts.retry(args=[ids], exc=exc)
        raise
Example #3
def index_collection_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = CollectionCount.objects.filter(collection__in=ids)
    if qs:
        log.info('Indexing %s addon collection counts: %s' %
                 (qs.count(), qs[0].date))
    data = []
    try:
        for collection_count in qs:
            collection = collection_count.collection_id
            filters = dict(collection=collection, date=collection_count.date)
            data.append(
                search.extract_addon_collection(
                    collection_count,
                    AddonCollectionCount.objects.filter(**filters),
                    CollectionStats.objects.filter(**filters)))
        bulk_index(es,
                   data,
                   index=index,
                   doc_type=CollectionCount.get_mapping_type(),
                   refresh=True)
    except Exception as exc:
        index_collection_counts.retry(args=[ids], exc=exc)
        raise
Example #4
    def bulk_index(self,
                   data,
                   index=None,
                   chunk_size=500,
                   parent=None,
                   routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            if routing:
                doc['_routing'] = routing
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #5
    def bulk_index(self, data, index=None, chunk_size=500, parent=None,
                   routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            if routing:
                doc['_routing'] = routing
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #6
    def run(self):
        """
        Run task, namely:

        * purge existing index, if requested (`purge_existing_index`),
        * create the index, if missing,
        * apply mappings, if given,
        * set refresh interval to -1 (disable) for performance reasons,
        * bulk index in batches of size `chunk_size` (2000),
        * set refresh interval to 1s,
        * refresh Elasticsearch,
        * create entry in marker index.
        """
        if self.purge_existing_index:
            self.delete_index()
        self.create_index()
        es = self._init_connection()
        if self.mapping:
            es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                   body=self.mapping)
        es.indices.put_settings({"index": {"refresh_interval": "-1"}},
                                index=self.index)

        bulk_index(es, self._docs(), chunk_size=self.chunk_size,
                   raise_on_error=self.raise_on_error)

        es.indices.put_settings({"index": {"refresh_interval": "1s"}},
                                index=self.index)
        es.indices.refresh()
        self.output().touch()
Example #8
    def run(self):
        """ Purge existing index, if requested (`purge_existing_index`).
        Create the index, if missing. Apply mappings, if given.
        Set refresh interval to -1 (disable) for performance reasons.
        Bulk index in batches of size `chunk_size` (2000).
        Set refresh interval to 1s. Refresh Elasticsearch.
        Create entry in marker index.
        """
        if self.purge_existing_index:
            self.delete_index()
        self.create_index()
        es = elasticsearch.Elasticsearch([{'host': self.host,
                                           'port': self.port}],
                                         timeout=self.timeout)
        if self.mapping:
            es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                   body=self.mapping)
        es.indices.put_settings({"index": {"refresh_interval": "-1"}},
                                index=self.index)

        bulk_index(es, self._docs(), chunk_size=self.chunk_size,
                   raise_on_error=self.raise_on_error)

        es.indices.put_settings({"index": {"refresh_interval": "1s"}},
                                index=self.index)
        es.indices.refresh()
        self.output().touch()
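
The run() variants above disable the index refresh interval before bulk indexing and re-enable it afterwards. A hedged sketch of the same toggle wrapped in try/finally, so the interval is restored even if indexing raises; `my-index` and the documents are placeholders:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk_index

es = Elasticsearch()
docs = [{"_index": "my-index", "_type": "doc", "_source": {"n": i}} for i in range(5)]

es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index="my-index")
try:
    bulk_index(es, docs, chunk_size=2000)
finally:
    # Restore refresh and make the new documents searchable.
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index="my-index")
    es.indices.refresh(index="my-index")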
Example #9
    def index(self, data):
        print('Start indexing batch of', len(data))
        bulk_index(es,
                   data,
                   index=self.ES_INDEX,
                   doc_type='place',
                   refresh=True)
        print('End indexing of current batch')
Example #10
    def bulk_index(cls, docs=ALL_DOCS):
        es = get_es()

        if docs is ALL_DOCS:
            docs = [cls.get_doctype().extract_doc(obj) for obj in cls.get_indexable()]

        if not docs:
            return

        bulk_index(es, docs, index=get_index_name(), doc_type=cls.get_doctype()._doc_type.name, raise_on_error=True)
Example #11
def index_all():
    es = get_es()

    resources = Resource.visible.all()
    documents = []

    for resource in resources:
        documents.append(extract_document(resource))

    bulk_index(es, documents, index=ELASTICSEARCH_INDEX_NAME, doc_type=ELASTICSEARCH_ENGINE_DOC_TYPE)

    es.indices.refresh(index=ELASTICSEARCH_INDEX_NAME)
Example #12
def update_index(model_items, model_name, action='index', bulk_size=100, num_docs=-1, start_date=None, end_date=None, refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed, or updated.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    logging.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                # Let's parse the start date and end date.
                if start_date or end_date:
                    if index_instance.updated_field is None:
                        raise ValueError('Cannot filter by date on model {}: no updated_field defined in {}\'s Meta class.'.format(model_name, index_instance.__class__.__name__))
                    if start_date:
                        model_items = model_items.filter(**{'{}__gte'.format(index_instance.updated_field): __str_to_tzdate__(start_date)})
                    if end_date:
                        model_items = model_items.filter(**{'{}__lte'.format(index_instance.updated_field): __str_to_tzdate__(end_date)})
                logging.info('Fetching number of documents to {} in {}.'.format(action, model.__name__))
                num_docs = model_items.count()
        else:
            logging.warning('Limiting the number of model_items to {} to {}.'.format(action, num_docs))

        logging.info('{} {} documents on index {}'.format(action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logging.info('{}: documents {} to {} of {} total on index {}.'.format(action.capitalize(), prev_step, next_step, num_docs, index_name))
            data = [index_instance.serialize_object(doc) for doc in model_items[prev_step:next_step] if index_instance.matches_indexing_condition(doc)] 
            for entry in data:
                # Tell elasticsearch-py what to do with the data internally
                entry["_op_type"] = action
            bulk_index(src.get_es_instance(), data, index=index_name, doc_type=model.__name__, raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
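
The `_op_type` key set in the loop above is how elasticsearch-py's bulk helpers distinguish per-document actions ('index', 'delete', ...). A short sketch under that convention, with placeholder index and id values:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk_index

es = Elasticsearch()
delete_actions = [
    {"_op_type": "delete", "_index": "my-index", "_type": "article", "_id": pk}
    for pk in (1, 2, 3)
]
bulk_index(es, delete_actions, raise_on_error=False)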
Example #13
    def update(self, index, iterable, commit=True):
        """Update an index with a collection.

        :param index: Index to be updated.
        :type index: Index
        :param iterable: Objects to update the index.
        :type iterable: iterable
        :param commit: Commit changes.
        :type commit: bool
        """
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        doc_type = get_model_ct(index.get_model())
        bulk_index(self.conn, prepped_docs, index=self.index_name, doc_type=doc_type)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
Example #14
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s",
                               e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        bulk_index(self.conn,
                   prepped_docs,
                   index=self.index_name,
                   doc_type='modelresult')

        if commit:
            self.conn.indices.refresh(index=self.index_name)
Example #15
    def index_all(self, index=None):
        # Determine which index we are indexing into
        _index = index if index is not None else self.index._index

        # Bulk Index our documents
        bulk_index(
            self.index.es,
            [{
                "_index": _index,
                "_type": self._type,
                "_id": self.extract_id(item),
                "_source": self.extract_document(item),
            } for item in self.get_indexable()],
        )
Example #16
def update_index(model_items, model_name, action='index', bulk_size=100, num_docs=-1, start_date=None, end_date=None, refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed/updated or deleted.
    If action is 'index', the model_items must be serializable objects. If action is 'delete', the model_items must be primary keys
corresponding to objects in the index.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    if action == 'delete' and not hasattr(model_items, '__iter__'):
        raise ValueError("If action is 'delete', model_items must be an iterable of primary keys.")

    logger.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                model_items = filter_model_items(index_instance, model_items, model_name, start_date, end_date)
                num_docs = model_items.count()

                if not model_items.ordered:
                    model_items = model_items.order_by('pk')
        else:
            logger.warning('Limiting the number of model_items to {} to {}.'.format(action, num_docs))

        logger.info('{} {} documents on index {}'.format(action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logger.info('{}: documents {} to {} of {} total on index {}.'.format(action.capitalize(), prev_step, next_step, num_docs, index_name))
            data = create_indexed_document(index_instance, model_items[prev_step:next_step], action)
            bulk_index(src.get_es_instance(), data, index=index_name, doc_type=model.__name__, raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
Example #17
def index_documents(docs, bulk=False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #18
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
            raise_on_error=False,
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #19
    def index_data(cls, documents, id_field='id'):
        """Indexes specified data

        Uses ``cls.index_name`` as the index to index into.  Uses
        ``cls.mapping_type_name`` as the doctype to index these
        documents as.

        :arg documents: List of documents as Python dicts
        :arg id_field: The field of the document that represents the id

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(), documents, index=cls.index_name,
                   doc_type=cls.mapping_type_name)
        cls.refresh()
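
The `dict(d, _id=d[id_field])` idiom above copies each document and injects the bulk `_id` metadata without mutating the caller's data; a quick self-contained check:

d = {"id": 7, "title": "Flash"}
assert dict(d, _id=d["id"]) == {"id": 7, "title": "Flash", "_id": 7}
assert "_id" not in d  # the original document is untouched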
Example #20
    def index_data(cls, documents, index, doctype, id_field='id'):
        """Bulk indexes given data.

        This does a refresh after the data is indexed.

        :arg documents: list of python dicts each a document to index
        :arg index: name of the index
        :arg doctype: mapping type name
        :arg id_field: the field the document id is stored in in the
            document

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(), documents, index=index, doc_type=doctype)
        cls.refresh(index)
Example #21
def index_download_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = DownloadCount.objects.filter(id__in=ids)
    if qs:
        log.info('Indexing %s downloads for %s.' % (qs.count(), qs[0].date))
    try:
        data = []
        for dl in qs:
            data.append(search.extract_download_count(dl))
        bulk_index(es, data, index=index,
                   doc_type=DownloadCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_download_counts.retry(args=[ids, index], exc=exc)
        raise
Example #22
    def test_errors_are_reported_correctly(self):
        self.client.indices.create(
            "i", {
                "mappings": {
                    "t": {
                        "properties": {
                            "a": {
                                "type": "integer"
                            }
                        }
                    }
                },
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                }
            })
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{
            "a": 42
        }, {
            "a": "c",
            '_id': 42
        }],
                                             index="i",
                                             doc_type="t")
        self.assertEquals(1, success)
        self.assertEquals(1, len(failed))
        error = failed[0]
        self.assertEquals('42', error['index']['_id'])
        self.assertEquals('t', error['index']['_type'])
        self.assertEquals('i', error['index']['_index'])
        self.assertIn('MapperParsingException', error['index']['error'])
Example #23
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index='test-index', doc_type='answers', refresh=True)

        self.assertEquals(100, len(success))
        self.assertFalse(failed)
        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
Example #24
def index_update_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = UpdateCount.objects.filter(id__in=ids)
    if qs:
        log.info('Indexing %s updates for %s.' % (qs.count(), qs[0].date))
    data = []
    try:
        for update in qs:
            data.append(search.extract_update_count(update))
        bulk_index(es, data, index=index,
                   doc_type=UpdateCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_update_counts.retry(args=[ids, index], exc=exc, **kw)
        raise
Example #25
    def test_stats_only_reports_numbers(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index='test-index', doc_type='answers', refresh=True, stats_only=True)

        self.assertEquals(100, success)
        self.assertEquals(0, failed)
        self.assertEquals(100, self.client.count(index='test-index', doc_type='answers')['count'])
Example #26
    def test_errors_are_collected_properly(self):
        self.client.indices.create(
            "i", {
                "mappings": {
                    "t": {
                        "properties": {
                            "a": {
                                "type": "integer"
                            }
                        }
                    }
                },
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                }
            })
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{
            "a": 42
        }, {
            "a": "c"
        }],
                                             index="i",
                                             doc_type="t",
                                             stats_only=True)
        self.assertEquals(1, success)
        self.assertEquals(1, failed)
Example #27
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
            raise_on_error=False,
            chunk_size=300,  # TODO: Make configurable
            max_chunk_bytes=(30 * 1024 * 1024),  # TODO: Make configurable
        )
    else:
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
Example #28
    def bulk_index(cls, docs=ALL_DOCS):
        es = get_es()

        if docs is ALL_DOCS:
            docs = [
                cls.get_doctype().extract_doc(obj)
                for obj in cls.get_indexable()
            ]

        if not docs:
            return

        bulk_index(es,
                   docs,
                   index=get_index_name(),
                   doc_type=cls.get_doctype()._doc_type.name,
                   raise_on_error=True)
Example #29
def index_update_counts(ids, index=None, **kw):
    index = index or UpdateCountIndexer.get_index_alias()

    es = amo_search.get_es()
    qs = UpdateCount.objects.filter(id__in=ids)
    if qs.exists():
        log.info('Indexing %s updates for %s.' % (qs.count(), qs[0].date))
    data = []
    try:
        for obj in qs:
            data.append(UpdateCountIndexer.extract_document(obj))
        bulk_index(es, data, index=index,
                   doc_type=UpdateCountIndexer.get_doctype_name(),
                   refresh=True)
    except Exception as exc:
        index_update_counts.retry(args=[ids, index], exc=exc, **kw)
        raise
Example #30
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client, docs, index="test-index", doc_type="answers", refresh=True)

        self.assertEquals(100, success)
        self.assertFalse(failed)
        self.assertEquals(100, self.client.count(index="test-index", doc_type="answers")["count"])
        self.assertEquals({"answer": 42}, self.client.get(index="test-index", doc_type="answers", id=42)["_source"])
Example #31
    def index_data(cls, documents, id_field='id'):
        """Indexes specified data

        Uses ``cls.index_name`` as the index to index into.  Uses
        ``cls.mapping_type_name`` as the doctype to index these
        documents as.

        :arg documents: List of documents as Python dicts
        :arg id_field: The field of the document that represents the id

        """
        documents = (dict(d, _id=d[id_field]) for d in documents)
        bulk_index(cls.get_es(),
                   documents,
                   index=cls.index_name,
                   doc_type=cls.mapping_type_name)
        cls.refresh()
Example #32
    def index_all(self, index=None):
        # Determine which index we are indexing into
        _index = index if index is not None else self.index._index

        # Bulk Index our documents
        bulk_index(
            self.index.es,
            [
                {
                    "_index": _index,
                    "_type": self._type,
                    "_id": self.extract_id(item),
                    "_source": self.extract_document(item),
                }
                for item in self.get_indexable()
            ],
        )
Example #34
def index_theme_user_counts(ids, index=None, **kw):
    index = index or search.get_alias()

    es = amo_search.get_es()
    qs = ThemeUserCount.objects.filter(id__in=ids)

    if qs.exists():
        log.info('Indexing %s theme user counts for %s.'
                 % (qs.count(), qs[0].date))
    data = []

    try:
        for user_count in qs:
            data.append(search.extract_theme_user_count(user_count))
        bulk_index(es, data, index=index,
                   doc_type=ThemeUserCount.get_mapping_type(), refresh=True)
    except Exception as exc:
        index_theme_user_counts.retry(args=[ids], exc=exc, **kw)
        raise
Example #35
    def handle(self, *args, **options):
        documents = [
            {'id': 1,
             'title': 'No model searchable 1',
             'text': 'some text'},
            {'id': 2,
             'title': 'No model searchable 2',
             'text': 'blah'},
            {'id': 3,
             'title': 'No model searchable 3',
             'text': 'something'},
            {'id': 4,
             'title': 'No model searchable 4',
             'text': 'qwerty'},
            {'id': 5,
             'title': 'No model searchable 5',
             'text': 'hjkl'}
        ]
        es = get_es(urls=settings.ES_URLS)
        bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
        es.indices.refresh(index=INDEX)
Example #36
    def bulk_add(self, documents):
        rv = helpers.bulk_index(
            client=self.es,
            docs=({
                '_id': doc_id,
                '_index': self.name,
                '_type': self.doc_type,
                '_source': data,
            } for doc_id, data in documents),
            raise_on_error=True,
        )
        self.es.indices.refresh(self.name)
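
The generator passed as `docs` above expands (doc_id, source) pairs into full bulk action dicts. The same expansion outside the class, with placeholder names:

documents = [('1', {'title': 'first'}), ('2', {'title': 'second'})]
docs = [{'_id': doc_id, '_index': 'my-index', '_type': 'doc', '_source': data}
        for doc_id, data in documents]
print(docs[0]['_id'])  # '1'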
Example #39
def insert_data(file_handler):
    es = elasticsearch.Elasticsearch()

    despesas = get_despesas(codecs.getreader('utf-8')(file_handler))

    # delete index before re-importing
    if es.indices.exists(INDEX):
        es.indices.delete(INDEX)

    es.indices.create(index=INDEX, body=MAPPING_DESPESA)

    bulk = []
    for line in despesas:
        body = json.loads(line)

        bulk.append({"_index": INDEX, "_type": TYPE, "_source": body})

        if len(bulk) % 2000 == 0:
            bulk_index(es, bulk)
            bulk = []
    if bulk:
        bulk_index(es, bulk)
Example #40
    def bulk_index(self, data, index=None, chunk_size=500, parent=None, routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {"_index": index, "_type": self._type, "_id": source["id"], "_source": source}
            if parent:
                doc["_parent"] = parent
            if routing:
                doc["_routing"] = routing
            docs.append(doc)

        # TODO: This doesn't work with the new ES setup.
        bulk_index(self.es, docs, chunk_size=chunk_size)
Example #41
    def test_stats_only_reports_numbers(self):
        docs = [{"answer": x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client,
                                             docs,
                                             index='test-index',
                                             doc_type='answers',
                                             refresh=True,
                                             stats_only=True)

        self.assertEquals(100, success)
        self.assertEquals(0, failed)
        self.assertEquals(
            100,
            self.client.count(index='test-index', doc_type='answers')['count'])
Example #42
    def test_errors_are_collected_properly(self):
        self.client.indices.create(
            "i",
            {
                "mappings": {"t": {"properties": {"a": {"type": "integer"}}}},
                "settings": {"number_of_shards": 1, "number_of_replicas": 0},
            },
        )
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(
            self.client, [{"a": 42}, {"a": "c"}], index="i", doc_type="t", stats_only=True
        )
        self.assertEquals(1, success)
        self.assertEquals(1, failed)
Example #43
    def run_request(self, index=None, doc_type=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not doc_type:
            self.show_doc_type_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename(
                "Open: ", '', index, doc_type, self.run)
            return

        docs = []
        with open(filename, encoding='utf-8', mode='r') as csvfile:
            for doc in csv.DictReader(csvfile, delimiter=','):
                docs.append(doc)
                sublime.status_message("Read: {}".format(len(docs)))

        options = dict(
            index=index,
            doc_type=doc_type,
            stats_only=False,
            chunk_size=self.settings.chunk_size,
            expand_action_callback=expand_action
        )

        success, errors = bulk_index(
            self.client, change_doc_index(docs, index), **options)

        if errors:
            return dict(
                command=self.command_name,
                index=index,
                filename=filename,
                status="ERROR",
                errors=errors
            )

        return dict(
            command=self.command_name,
            index=index,
            doc_type=doc_type,
            filename=filename,
            status="SUCCESS",
            docs=success
        )
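
The `expand_action_callback` passed above is the hook the bulk helper calls once per raw document to produce an (action, source) pair. A hedged sketch of such a callback; the field names are illustrative, not taken from the plugin:

def expand_action(doc):
    doc = dict(doc)
    action = {'index': {'_id': doc.pop('id', None)}}
    return action, doc

print(expand_action({'id': '1', 'title': 'hello'}))
# -> ({'index': {'_id': '1'}}, {'title': 'hello'})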
Example #44
    def test_all_documents_get_inserted(self):
        docs = [{"answer": x, '_id': x} for x in range(100)]
        success, failed = helpers.bulk_index(self.client,
                                             docs,
                                             index='test-index',
                                             doc_type='answers',
                                             refresh=True)

        self.assertEquals(100, success)
        self.assertFalse(failed)
        self.assertEquals(
            100,
            self.client.count(index='test-index', doc_type='answers')['count'])
        self.assertEquals({"answer": 42},
                          self.client.get(index='test-index',
                                          doc_type='answers',
                                          id=42)['_source'])
Example #45
    def test_errors_are_reported_correctly(self):
        self.client.indices.create(
            "i",
            {
                "mappings": {"t": {"properties": {"a": {"type": "integer"}}}},
                "settings": {"number_of_shards": 1, "number_of_replicas": 0},
            },
        )
        self.client.cluster.health(wait_for_status="yellow")

        success, failed = helpers.bulk_index(self.client, [{"a": 42}, {"a": "c", "_id": 42}], index="i", doc_type="t")
        self.assertEquals(1, success)
        self.assertEquals(1, len(failed))
        error = failed[0]
        self.assertEquals("42", error["index"]["_id"])
        self.assertEquals("t", error["index"]["_type"])
        self.assertEquals("i", error["index"]["_index"])
        self.assertIn("MapperParsingException", error["index"]["error"])
Example #46
    def run_request(self, index=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename(
                "Open: ", self.settings.dump_file, index, self.run)
            return

        count = 0
        with open(filename, encoding='utf-8', mode='r') as f:
            for docs in readlines_chunks(f):
                options = dict(
                    index=index,
                    stats_only=False,
                    chunk_size=self.settings.chunk_size,
                    expand_action_callback=expand_action
                )

                success, errors = bulk_index(
                    self.client, change_doc_index(docs, index), **options)

                if errors:
                    return dict(
                        command=self.command_name,
                        index=index,
                        filename=filename,
                        status="ERROR",
                        errors=errors
                    )

                count += success
                sublime.status_message("Load: {}".format(count))

        return dict(
            command=self.command_name,
            index=index,
            filename=filename,
            status="SUCCESS",
            docs=count
        )
Example #47
    def run_request(self, index=None, doc_type=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not doc_type:
            self.show_doc_type_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename("Open: ", '', index, doc_type, self.run)
            return

        docs = []
        with open(filename, encoding='utf-8', mode='r') as csvfile:
            for doc in csv.DictReader(csvfile, delimiter=','):
                docs.append(doc)
                sublime.status_message("Read: {}".format(len(docs)))

        options = dict(index=index,
                       doc_type=doc_type,
                       stats_only=False,
                       chunk_size=self.settings.chunk_size,
                       expand_action_callback=expand_action)

        success, errors = bulk_index(self.client,
                                     change_doc_index(docs, index), **options)

        if errors:
            return dict(command=self.command_name,
                        index=index,
                        filename=filename,
                        status="ERROR",
                        errors=errors)

        return dict(command=self.command_name,
                    index=index,
                    doc_type=doc_type,
                    filename=filename,
                    status="SUCCESS",
                    docs=success)
Example #48
    def run_request(self, index=None, filename=None):
        if not index:
            self.show_index_list_panel(self.run)
            return

        if not filename:
            self.show_input_filename("Open: ", self.settings.dump_file, index,
                                     self.run)
            return

        count = 0
        with open(filename, encoding='utf-8', mode='r') as f:
            for docs in readlines_chunks(f):
                options = dict(index=index,
                               stats_only=False,
                               chunk_size=self.settings.chunk_size,
                               expand_action_callback=expand_action)

                success, errors = bulk_index(self.client,
                                             change_doc_index(docs, index),
                                             **options)

                if errors:
                    return dict(command=self.command_name,
                                index=index,
                                filename=filename,
                                status="ERROR",
                                errors=errors)

                count += success
                sublime.status_message("Load: {}".format(count))

        return dict(command=self.command_name,
                    index=index,
                    filename=filename,
                    status="SUCCESS",
                    docs=count)
Example #49
es.update(index=index_name,
          doc_type=type_name,
          id=2,
          body={
              "script": 'ctx._source.position += 1',
              "lang": "groovy"
          })

es.delete(index=index_name, doc_type=type_name, id=3)

from elasticsearch.helpers import bulk_index

bulk_index(es, [{
    "name": "Joe Tester",
    "parsedtext": "Joe Testere nice guy",
    "uuid": "11111",
    "position": 1,
    "date": datetime(2013, 12, 8),
    "_index": index_name,
    "_type": type_name,
    "_id": "1"
}, {
    "name": "Bill Baloney",
    "parsedtext": "Bill Testere nice guy",
    "uuid": "22222",
    "position": 2,
    "date": datetime(2013, 12, 8)
}])

es.indices.delete(index_name)
Example #50
def bulk_insert(body):
    bulk_index(_CLIENT, body)
Example #51
    def index(self, data):
        print('Start indexing batch of', len(data))
        bulk_index(es, data, index=self.ES_INDEX, doc_type='place', refresh=True)
        print('End indexing of current batch')
Example #52
     'product': ['Firefox', 'Firefox for mobile']},
    {'_id': 3,
     'title': 'Websites say cookies are blocked - Unblock them',
     'topics': ['cookies', 'privacy', 'websites'],
     'product': ['Firefox', 'Firefox for mobile', 'Boot2Gecko']},
    {'_id': 4,
     'title': 'Awesome Bar',
     'topics': ['tips', 'search', 'user interface'],
     'product': ['Firefox']},
    {'_id': 5,
     'title': 'Flash',
     'topics': ['flash'],
     'product': ['Firefox']}
    ]

bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
es.indices.refresh(index=INDEX)


# Now let's do some basic queries.

# Let's build a basic S that looks at our Elasticsearch cluster and
# the index and doctype we just indexed our documents in.
basic_s = S().es(urls=[URL]).indexes(INDEX).doctypes(DOCTYPE)

# How many documents are in our index?
print(basic_s.count())
# Prints:
# 5

# Print articles with 'cookie' in the title.
Example #53
    'title': 'Websites say cookies are blocked - Unblock them',
    'topics': ['cookies', 'privacy', 'websites'],
    'product': ['Firefox', 'Firefox for mobile', 'Boot2Gecko']
}, {
    '_id': 4,
    'title': 'Awesome Bar',
    'topics': ['tips', 'search', 'user interface'],
    'product': ['Firefox']
}, {
    '_id': 5,
    'title': 'Flash',
    'topics': ['flash'],
    'product': ['Firefox']
}]

bulk_index(es, documents, index=INDEX, doc_type=DOCTYPE)
es.indices.refresh(index=INDEX)

# Now let's do some basic queries.

# Let's build a basic S that looks at our Elasticsearch cluster and
# the index and doctype we just indexed our documents in.
basic_s = S().es(urls=[URL]).indexes(INDEX).doctypes(DOCTYPE)

# How many documents are in our index?
print(basic_s.count())
# Prints:
# 5

# Print articles with 'cookie' in the title.
print([item['title'] for item in basic_s.query(title__match='cookie')])
Example #54
def update_index(model_items,
                 model_name,
                 action='index',
                 bulk_size=100,
                 num_docs=-1,
                 start_date=None,
                 end_date=None,
                 refresh=True):
    '''
    Updates the index for the provided model_items.
    :param model_items: a list of model_items (django Model instances, or proxy instances) which are to be indexed/updated or deleted.
    If action is 'index', the model_items must be serializable objects. If action is 'delete', the model_items must be primary keys
corresponding to objects in the index.
    :param model_name: doctype, which must also be the model name.
    :param action: the action that you'd like to perform on this group of data. Must be in ('index', 'delete') and defaults to 'index.'
    :param bulk_size: bulk size for indexing. Defaults to 100.
    :param num_docs: maximum number of model_items from the provided list to be indexed.
    :param start_date: start date for indexing. Must be as YYYY-MM-DD.
    :param end_date: end date for indexing. Must be as YYYY-MM-DD.
    :param refresh: a boolean that determines whether to refresh the index, making all operations performed since the last refresh
    immediately available for search, instead of needing to wait for the scheduled Elasticsearch execution. Defaults to True.
    :note: If model_items contain multiple models, then num_docs is applied to *each* model. For example, if bulk_size is set to 5,
    and item contains models Article and Article2, then 5 model_items of Article *and* 5 model_items of Article2 will be indexed.
    '''
    src = Bungiesearch()

    if action == 'delete' and not hasattr(model_items, '__iter__'):
        raise ValueError(
            "If action is 'delete', model_items must be an iterable of primary keys."
        )

    logging.info('Getting index for model {}.'.format(model_name))
    for index_name in src.get_index(model_name):
        index_instance = src.get_model_index(model_name)
        model = index_instance.get_model()

        if num_docs == -1:
            if isinstance(model_items, (list, tuple)):
                num_docs = len(model_items)
            else:
                model_items = filter_model_items(index_instance, model_items,
                                                 model_name, start_date,
                                                 end_date)
                num_docs = model_items.count()

                if not model_items.ordered:
                    model_items = model_items.order_by('pk')
        else:
            logging.warning(
                'Limiting the number of model_items to {} to {}.'.format(
                    action, num_docs))

        logging.info('{} {} documents on index {}'.format(
            action, num_docs, index_name))
        prev_step = 0
        max_docs = num_docs + bulk_size if num_docs > bulk_size else bulk_size + 1
        for next_step in range(bulk_size, max_docs, bulk_size):
            logging.info(
                '{}: documents {} to {} of {} total on index {}.'.format(
                    action.capitalize(), prev_step, next_step, num_docs,
                    index_name))
            data = create_indexed_document(index_instance,
                                           model_items[prev_step:next_step],
                                           action)
            bulk_index(src.get_es_instance(),
                       data,
                       index=index_name,
                       doc_type=model.__name__,
                       raise_on_error=True)
            prev_step = next_step

        if refresh:
            src.get_es_instance().indices.refresh(index=index_name)
Example #55
index_name = "my_index"
type_name = "my_type"

from utils import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(index=index_name, doc_type=type_name, id=1,
         body={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1,
               "date": datetime(2013, 12, 8)})
es.index(index=index_name, doc_type=type_name + "2", id=1, body={"name": "data1", "value": "value1"}, parent=1)
es.index(index=index_name, doc_type=type_name, id=2,
         body={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2,
               "date": datetime(2013, 12, 8)})
es.index(index=index_name, doc_type=type_name + "2", id=2, body={"name": "data2", "value": "value2"}, parent=2)
es.index(index=index_name, doc_type=type_name, id=3, body={"name": "Bill Clinton", "parsedtext": """Bill is not
        nice guy""", "uuid": "33333", "position": 3, "date": datetime(2013, 12, 8)})

es.update(index=index_name, doc_type=type_name, id=2, body={"script": 'ctx._source.position += 1', "lang":"groovy"})

es.delete(index=index_name, doc_type=type_name, id=3)

from elasticsearch.helpers import bulk_index
bulk_index(es, [{"name": "Joe Tester", "parsedtext": "Joe Testere nice guy", "uuid": "11111", "position": 1,
               "date": datetime(2013, 12, 8), "_index":index_name, "_type":type_name, "_id":"1"},
               {"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy", "uuid": "22222", "position": 2,
               "date": datetime(2013, 12, 8)}
])

es.indices.delete(index_name)
Example #56
def index(data):
    print('Start indexing batch of', len(data))
    bulk_index(ES, data, index=INDEX, doc_type=DOC_TYPE, refresh=True)
    print('End indexing of current batch')