Example #1
def migrate_chunk(chunk, skip_files=False):
    models_committed.disconnect(index_after_commit)

    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(
                    raw_record,
                    skip_files=skip_files,
                )
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(index_after_commit)
Example #2
def migrate_chunk(chunk):
    models_committed.disconnect(receive_after_model_commit)
    current_collections.unregister_signals()

    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(receive_after_model_commit)
    current_collections.register_signals()
Example #3
def bulk(iterable, index=INDEX_NAME, doc_type=DOC_TYPE, action='index'):
    """
    Wrapper of elasticsearch's bulk method

    Converts an iterable of models to document operations and submits them to
    Elasticsearch.  Returns a count of operations when done.

    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk
    https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    """
    actions = compact(
        dict_to_op(
            to_dict(model),
            index_name=index,
            doc_type=doc_type,
            op_type=action,
        ) for model in iterable)

    # fail fast if there are no actions
    if not actions:
        return 0

    items, _ = es_bulk(es_conn, actions, doc_type=doc_type, index=index)

    return items
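Note: to_dict, dict_to_op and compact are project-specific helpers, but the actions they produce follow the standard format that elasticsearch.helpers.bulk expects. A minimal, self-contained sketch of that format (the index name, doc type and documents below are invented for illustration, not taken from the project above):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

es_conn = Elasticsearch()  # assumes a cluster reachable on the default host/port

# Each action is a plain dict; '_source' carries the document body and
# '_op_type' defaults to 'index' when omitted.
actions = [
    {
        '_op_type': 'index',
        '_index': 'books',      # hypothetical index name
        '_type': 'book',        # doc types match the ES versions these examples target
        '_id': 1,
        '_source': {'title': 'Example', 'pages': 123},
    },
]

# Returns (number of successful ops, list of errors) by default.
success, errors = es_bulk(es_conn, actions)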
Example #4
    def consumerStats(self):
        consumers = self.get_data('/api/consumers')

        consumer_count = len(consumers)
        consumer_stats = {
            '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
            'rabbit_connection': self.conn_name,
            'consumers_current': consumer_count
        }
        es.index(index=get_es_index(),
                 body=consumer_stats,
                 doc_type='consumer-stats')

        items = []
        for consumer in consumers:
            for fields in BLACKLIST_FIELDS.get('consumerStats', []):
                consumer = delete_keys_from_dict(consumer, fields)
            consumer.update({
                '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
                'rabbit_connection': self.conn_name,
                '_index': get_es_index(),
                '_type': 'consumer-stats',
            })
            items.append(consumer)
        indexit = es_bulk(es, items)
        logger.info(
            "All done with consumerStats on connection: %s, items_inserted: %s, errors: %s"
            % (self.conn_name, indexit[0], indexit[1]))
Example #5
def index_model(label):
    logger.debug('index_model')
    Model = None
    SerializerClass = None
    try:
        Model = apps.get_model(label)
    except LookupError as e:
        logger.error(e)
        raise e
    try:
        SerializerClass = search_config.get_serializer_for_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    if Model and SerializerClass:
        serializer = SerializerClass()
        conn = connections.get_connection()  # Get default connection

        queryset = Model.objects.all()
        if hasattr(queryset, 'published'):
            queryset = queryset.published()
        if serializer.related_object_fields:
            queryset = queryset.prefetch_related(*serializer.related_object_fields)

        model_docs = (serializer.create_document(item) for item in queryset)
        doc_dicts = (doc.to_dict(include_meta=True) for doc in model_docs)

        return es_bulk(conn, doc_dicts)
Example #6
def index_model(label):
    logger.debug('index_model')
    Model = None
    SerializerClass = None
    try:
        Model = apps.get_model(label)
    except LookupError as e:
        logger.error(e)
        raise e
    try:
        SerializerClass = search_config.get_serializer_for_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    if Model and SerializerClass:
        serializer = SerializerClass()
        conn = connections.get_connection()  # Get default connection

        queryset = Model.objects.all()
        if hasattr(queryset, 'published'):
            queryset = queryset.published()
        if serializer.related_object_fields:
            queryset = queryset.prefetch_related(
                *serializer.related_object_fields)

        model_docs = (serializer.create_document(item) for item in queryset)
        doc_dicts = (doc.to_dict(include_meta=True) for doc in model_docs)

        return es_bulk(conn, doc_dicts)
Example #7
def bulk(ops, **kwargs):
    """
    A wrapper for elasticsearch.helpers.bulk() that waits for a yellow
    cluster and uses our ES client.
    """
    wait_for_yellow()
    return es_bulk(client, ops, **kwargs)
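Since the wrapper forwards **kwargs untouched to elasticsearch.helpers.bulk, callers can tune batching and timeouts without changing the helper itself. A hedged usage sketch (the action stream, index name and numbers are illustrative only):

ops = (
    {'_op_type': 'index', '_index': 'logs', '_type': 'log', '_id': i, '_source': {'seq': i}}
    for i in range(10000)
)

# chunk_size and request_timeout are standard helpers.bulk keyword arguments
# that simply ride along via **kwargs once wait_for_yellow() returns.
success, errors = bulk(ops, chunk_size=500, request_timeout=30)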
Example #8
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {
                        'citation_count': citation_count
                    }
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(
            es_scan(es,
                    query={
                        '_source': 'references.recid',
                        'filter': {
                            'exists': {
                                'field': 'references.recid'
                            }
                        },
                        'size': LARGE_CHUNK_SIZE
                    },
                    scroll=u'2m',
                    index=index,
                    doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(
                list(
                    chain.from_iterable(
                        map(force_list,
                            get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
Example #9
def collect_tweets(
    es_client,
    track,
    twitter_consumer_key,
    twitter_consumer_secret,
    twitter_access_token_key,
    twitter_access_token_secret,
    elasticsearch_index="profanity-power-index",
    drop_index=False,
    batch_size=10,
):

    if es_client.indices.exists(elasticsearch_index):
        logger.warning(f"Index {elasticsearch_index} exists.")
        if drop_index:
            logger.warning(f"Dropping {elasticsearch_index}.")
            es_client.indices.delete(elasticsearch_index)
            logger.info(f"Creating {elasticsearch_index}.")
            es_client.indices.create(index=elasticsearch_index,
                                     body=TWEET_MAPPING)
    else:
        logger.info(f"Creating {elasticsearch_index}.")
        es_client.indices.create(index=elasticsearch_index, body=TWEET_MAPPING)
        logger.info(f"{elasticsearch_index} successfully created.")

    api = twitter.Api(
        consumer_key=twitter_consumer_key,
        consumer_secret=twitter_consumer_secret,
        access_token_key=twitter_access_token_key,
        access_token_secret=twitter_access_token_secret,
    )

    logger.info(f"Connecting to twitter stream. Tracking {', '.join(track)}.")
    tweet_stream = api.GetStreamFilter(track=track)

    tweet_to_bulk = curry(_tweet_to_bulk)(elasticsearch_index)
    tweet_doc_stream = thread_last(
        tweet_stream,
        # Filter out tweets that don't contain profanity.
        (filter, _contains_profanity),
        # Convert the tweets to a bulk-indexable document.
        (map, tweet_to_bulk),
        # Partition for bulk writes.
        (partition_all, batch_size),
    )

    logger.info(f"Sending tweets to {elasticsearch_index}.")
    failed = 0
    succeeded = 0
    logger.info(f"{failed + succeeded} tweets processed: "
                f"{succeeded} succeeded, {failed} failed.")
    # Since the doc stream is partitioned we get the tweets in batches.
    for tweet_batch in tweet_doc_stream:
        ok, fail = es_bulk(es_client, tweet_batch, stats_only=True)
        succeeded += ok
        failed += fail
        if (failed + succeeded) % 100 == 0:
            logger.info(f"{failed + succeeded} tweets processed: "
                        f"{succeeded} succeeded, {failed} failed.")
Example #10
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            recid = json = None
            try:
                recid, json = create_record(record,
                                            force=True, dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record', '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    broken_output_fd = open(broken_output, "a")
                    print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
Example #11
def add_citation_counts():
    from elasticsearch.helpers import bulk as es_bulk
    from elasticsearch.helpers import scan as es_scan
    from collections import Counter

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            yield {'_op_type': 'update',
                   '_index': 'hep',
                   '_type': 'record',
                   '_id': recid,
                   'doc': {'citation_count': citation_count}
                   }

    logger.info("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    for i, record in enumerate(es_scan(
            es,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index="hep",
            doc_type="record")):

        # update lookup dictionary based on references of the record
        if 'references' in record['_source']:
            unique_refs_ids = set()
            references = record['_source']['references']
            for reference in references:
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference.
                        recid = recid.pop()
                    unique_refs_ids.add(recid)

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

        if (i + 1) % LARGE_CHUNK_SIZE == 0:
            logger.info("Extracted citations from {} records".format(i + 1))

    logger.info("... DONE.")
    logger.info("Adding citation numbers...")

    success, failed = es_bulk(es, get_records_to_update_generator(citations_lookup), raise_on_exception=False, raise_on_error=False, stats_only=True)
    logger.info("... DONE: {} records updated with success. {} failures.".format(success, failed))
Example #12
    def index_website(self):
        """Start indexing the website and writing to elasticsearch"""

        logger.info('Indexing website: {0}'.format(self.base_url))

        def es_feeder(objects, index, doc_type):
            for obj in objects:
                logger.debug('Indexing object type={0} id={1}'
                             .format(doc_type, obj['_id']))
                yield {'_op_type': 'index',
                       '_index': index,
                       '_type': doc_type,
                       '_id': obj.pop('_id'),
                       '_source': obj}

        class_types = self.get_class_types()

        for clsdef in class_types:
            logger.info(u'Scanning object class: {0} "{1}"'
                        .format(clsdef['identifier'], clsdef['name']))

            # todo: put mappings for this type

            objects = self.scan_pages(clsdef['link'])
            doc_type = clsdef['identifier']

            if not self.full:
                # Filter out already existing objects
                already = set(self.all_type_ids(doc_type))
                logger.debug(
                    'Excluding from download {0} already existing objects'
                    .format(len(already)))
                _objects = objects
                objects = (o for o in _objects
                           if int(o['nodeId']) not in already)

            # Prepare objects for insertion..
            objects = (self.process_object(doc_type, raw_obj)
                       for raw_obj in objects)

            actions = es_feeder(objects, self.es_index, doc_type)
            es_bulk(self.es, actions=actions, chunk_size=50)
            self.es.indices.flush()
Example #13
def migrate_chunk(chunk):
    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )
Example #14
def migrate_chunk(chunk):
    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )
Example #15
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(es_scan(
            es,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_list, get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))
Example #16
    def create_metric_index_to_es(self, metric):
        metric_node_list = metric.split('.')
        actions = []
        for i in range(0, len(metric_node_list)):
            if i:
                name = '.'.join(metric_node_list[:i+1])
                parent = '.'.join(metric_node_list[:i])
            else:
                name, parent = metric_node_list[i], ''

            node = metric_node_list[i]
            leaf = 0
            tag = {}
            if name == metric:
                leaf = 1
                if name.find(';') > -1:
                    tag = {kv.split(':')[0]: kv.split(':')[1] for kv in name.split(';')[1:]}

            body = {'query': {'term': {'name': name}}}
            res = self.es.search(index='metric', doc_type='_doc', body=body)

            if not res.get('hits', {}).get('hits'):
                action = {
                    '_index': self.es_index,
                    '_id': name,
                    '_source': {
                        'name': name,
                        'hash': hashlib.md5(name.encode(encoding='utf-8')).hexdigest(),
                        'alias': '',
                        'path': name.split(';')[0] if tag else name,
                        'parent': parent,
                        'text': node,
                        'leaf': leaf,
                        'tag': tag
                    }
                }
                actions.append(action)

        if actions:
            es_bulk(self.es, actions)
Example #17
def migrate_chunk(chunk):
    index_queue = []
    try:
        for raw_record in chunk:
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            prod_record = InspireProdRecords(recid=recid)
            prod_record.marcxml = raw_record
            json_record = create_record(record)
            with db.session.begin_nested():
                try:
                    record = record_upsert(json_record)
                except ValidationError as e:
                    # Invalid record, will not get indexed
                    errors = "ValidationError: Record {0}: {1}".format(
                        recid, e
                    )
                    current_app.logger.warning(errors)
                    prod_record.valid = False
                    prod_record.errors = errors
                    db.session.merge(prod_record)
                    continue

                index_queue.append(create_index_op(record))

                prod_record.valid = True
                db.session.merge(prod_record)
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )
Example #18
def bulk(
    actions=None,
    chunk_size=500,
    max_chunk_bytes=settings.ES_BULK_MAX_CHUNK_BYTES,
    **kwargs,
):
    """Send data in bulk to Elasticsearch."""
    return es_bulk(
        get_client(),
        actions=actions,
        chunk_size=chunk_size,
        max_chunk_bytes=max_chunk_bytes,
        **kwargs,
    )
Example #19
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    models_committed.disconnect(index_after_commit)

    index_queue = []

    for recid in prod_recids:
        with db.session.begin_nested():
            record = migrate_record_from_mirror(
                LegacyRecordsMirror.query.get(recid),
                skip_files=skip_files,
            )
            if record and not record.get('deleted'):
                index_queue.append(create_index_op(record))
    db.session.commit()
    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(index_after_commit)
Example #20
    def es_bulk_insert_versioned(self, docs):
        """
        Insert docs in the corresponding ElasticSearch index using the bulk method
        :param docs: list of dicts representing documents
        :return: tuple of (num successful, num failed or up to date) writes
        """
        return es_bulk(
            self.es_session,
            ({
                "_index": self.es_index,
                "_type": self.es_type,
                "_id": doc["id"],
                "_version": doc["version"],
                "_version_type": "external",
                "_source": doc,
            } for doc in docs),
            stats_only=True,
        )
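The interesting detail above is '_version_type': 'external': Elasticsearch only applies a write when the supplied version is greater than the one already stored, so replaying the same batch does not overwrite newer data. A hedged, self-contained sketch of that behaviour (index name, doc type and documents are invented; raise_on_error=False is added so version conflicts are counted instead of raised):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

es = Elasticsearch()  # assumes a reachable cluster

docs = [
    {'id': '42', 'version': 1, 'title': 'first revision'},
    {'id': '42', 'version': 2, 'title': 'second revision'},
    {'id': '42', 'version': 2, 'title': 'replay'},  # not greater: rejected as a conflict
]

actions = (
    {
        '_index': 'articles',          # hypothetical index
        '_type': 'article',
        '_id': doc['id'],
        '_version': doc['version'],
        '_version_type': 'external',
        '_source': doc,
    }
    for doc in docs
)

# On a fresh index this should report 2 successes and 1 failure:
# the replayed document is dropped rather than overwriting the stored one.
ok, failed = es_bulk(es, actions, stats_only=True, raise_on_error=False)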
Example #21
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    models_committed.disconnect(index_after_commit)

    index_queue = []

    for recid in prod_recids:
        with db.session.begin_nested():
            record = migrate_record_from_mirror(
                LegacyRecordsMirror.query.get(recid),
                skip_files=skip_files,
            )
            if record and not record.get('deleted'):
                index_queue.append(create_index_op(record))
    db.session.commit()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(index_after_commit)
Example #22
    def add_documents(
        self,
        index: str,
        documents: List[Dict[str, Any]],
        routing: Callable[[Dict[str, Any]], str] = None,
        doc_id: Callable[[Dict[str, Any]], str] = None,
    ) -> int:
        """
        Adds or updates a set of documents in an index. Documents can contain
        partial information of the document.

        See <https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html>

        Parameters
        ----------
        index:
            The index name
        documents:
            The set of documents
        routing:
            Optional callable that computes the routing key for a document
        doc_id:
            Optional callable that computes the document id; falls back to the
            document's '_id' field

        Returns
        -------
            The number of failed documents
        """

        def map_doc_2_action(doc: Dict[str, Any]) -> Dict[str, Any]:
            """Configures bulk action"""
            return {
                "_op_type": "update",
                "_index": index,
                "_id": doc_id(doc) if doc_id else doc["_id"],
                "_routing": routing(doc) if routing else None,
                "doc": doc,
                "doc_as_upsert": True,
            }

        success, failed = es_bulk(
            self.__client__,
            index=index,
            actions=map(map_doc_2_action, documents),
            raise_on_error=True,
            refresh="wait_for",
        )
        return len(failed)
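Because each action uses '_op_type': 'update' with doc_as_upsert set to True, a partial document is merged into the existing one, or created when it does not exist yet. A hedged sketch of the same pattern without the class wrapper (client, index name and documents are illustrative):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

client = Elasticsearch()  # assumes a reachable cluster

# Partial documents: only the fields that change need to be present.
partial_docs = [
    {'_id': 'user-1', 'last_seen': '2021-01-01'},
    {'_id': 'user-2', 'last_seen': '2021-01-02'},
]

actions = [
    {
        '_op_type': 'update',
        '_index': 'users',          # hypothetical index
        '_id': doc['_id'],
        'doc': doc,
        'doc_as_upsert': True,      # create the document if it is missing
    }
    for doc in partial_docs
]

# refresh='wait_for' mirrors the call above: it blocks until the writes
# are visible to search before returning.
success, failed = es_bulk(client, actions, raise_on_error=True, refresh='wait_for')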
Example #23
    def BulkIndexRecords(self, records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        logging.debug('[starting] Indexing Bulk Records')
        success_count, failed_items = es_bulk(self.esh,
                                              records,
                                              chunk_size=10000,
                                              raise_on_error=False)

        if len(failed_items) > 0:
            logging.error('{} index errors'.format(len(failed_items)))
            index_error_file = open("IndexErrors.txt", "a+")
            index_error_file.write(str(failed_items) + "\n")
            index_error_file.close()

        logging.debug('[finished] Indexing Bulk Records')
Example #24
    def BulkIndexRecords(self, records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
        success_count, failed_items = es_bulk(self.esh,
                                              records,
                                              chunk_size=10000,
                                              raise_on_error=False)

        if len(failed_items) > 0:
            ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
                os.getpid(), len(failed_items)))
            for failed_item in failed_items:
                ELASTIC_LOGGER.error(unicode(failed_item))

        ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records')
Example #25
    def queueStats(self):
        queues = self.get_data('/api/queues')
        if queues is not None:
            items = []
            for queue in queues:
                for fields in BLACKLIST_FIELDS.get('queueStats', []):
                    delete_keys_from_dict(queue, fields)

                es_stuff = {
                    '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
                    'rabbit_connection': self.conn_name,
                    '_index': get_es_index(),
                    '_type': 'queue-stats',
                }
                es_stuff.update(queue)
                items.append(es_stuff)
            indexit = es_bulk(es, items)
            logger.info(
                "All done with queueStats on connection: %s, items_inserted: %s, errors: %s"
                % (self.conn_name, indexit[0], indexit[1]))
Example #26
    def BulkIndexRecords(self, records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        logging.debug('[starting] Indexing Bulk Records')
        success_count, failed_items = es_bulk(
            self.esh,
            records,
            chunk_size=10000,
            raise_on_error=False
        )

        if len(failed_items) > 0:
            logging.error('{} index errors'.format(len(failed_items)))
            index_error_file = open("IndexErrors.txt", "a+")
            index_error_file.write(str(failed_items) + "\n")
            index_error_file.close()

        logging.debug('[finished] Indexing Bulk Records')
Example #27
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id
    )
    data_file = open(data_dir + '/' + batch_filename, 'r')
    run_id = data_file.readline().rstrip()
    events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]

        stats = es_bulk(
            client=es_client, stats_only=True, actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})
Example #28
    def BulkIndexRecords(self, records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
        success_count, failed_items = es_bulk(
            self.esh,
            records,
            chunk_size=10000,
            raise_on_error=False
        )

        if len(failed_items) > 0:
            ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
                os.getpid(), len(failed_items)
            ))
            for failed_item in failed_items:
                ELASTIC_LOGGER.error(unicode(failed_item))

        ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records')
Example #29
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id)
    events = []
    with open(data_dir + '/' + batch_filename, 'r') as data_file:
        run_id = data_file.readline().rstrip()
        events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]

        stats = es_bulk(
            client=es_client,
            stats_only=True,
            actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})
Example #30
def bulk(iterable, index=INDEX_NAME, doc_type=DOC_TYPE, action='index'):
    """
    Wrapper of elasticsearch's bulk method

    Converts an iterable of models to document operations and submits them to
    Elasticsearch.  Returns a count of operations when done.

    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk
    https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    """
    actions = compact(dict_to_op(
        to_dict(model),
        index_name=index,
        doc_type=doc_type,
        op_type=action,
    ) for model in iterable)

    # fail fast if there are no actions
    if not actions:
        return 0

    items, _ = es_bulk(es_conn, actions, doc_type=doc_type, index=index)

    return items
Example #31
    def flush(self):
        """flush buffer"""

        res = es_bulk(self.esclient, self.buf)
        self.buf = []
        return res
Example #32
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
Example #33
def add_citation_counts(chunk_size=500, request_timeout=10):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(PersistentIdentifier.object_type == "rec", PersistentIdentifier.pid_value == str(recid)).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(success, failed))
Example #34
    def save_counters(self):
        """
        Creates or updates the counters in the database and in Elasticsearch,
        based on the values recorded in Redis since the last update.

        This method should be invoked as a CRON job.
        """

        es_actions = []
        datasets_downloads = {}

        for oper in (VIEWS_COUNT_PREFIX, DOWNLOADS_COUNT_PREFIX):
            last_save = self._get_last_save(oper)
            self.con.set(f'{oper}_last_save',
                         str(int(datetime.datetime.now().timestamp())))

            for view in settings.COUNTED_VIEWS:
                model_name = view[:-1].title()
                model = apps.get_model(view, model_name)
                model.is_indexable = False
                for k in self.con.scan_iter(f'{oper}:{last_save}:{view}:*'):
                    obj_id = int(k.decode().split(':')[-1])
                    try:
                        obj = model.objects.get(pk=obj_id)
                    except model.DoesNotExist:
                        self.con.delete(k)
                        continue

                    incr_val = int(self.con.get(k))
                    counter = getattr(obj, oper) + incr_val
                    setattr(obj, oper, counter)
                    es_actions.append({
                        '_op_type': 'update',
                        '_index': view,
                        '_type': view[:-1],
                        '_id': obj_id,
                        'doc': {
                            oper: counter
                        }
                    })
                    if oper == DOWNLOADS_COUNT_PREFIX and hasattr(
                            obj, 'dataset_id'):
                        if obj.dataset_id not in datasets_downloads:
                            datasets_downloads[
                                obj.dataset_id] = obj.dataset.downloads_count
                        datasets_downloads[obj.dataset_id] += incr_val

                    obj.save()
                    self.con.delete(k)

        for dataset_id, counter in datasets_downloads.items():
            es_actions.append({
                '_op_type': 'update',
                '_index': 'datasets',
                '_type': 'dataset',
                '_id': dataset_id,
                'doc': {
                    DOWNLOADS_COUNT_PREFIX: counter
                }
            })

        es_bulk(connections.get_connection(), actions=es_actions)
Example #35
def add_citation_counts(chunk_size=500, request_timeout=40):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(PersistentIdentifier.object_type == "rec", PersistentIdentifier.pid_value == str(recid)).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(success, failed))
Example #36
    def bulk(self, body):
        bulk_actions = self._prepare_bulk(body)
        es_bulk(self.backend,
                bulk_actions,
                request_timeout=60,
                chunk_size=1000)