Example #1
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.items()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {
                        'citation_count': citation_count
                    }
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(
            es_scan(es,
                    query={
                        '_source': 'references.recid',
                        'filter': {
                            'exists': {
                                'field': 'references.recid'
                            }
                        },
                        'size': LARGE_CHUNK_SIZE
                    },
                    scroll=u'2m',
                    index=index,
                    doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(
                list(
                    chain.from_iterable(
                        map(force_list,
                            get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
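
Stripped of the project-specific helpers (schema_to_index, dedupe_list, force_list, get_value, _build_recid_to_uuid_map), the pattern above is: stream hits with scan, aggregate in memory, then write partial updates back with bulk. A minimal self-contained sketch of that pattern, assuming a local node and a hypothetical my-index whose documents carry a flat references list of ids:

from collections import Counter

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk
from elasticsearch.helpers import scan as es_scan

es = Elasticsearch(['http://localhost:9200'])  # assumed local node

# Count how often each referenced id occurs across all documents.
counts = Counter()
for hit in es_scan(es, index='my-index', scroll='2m',
                   query={'_source': 'references'}):
    for recid in hit['_source'].get('references', []):
        counts[recid] += 1

# Push the counts back as partial updates, 500 actions per request.
actions = ({'_op_type': 'update',
            '_index': 'my-index',
            '_id': str(recid),
            'doc': {'citation_count': count}}
           for recid, count in counts.items())
success, failed = es_bulk(es, actions, chunk_size=500,
                          raise_on_error=False, stats_only=True)
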
Example #2
    def all_type_ids(self, doc_type):
        # A plain search would return only the first page of hits:
        # results = self.es.search(
        #     index=self.es_index, doc_type=doc_type, _source=False)
        # return [int(x['_id']) for x in results['hits']['hits']]
        results = es_scan(
            client=self.es, index=self.es_index, doc_type=doc_type,
            _source=False, scroll='10m',  # scroll needs a time unit, e.g. '10m'
            query={})
        return [int(x['_id']) for x in results]
Example #3
def add_citation_counts():
    from elasticsearch.helpers import bulk as es_bulk
    from elasticsearch.helpers import scan as es_scan
    from collections import Counter

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.items():
            yield {'_op_type': 'update',
                   '_index': 'hep',
                   '_type': 'record',
                   '_id': recid,
                   'doc': {'citation_count': citation_count}
                   }

    logger.info("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    for i, record in enumerate(es_scan(
            es,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index="hep",
            doc_type="record")):

        # update lookup dictionary based on references of the record
        if 'references' in record['_source']:
            unique_refs_ids = set()
            references = record['_source']['references']
            for reference in references:
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference.
                        recid = recid.pop()
                    unique_refs_ids.add(recid)

            # count each referenced recid once per citing record
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

        if (i + 1) % LARGE_CHUNK_SIZE == 0:
            logger.info("Extracted citations from {} records".format(i + 1))

    logger.info("... DONE.")
    logger.info("Adding citation numbers...")

    success, failed = es_bulk(
        es,
        get_records_to_update_generator(citations_lookup),
        raise_on_exception=False,
        raise_on_error=False,
        stats_only=True,
    )
    logger.info("... DONE: {} records updated with success. {} failures."
                .format(success, failed))
Example #4
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.items()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(es_scan(
            es,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_list, get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo('... DONE: {} records updated with success. {} failures.'.format(
        success, failed))
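
Examples #1 and #4 lean on three small project helpers. The stand-ins below are sketches matching the semantics the call sites imply, not the actual implementations:

def force_list(value):
    # Wrap a scalar in a list; pass lists through; map None to [].
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def dedupe_list(values):
    # Drop duplicates while preserving first-seen order.
    seen = set()
    result = []
    for value in values:
        if value not in seen:
            seen.add(value)
            result.append(value)
    return result


def get_value(record, dotted_path, default=None):
    # Walk a dotted path through nested dicts, fanning out over lists.
    current = [record]
    for key in dotted_path.split('.'):
        next_level = []
        for item in current:
            if isinstance(item, dict) and key in item:
                value = item[key]
                next_level.extend(value if isinstance(value, list) else [value])
        current = next_level
        if not current:
            return default
    return current
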
Example #5
    def scan_all(self,
                 scroll='5m',  # TODO - hard-coded timeout.
                 ):
        """
        Most efficient way to scan all documents.
        """

        rr = es_scan(client=self.es,
                     index=self.index_name,
                     doc_type=self.doc_type,
                     scroll=scroll,
                     query={"query": {"match_all": {}}},
                     )

        return rr
Example #6
    def list_documents(
        self, index: str, query: Optional[Dict[str, Any]] = None
    ) -> Iterable[Dict[str, Any]]:
        """
        List ALL documents of an Elasticsearch index.
        Parameters
        ----------
        index:
            The index name
        query:
            The ES query used to filter results. Defaults to None.

        Returns
        -------
        A sequence of documents resulting from applying the query on the index

        """
        return es_scan(self.__client__, query=query or {}, index=index)
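
For context, the wrapper above just hands the query to scan, which pages through the whole index internally. A usage sketch against a hypothetical index and field:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan as es_scan

client = Elasticsearch(['http://localhost:9200'])  # assumed local node

query = {'query': {'match': {'status': 'published'}}}  # hypothetical field
for doc in es_scan(client, query=query, index='my-index'):
    print(doc['_id'], doc['_source'])
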
Example #7
    def scan_all(
            self,
            scroll='5m',  # TODO - hard-coded timeout.
    ):
        """
        Most efficient way to scan all documents.
        """

        rr = es_scan(
            client=self.es,
            index=self.index_name,
            doc_type=self.doc_type,
            scroll=scroll,
            query={"query": {
                'match_all': {}
            }},
        )

        return rr
Example #8
def add_citation_counts(chunk_size=500, request_timeout=40):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.items():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid),
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                # count each referenced recid once per citing record
                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(success, failed))
Example #9
    def delete_all_documents(self):
        scroll = "5m"
        docs = es_scan(self.client, query={}, index=self.index_name,
                       doc_type=self.doc_type, scroll=scroll)
        for d in docs:
            self.client.delete(index=d["_index"], doc_type=d["_type"],
                               id=d["_id"])
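
Deleting documents one by one costs a round trip per hit. A sketch of the same cleanup using bulk delete actions instead (client and index name are assumptions):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk
from elasticsearch.helpers import scan as es_scan

client = Elasticsearch(['http://localhost:9200'])  # assumed local node


def delete_all_documents(index_name):
    # Stream only ids, then delete them in batched bulk requests.
    actions = ({'_op_type': 'delete',
                '_index': hit['_index'],
                '_id': hit['_id']}
               for hit in es_scan(client, index=index_name,
                                  query={'_source': False}))
    return es_bulk(client, actions, raise_on_error=False, stats_only=True)

Recent Elasticsearch versions also ship a server-side delete_by_query API, which avoids the client round trips entirely.
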
Example #10
    def sync(self):
        """
        Syncs ElasticSearch and Cassandra in both directions.

        It uses each database conflict resolution properties to avoid race conditions
        with concurrent writes running during the sync. This is:
            * Cassandra LWW (last write wins)
            * ElasticSearch doc versions

        The function also prefers to let the other Database decides when to insert/replace the document or not
        using the same semantics as above. This is MUCH faster than querying for the records and then decide to
        send insert/updates or do nothing.
        The drawback is that in C* it's impossible to distinguish if the doc was updated or not.
        """
        # TODO: paralyze C* scanning by going through different parts of the ring in different machines/processes
        # TODO: paralyze ES scanning by scanning different ranges of ids in different machines/processes

        last_checkpoint = self.checkpoint_load()
        next_checkpoint = int(time.time())

        # sync from ES to C*
        docs = []
        if not last_checkpoint:
            es_cursor = es_scan(self.es_session,
                                index=self.es_index,
                                doc_type=self.es_type,
                                query=self.make_es_filter_all())
        else:
            es_cursor = es_scan(self.es_session,
                                index=self.es_index,
                                doc_type=self.es_type,
                                query=self.make_es_filter_version_range(
                                    last_checkpoint, next_checkpoint))
        for hit in es_cursor:
            docs.append(hit["_source"])
            if len(docs) >= self.batch_size:
                successful, failed = self.ca_batch_insert_with_ts(docs)
                logger.info(
                    "ElasticSearch -> Cassandra: %d successful or up to date, %d failed",
                    successful, failed)
                docs = []
        if docs:
            successful, failed = self.ca_batch_insert_with_ts(docs)
            logger.info(
                "ElasticSearch -> Cassandra: %d successful or up to date, %d failed",
                successful, failed)

        # sync from C* to ES
        docs = []
        ca_cursor = self.ca_session.execute(self.ca_ps_select_all)
        for doc in ca_cursor:
            if last_checkpoint and (doc["version"] < last_checkpoint
                                    or doc["version"] > next_checkpoint):
                continue
            docs.append(doc)
            if len(docs) >= self.batch_size:
                successful, failed = self.es_bulk_insert_versioned(docs)
                logger.info(
                    "Cassandra -> ElasticSearch: %d successful, %d failed or up to date",
                    successful, failed)
                docs = []
        if docs:
            successful, failed = self.es_bulk_insert_versioned(docs)
            logger.info(
                "Cassandra -> ElasticSearch: %d successful, %d failed or up to date",
                successful, failed)

        self.checkpoint_save(next_checkpoint)
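
The es_bulk_insert_versioned helper is not shown. One plausible shape for it, sketched under the assumption that documents carry id and version fields and that the "doc versions" mentioned in the docstring mean Elasticsearch external versioning (this is not the author's actual code):

from elasticsearch.helpers import bulk as es_bulk


def es_bulk_insert_versioned(self, docs):
    # With 'external' versioning, Elasticsearch rejects any write whose
    # version is not greater than the stored one, so replayed or
    # out-of-order documents become no-ops instead of overwrites.
    actions = ({'_op_type': 'index',
                '_index': self.es_index,
                '_type': self.es_type,
                '_id': doc['id'],          # assumed primary-key field
                '_version': doc['version'],
                '_version_type': 'external',
                '_source': doc}
               for doc in docs)
    return es_bulk(self.es_session, actions,
                   raise_on_error=False, stats_only=True)
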
Example #11
def add_citation_counts(chunk_size=500, request_timeout=10):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.items():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid),
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                # count each referenced recid once per citing record
                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(success, failed))