def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(es_scan(
            es,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_list, get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
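# A minimal sketch of the `_build_recid_to_uuid_map` helper called above but
# not defined in the snippet. It mirrors the recid -> UUID lookup that the
# `add_citation_counts` variants further below perform inline via Invenio's
# PersistentIdentifier model; treat the exact return type and imports as
# assumptions, not the original implementation.
from collections import Counter

from invenio_pidstore.models import PersistentIdentifier
from sqlalchemy.orm.exc import NoResultFound


def _build_recid_to_uuid_map(citations_lookup):
    uuid_lookup = Counter()
    for recid, citation_count in citations_lookup.iteritems():
        try:
            uuid = PersistentIdentifier.query.filter(
                PersistentIdentifier.object_type == 'rec',
                PersistentIdentifier.pid_value == str(recid)).one().object_uuid
            uuid_lookup[uuid] = citation_count
        except NoResultFound:
            # recids without a matching persistent identifier are dropped.
            continue
    return uuid_lookup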
def all_type_ids(self, doc_type):
    # A plain search only returns the first page of hits, so scan/scroll is
    # used instead to cover the whole index:
    # results = self.es.search(
    #     index=self.es_index, doc_type=doc_type, _source=False)
    # return [int(x['_id']) for x in results['hits']['hits']]
    results = es_scan(
        client=self.es,
        index=self.es_index,
        doc_type=doc_type,
        _source=False,
        scroll='10m',  # scroll contexts expect a time value, not a bare int
        query={})
    return [int(x['_id']) for x in results]
def add_citation_counts():
    from elasticsearch.helpers import bulk as es_bulk
    from elasticsearch.helpers import scan as es_scan
    from collections import Counter

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            yield {'_op_type': 'update',
                   '_index': 'hep',
                   '_type': 'record',
                   '_id': recid,
                   'doc': {'citation_count': citation_count}
                   }

    logger.info("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    for i, record in enumerate(es_scan(
            es,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index="hep",
            doc_type="record")):

        # update lookup dictionary based on references of the record
        if 'references' in record['_source']:
            unique_refs_ids = set()
            references = record['_source']['references']
            for reference in references:
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference.
                        recid = recid.pop()
                    unique_refs_ids.add(recid)

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

        if (i + 1) % LARGE_CHUNK_SIZE == 0:
            logger.info("Extracted citations from {} records".format(i + 1))
    logger.info("... DONE.")

    logger.info("Adding citation numbers...")
    success, failed = es_bulk(
        es,
        get_records_to_update_generator(citations_lookup),
        raise_on_exception=False,
        raise_on_error=False,
        stats_only=True)
    logger.info("... DONE: {} records updated with success. {} failures."
                .format(success, failed))
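# The snippet above assumes a handful of module-level names (`es`, `logger`,
# `LARGE_CHUNK_SIZE`). A minimal setup sketch, assuming a local Elasticsearch
# node; the host URL and chunk size are illustrative values, not taken from
# the original module.
import logging

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
logger = logging.getLogger(__name__)
LARGE_CHUNK_SIZE = 2000

logging.basicConfig(level=logging.INFO)
add_citation_counts()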
def scan_all(self,
             scroll='5m',  # TODO - hard coded timeout.
             ):
    """
    Most efficient way to scan all documents.
    """
    rr = es_scan(client=self.es,
                 index=self.index_name,
                 doc_type=self.doc_type,
                 scroll=scroll,
                 query={"query": {'match_all': {}}},
                 )
    return rr
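# The same match_all scan as `scan_all`, written as a self-contained sketch
# outside the class, assuming a local node; the index name is illustrative.
# `scan` yields hits lazily, so the whole index is never held in memory.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan as es_scan

client = Elasticsearch(['http://localhost:9200'])
for hit in es_scan(client=client,
                   index='my-index',  # illustrative index name
                   scroll='5m',
                   query={'query': {'match_all': {}}}):
    print(hit['_id'])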
from typing import Any, Dict, Iterable


def list_documents(
    self, index: str, query: Dict[str, Any] = None
) -> Iterable[Dict[str, Any]]:
    """
    List ALL documents of an elasticsearch index

    Parameters
    ----------
    index:
        The index name
    query:
        The es query to filter the results. Default: None

    Returns
    -------
    A sequence of documents resulting from applying the query to the index

    """
    return es_scan(self.__client__, query=query or {}, index=index)
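# A usage sketch for `list_documents`, assuming `searcher` is an instance of
# the class above with `__client__` already connected. The term query is
# standard ES query DSL; the index name and field are illustrative.
for doc in searcher.list_documents(
        index='records',
        query={'query': {'term': {'status': 'published'}}}):
    print(doc['_id'], doc['_source'])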
def add_citation_counts(chunk_size=500, request_timeout=40):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")
    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures."
               .format(success, failed))
def delete_all_documents(self):
    scroll = "5m"
    docs = es_scan(self.client, query={}, index=self.index_name,
                   doc_type=self.doc_type, scroll=scroll)
    for d in docs:
        self.client.delete(index=d["_index"], doc_type=d["_type"], id=d["_id"])
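# Deleting documents one request at a time is slow on large indices. A hedged
# alternative sketch using the bulk helper's 'delete' op_type (a standard
# elasticsearch-py feature); this method is an illustrative addition, not part
# of the original class.
from elasticsearch.helpers import bulk as es_bulk


def delete_all_documents_bulk(self):
    actions = ({'_op_type': 'delete',
                '_index': d['_index'],
                '_type': d['_type'],
                '_id': d['_id']}
               for d in es_scan(self.client, query={}, index=self.index_name,
                                doc_type=self.doc_type, scroll='5m'))
    # stats_only=True returns (successful, failed) counts instead of errors.
    return es_bulk(self.client, actions, stats_only=True)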
def sync(self):
    """
    Syncs ElasticSearch and Cassandra in both directions.

    It uses each database's conflict resolution properties to avoid race
    conditions with concurrent writes running during the sync. This is:

    * Cassandra LWW (last write wins)
    * ElasticSearch doc versions

    The function also prefers to let the other database decide whether to
    insert/replace the document or not, using the same semantics as above.
    This is MUCH faster than querying for the records and then deciding to
    send inserts/updates or do nothing. The drawback is that in C* it's
    impossible to distinguish whether the doc was updated or not.
    """
    # TODO: parallelize C* scanning by going through different parts of the
    #       ring in different machines/processes
    # TODO: parallelize ES scanning by scanning different ranges of ids in
    #       different machines/processes

    last_checkpoint = self.checkpoint_load()
    next_checkpoint = int(time.time())

    # sync from ES to C*
    docs = []
    if not last_checkpoint:
        es_cursor = es_scan(self.es_session, index=self.es_index,
                            doc_type=self.es_type,
                            query=self.make_es_filter_all())
    else:
        es_cursor = es_scan(self.es_session, index=self.es_index,
                            doc_type=self.es_type,
                            query=self.make_es_filter_version_range(
                                last_checkpoint, next_checkpoint))
    for hit in es_cursor:
        docs.append(hit["_source"])
        if len(docs) >= self.batch_size:
            successful, failed = self.ca_batch_insert_with_ts(docs)
            logger.info(
                "ElasticSearch -> Cassandra: %d successful or up to date, %d failed",
                successful, failed)
            docs = []
    if docs:
        successful, failed = self.ca_batch_insert_with_ts(docs)
        logger.info(
            "ElasticSearch -> Cassandra: %d successful or up to date, %d failed",
            successful, failed)

    # sync from C* to ES
    docs = []
    ca_cursor = self.ca_session.execute(self.ca_ps_select_all)
    for doc in ca_cursor:
        if last_checkpoint and (doc["version"] < last_checkpoint or
                                doc["version"] > next_checkpoint):
            continue
        docs.append(doc)
        if len(docs) >= self.batch_size:
            successful, failed = self.es_bulk_insert_versioned(docs)
            logger.info(
                "Cassandra -> ElasticSearch: %d successful, %d failed or up to date",
                successful, failed)
            docs = []
    if docs:
        successful, failed = self.es_bulk_insert_versioned(docs)
        logger.info(
            "Cassandra -> ElasticSearch: %d successful, %d failed or up to date",
            successful, failed)

    self.checkpoint_save(next_checkpoint)
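# A hedged sketch of the two query builders `sync` depends on. The full scan
# plausibly uses match_all, and the incremental one plausibly ranges over the
# same `version` field that the Cassandra side compares against the
# checkpoints; the exact query shape is an assumption, not the original code.
def make_es_filter_all(self):
    return {'query': {'match_all': {}}}


def make_es_filter_version_range(self, since, until):
    # Select docs whose version lies between the last checkpoint and the new
    # one, mirroring the C* -> ES filtering in `sync`.
    return {'query': {'range': {'version': {'gte': since, 'lte': until}}}}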
def add_citation_counts(chunk_size=500, request_timeout=10):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")
    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures."
               .format(success, failed))