def migrate_chunk(chunk, skip_files=False):
    models_committed.disconnect(index_after_commit)

    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(
                    raw_record,
                    skip_files=skip_files,
                )
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(index_after_commit)

def migrate_chunk(chunk):
    models_committed.disconnect(receive_after_model_commit)
    current_collections.unregister_signals()

    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(receive_after_model_commit)
    current_collections.register_signals()

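# The migrate_chunk examples above depend on project helpers (create_index_op,
# migrate_and_insert_record) that are not shown here. A minimal, hypothetical
# sketch of the underlying pattern they illustrate: build plain "index" action
# dicts and pass the batch to elasticsearch.helpers.bulk with stats_only=True,
# which returns (number_succeeded, number_failed) rather than a per-item error
# list. The client URL, index name and create_index_op body below are
# assumptions for illustration only.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

es = Elasticsearch(['http://localhost:9200'])  # assumed local cluster


def create_index_op(record):
    # Hypothetical stand-in for the project helper: the keys below are the
    # standard bulk action metadata fields understood by the helper.
    return {
        '_op_type': 'index',   # the default op type, shown explicitly
        '_index': 'records',   # assumed index name
        '_id': record['id'],
        '_source': record,
    }


records = [{'id': 1, 'title': 'first'}, {'id': 2, 'title': 'second'}]
succeeded, failed = es_bulk(
    es,
    (create_index_op(r) for r in records),
    stats_only=True,
    request_timeout=30,
)
print('indexed %d, failed %d' % (succeeded, failed))
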
def bulk(iterable, index=INDEX_NAME, doc_type=DOC_TYPE, action='index'):
    """
    Wrapper of elasticsearch's bulk method

    Converts an iterable of models to document operations and submits them
    to Elasticsearch. Returns a count of operations when done.

    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk
    https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    """
    actions = compact(
        dict_to_op(
            to_dict(model),
            index_name=INDEX_NAME,
            doc_type=DOC_TYPE,
            op_type=action,
        ) for model in iterable)

    # fail fast if there are no actions
    if not actions:
        return 0

    items, _ = es_bulk(es_conn, actions, doc_type=doc_type, index=index)

    return items

def consumerStats(self):
    consumers = self.get_data('/api/consumers')
    consumer_count = len(consumers)
    consumer_stats = {
        '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
        'rabbit_connection': self.conn_name,
        'consumers_current': consumer_count
    }
    es.index(index=get_es_index(), body=consumer_stats, doc_type='consumer-stats')

    items = []
    for consumer in consumers:
        for fields in BLACKLIST_FIELDS.get('consumerStats', []):
            consumer = delete_keys_from_dict(consumer, fields)
        consumer.update({
            '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
            'rabbit_connection': self.conn_name,
            '_index': get_es_index(),
            '_type': 'consumer-stats'
        })
        items.append(consumer)

    indexit = es_bulk(es, items)
    logger.info(
        "All done with consumerStats on connection: %s, items_inserted: %s, errors: %s"
        % (self.conn_name, indexit[0], indexit[1]))

def index_model(label):
    logger.debug('index_model')
    Model = None
    SerializerClass = None

    try:
        Model = apps.get_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    try:
        SerializerClass = search_config.get_serializer_for_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    if Model and SerializerClass:
        serializer = SerializerClass()
        conn = connections.get_connection()  # Get default connection
        queryset = Model.objects.all()
        if hasattr(queryset, 'published'):
            queryset = queryset.published()
        if serializer.related_object_fields:
            queryset = queryset.prefetch_related(*serializer.related_object_fields)
        model_docs = (serializer.create_document(item) for item in queryset)
        doc_dicts = (doc.to_dict(include_meta=True) for doc in model_docs)
        return es_bulk(conn, doc_dicts)

def bulk(ops, **kwargs):
    """
    A wrapper for elasticsearch.helpers.bulk() that waits for a yellow
    cluster and uses our ES client.
    """
    wait_for_yellow()
    return es_bulk(client, ops, **kwargs)

def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count},
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(es_scan(
            es,
            query={
                '_source': 'references.recid',
                'filter': {
                    'exists': {
                        'field': 'references.recid'
                    }
                },
                'size': LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(list(chain.from_iterable(map(
                force_list, get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))

def collect_tweets(
    es_client,
    track,
    twitter_consumer_key,
    twitter_consumer_secret,
    twitter_access_token_key,
    twitter_access_token_secret,
    elasticsearch_index="profanity-power-index",
    drop_index=False,
    batch_size=10,
):
    if es_client.indices.exists(elasticsearch_index):
        logger.warning(f"Index {elasticsearch_index} exists.")
        if drop_index:
            logger.warning(f"Dropping {elasticsearch_index}.")
            es_client.indices.delete(elasticsearch_index)
            logger.info(f"Creating {elasticsearch_index}.")
            es_client.indices.create(index=elasticsearch_index, body=TWEET_MAPPING)
    else:
        logger.info(f"Creating {elasticsearch_index}.")
        es_client.indices.create(index=elasticsearch_index, body=TWEET_MAPPING)
        logger.info(f"{elasticsearch_index} successfully created.")

    api = twitter.Api(
        consumer_key=twitter_consumer_key,
        consumer_secret=twitter_consumer_secret,
        access_token_key=twitter_access_token_key,
        access_token_secret=twitter_access_token_secret,
    )

    logger.info(f"Connecting to twitter stream. Tracking {', '.join(track)}.")
    tweet_stream = api.GetStreamFilter(track=track)
    tweet_to_bulk = curry(_tweet_to_bulk)(elasticsearch_index)
    tweet_doc_stream = thread_last(
        tweet_stream,
        # Filter out tweets that don't contain profanity.
        (filter, _contains_profanity),
        # Convert the tweets to a bulk-indexable document.
        (map, tweet_to_bulk),
        # Partition for bulk writes.
        (partition_all, batch_size),
    )

    logger.info(f"Sending tweets to {elasticsearch_index}.")
    failed = 0
    succeeded = 0
    logger.info(f"{failed + succeeded} tweets processed: "
                f"{succeeded} succeeded, {failed} failed.")
    # Since the doc stream is partitioned we get the tweets in batches.
    for tweet_batch in tweet_doc_stream:
        ok, fail = es_bulk(es_client, tweet_batch, stats_only=True)
        succeeded += ok
        failed += fail
        if (failed + succeeded) % 100 == 0:
            logger.info(f"{failed + succeeded} tweets processed: "
                        f"{succeeded} succeeded, {failed} failed.")

def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            recid = json = None
            try:
                recid, json = create_record(record, force=True, dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record',
                             '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    broken_output_fd = open(broken_output, "a")
                    print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()

def add_citation_counts():
    from elasticsearch.helpers import bulk as es_bulk
    from elasticsearch.helpers import scan as es_scan
    from collections import Counter

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            yield {'_op_type': 'update',
                   '_index': 'hep',
                   '_type': 'record',
                   '_id': recid,
                   'doc': {'citation_count': citation_count}
                   }

    logger.info("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    for i, record in enumerate(es_scan(
            es,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index="hep",
            doc_type="record")):

        # update lookup dictionary based on references of the record
        if 'references' in record['_source']:
            unique_refs_ids = set()
            references = record['_source']['references']
            for reference in references:
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference.
                        recid = recid.pop()
                    unique_refs_ids.add(recid)

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

        if (i + 1) % LARGE_CHUNK_SIZE == 0:
            logger.info("Extracted citations from {} records".format(i + 1))
    logger.info("... DONE.")

    logger.info("Adding citation numbers...")
    success, failed = es_bulk(es,
                              get_records_to_update_generator(citations_lookup),
                              raise_on_exception=False,
                              raise_on_error=False,
                              stats_only=True)
    logger.info("... DONE: {} records updated with success. {} failures.".format(
        success, failed))

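# The citation-count tasks above stream partial-document "update" actions from
# a Counter through es_bulk, which consumes any iterable lazily in chunks. A
# minimal Python 3 sketch of that pattern (the client URL, index name and
# field are assumptions, not taken from the code above):
from collections import Counter

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

es = Elasticsearch(['http://localhost:9200'])  # assumed local cluster
citations_lookup = Counter({'1234': 3, '5678': 1})  # doc id -> new count


def update_actions(lookup, index='records'):
    # Partial updates: only the fields under 'doc' are changed on each document.
    for doc_id, count in lookup.items():
        yield {
            '_op_type': 'update',
            '_index': index,
            '_id': doc_id,
            'doc': {'citation_count': count},
        }


success, failed = es_bulk(
    es,
    update_actions(citations_lookup),
    raise_on_exception=False,
    raise_on_error=False,
    stats_only=True,
)
print('{} updated, {} failed'.format(success, failed))
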
def index_website(self):
    """Start indexing the website and writing to elasticsearch"""

    logger.info('Indexing website: {0}'.format(self.base_url))

    def es_feeder(objects, index, doc_type):
        for obj in objects:
            logger.debug('Indexing object type={0} id={1}'
                         .format(doc_type, obj['_id']))
            yield {'_op_type': 'index',
                   '_index': index,
                   '_type': doc_type,
                   '_id': obj.pop('_id'),
                   '_source': obj}

    class_types = self.get_class_types()
    for clsdef in class_types:
        logger.info(u'Scanning object class: {0} "{1}"'
                    .format(clsdef['identifier'], clsdef['name']))

        # todo: put mappings for this type

        objects = self.scan_pages(clsdef['link'])
        doc_type = clsdef['identifier']

        if not self.full:
            # Filter out already existing objects
            already = set(self.all_type_ids(doc_type))
            logger.debug(
                'Excluding from download {0} already existing objects'
                .format(len(already)))
            _objects = objects
            objects = (o for o in _objects
                       if int(o['nodeId']) not in already)

        # Prepare objects for insertion..
        objects = (self.process_object(doc_type, raw_obj)
                   for raw_obj in objects)

        actions = es_feeder(objects, self.es_index, doc_type)
        es_bulk(self.es, actions=actions, chunk_size=50)

    self.es.indices.flush()

def migrate_chunk(chunk):
    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

def migrate_chunk(chunk):
    index_queue = []

    try:
        for raw_record in chunk:
            with db.session.begin_nested():
                record = migrate_and_insert_record(raw_record)
                if record:
                    index_queue.append(create_index_op(record))
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

def create_metric_index_to_es(self, metric):
    metric_node_list = metric.split('.')
    actions = []
    for i in range(0, len(metric_node_list)):
        if i:
            name = '.'.join(metric_node_list[:i+1])
            parent = '.'.join(metric_node_list[:i])
        else:
            name, parent = metric_node_list[i], ''
        node = metric_node_list[i]
        leaf = 0
        tag = {}
        if name == metric:
            leaf = 1
        if name.find(';') > -1:
            tag = {kv.split(':')[0]: kv.split(':')[1]
                   for kv in name.split(';')[1:]}
        body = {'query': {'term': {'name': name}}}
        res = self.es.search(index='metric', doc_type='_doc', body=body)
        if not res.get('hits', {}).get('hits'):
            action = {
                '_index': self.es_index,
                '_id': name,
                '_source': {
                    'name': name,
                    'hash': hashlib.md5(name.encode(encoding='utf-8')).hexdigest(),
                    'alias': '',
                    'path': name.split(';')[0] if tag else name,
                    'parent': parent,
                    'text': node,
                    'leaf': leaf,
                    'tag': tag
                }
            }
            actions.append(action)

    if actions:
        es_bulk(self.es, actions)

def migrate_chunk(chunk):
    index_queue = []

    try:
        for raw_record in chunk:
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            prod_record = InspireProdRecords(recid=recid)
            prod_record.marcxml = raw_record
            json_record = create_record(record)
            with db.session.begin_nested():
                try:
                    record = record_upsert(json_record)
                except ValidationError as e:
                    # Invalid record, will not get indexed
                    errors = "ValidationError: Record {0}: {1}".format(
                        recid, e
                    )
                    current_app.logger.warning(errors)
                    prod_record.valid = False
                    prod_record.errors = errors
                    db.session.merge(prod_record)
                    continue
            index_queue.append(create_index_op(record))
            prod_record.valid = True
            db.session.merge(prod_record)
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

def bulk(
    actions=None,
    chunk_size=500,
    max_chunk_bytes=settings.ES_BULK_MAX_CHUNK_BYTES,
    **kwargs,
):
    """Send data in bulk to Elasticsearch."""
    return es_bulk(
        get_client(),
        actions=actions,
        chunk_size=chunk_size,
        max_chunk_bytes=max_chunk_bytes,
        **kwargs,
    )

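# The wrapper above relies on the helper's built-in chunking (chunk_size and
# max_chunk_bytes). When per-document results matter more than one aggregate
# count, the related helper elasticsearch.helpers.streaming_bulk yields an
# (ok, item) pair for every action as each chunk is sent. A hedged sketch of
# that alternative; the client URL, index name and documents are assumptions:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch(['http://localhost:9200'])  # assumed local cluster

docs = ({'_index': 'events', '_id': i, '_source': {'n': i}} for i in range(1000))

for ok, item in streaming_bulk(es, docs, chunk_size=500, raise_on_error=False):
    if not ok:
        # item is the raw bulk response entry for the failed action.
        print('failed:', item)
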
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    models_committed.disconnect(index_after_commit)

    index_queue = []

    for recid in prod_recids:
        with db.session.begin_nested():
            record = migrate_record_from_mirror(
                LegacyRecordsMirror.query.get(recid),
                skip_files=skip_files,
            )
            if record and not record.get('deleted'):
                index_queue.append(create_index_op(record))
    db.session.commit()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        es,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )

    models_committed.connect(index_after_commit)

def es_bulk_insert_versioned(self, docs):
    """
    Insert docs in the corresponding ElasticSearch index using the bulk method

    :param docs: list of dicts representing documents
    :return: tuple of (num successful, num failed or up to date) writes
    """
    return es_bulk(self.es_session, ({
        "_index": self.es_index,
        "_type": self.es_type,
        "_id": doc["id"],
        "_version": doc["version"],
        "_version_type": "external",
        "_source": doc
    } for doc in docs), stats_only=True)

def add_documents(
    self,
    index: str,
    documents: List[Dict[str, Any]],
    routing: Callable[[Dict[str, Any]], str] = None,
    doc_id: Callable[[Dict[str, Any]], str] = None,
) -> int:
    """
    Adds or updates a set of documents in an index. Documents can contain
    partial document information.

    See <https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html>

    Parameters
    ----------
    index:
        The index name
    documents:
        The set of documents
    routing:
        The routing key
    doc_id

    Returns
    -------
    The number of failed documents
    """

    def map_doc_2_action(doc: Dict[str, Any]) -> Dict[str, Any]:
        """Configures bulk action"""
        return {
            "_op_type": "update",
            "_index": index,
            "_id": doc_id(doc) if doc_id else doc["_id"],
            "_routing": routing(doc) if routing else None,
            "doc": doc,
            "doc_as_upsert": True,
        }

    success, failed = es_bulk(
        self.__client__,
        index=index,
        actions=map(map_doc_2_action, documents),
        raise_on_error=True,
        refresh="wait_for",
    )

    return len(failed)

def BulkIndexRecords(self, records):
    '''
    Bulk Index Records

    IN
        self: EsHandler
        records: a list of records to bulk index
    '''
    logging.debug('[starting] Indexing Bulk Records')
    success_count, failed_items = es_bulk(
        self.esh,
        records,
        chunk_size=10000,
        raise_on_error=False
    )

    if len(failed_items) > 0:
        logging.error('{} index errors'.format(len(failed_items)))
        index_error_file = open("IndexErrors.txt", "a+")
        index_error_file.write(str(failed_items) + "\n")
        index_error_file.close()

    logging.debug('[finished] Indexing Bulk Records')

def BulkIndexRecords(self, records):
    '''
    Bulk Index Records

    IN
        self: EsHandler
        records: a list of records to bulk index
    '''
    ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
    success_count, failed_items = es_bulk(
        self.esh,
        records,
        chunk_size=10000,
        raise_on_error=False
    )

    if len(failed_items) > 0:
        ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
            os.getpid(), len(failed_items)))
        for failed_item in failed_items:
            ELASTIC_LOGGER.error(unicode(failed_item))

    ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records')

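# Both BulkIndexRecords variants above lean on the default stats_only=False
# behaviour: with raise_on_error=False, es_bulk returns the number of
# successful actions plus a list describing each failed action, which is what
# gets logged or written to IndexErrors.txt. A minimal hedged sketch of
# inspecting that error list; the client URL, index name and actions are
# assumptions for illustration:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk

es = Elasticsearch(['http://localhost:9200'])  # assumed local cluster
records = [{'_index': 'events', '_id': i, '_source': {'n': i}} for i in range(3)]

success_count, failed_items = es_bulk(es, records, chunk_size=10000,
                                      raise_on_error=False)
for failed_item in failed_items:
    # Each entry mirrors the bulk API response item for the failed action,
    # e.g. {'index': {'_id': ..., 'status': ..., 'error': {...}}}.
    print(failed_item)
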
def queueStats(self):
    queues = self.get_data('/api/queues')
    if queues is not None:
        items = []
        for queue in queues:
            for fields in BLACKLIST_FIELDS.get('queueStats', []):
                delete_keys_from_dict(queue, fields)
            es_stuff = {
                '@timestamp': arrow.utcnow().format('YYYY-MM-DDTHH:mm:ssZ'),
                'rabbit_connection': self.conn_name,
                '_index': get_es_index(),
                '_type': 'queue-stats',
            }
            es_stuff.update(queue)
            items.append(es_stuff)

        indexit = es_bulk(es, items)
        logger.info(
            "All done with queueStats on connection: %s, items_inserted: %s, errors: %s"
            % (self.conn_name, indexit[0], indexit[1]))

def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id
    )
    data_file = open(data_dir + '/' + batch_filename, 'r')
    run_id = data_file.readline().rstrip()
    events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]
        stats = es_bulk(
            client=es_client,
            stats_only=True,
            actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})

def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id)

    events = []
    with open(data_dir + '/' + batch_filename, 'r') as data_file:
        run_id = data_file.readline().rstrip()
        events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]
        stats = es_bulk(
            client=es_client,
            stats_only=True,
            actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})

def flush(self):
    """flush buffer"""
    res = es_bulk(self.esclient, self.buf)
    self.buf = []
    return res

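# flush() above drains a buffer of pre-built actions in a single bulk call. A
# hypothetical sketch of the buffering pattern it implies; the class name,
# buffer size, index name and client URL are assumptions:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk


class BufferedIndexer(object):
    def __init__(self, esclient, max_buffer=500):
        self.esclient = esclient
        self.buf = []
        self.max_buffer = max_buffer

    def add(self, doc, index='docs'):
        # Queue one index action; flush automatically once the buffer fills.
        self.buf.append({'_index': index, '_source': doc})
        if len(self.buf) >= self.max_buffer:
            self.flush()

    def flush(self):
        """flush buffer"""
        res = es_bulk(self.esclient, self.buf)
        self.buf = []
        return res


indexer = BufferedIndexer(Elasticsearch(['http://localhost:9200']))
indexer.add({'message': 'hello'})
indexer.flush()  # send whatever is left in the buffer
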
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()

def add_citation_counts(chunk_size=500, request_timeout=10):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")
    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(
        success, failed))

def save_counters(self):
    """
    Creates or updates counters in the database and in Elasticsearch based on
    the values stored in Redis since the last update. This method should be
    run as a CRON task.
    """
    es_actions = []
    datasets_downloads = {}
    for oper in (VIEWS_COUNT_PREFIX, DOWNLOADS_COUNT_PREFIX):
        last_save = self._get_last_save(oper)
        self.con.set(f'{oper}_last_save',
                     str(int(datetime.datetime.now().timestamp())))
        for view in settings.COUNTED_VIEWS:
            model_name = view[:-1].title()
            model = apps.get_model(view, model_name)
            model.is_indexable = False
            for k in self.con.scan_iter(f'{oper}:{last_save}:{view}:*'):
                obj_id = int(k.decode().split(':')[-1])
                try:
                    obj = model.objects.get(pk=obj_id)
                except model.DoesNotExist:
                    self.con.delete(k)
                    continue
                incr_val = int(self.con.get(k))
                counter = getattr(obj, oper) + incr_val
                setattr(obj, oper, counter)
                es_actions.append({
                    '_op_type': 'update',
                    '_index': view,
                    '_type': view[:-1],
                    '_id': obj_id,
                    'doc': {
                        oper: counter
                    }
                })
                if oper == DOWNLOADS_COUNT_PREFIX and hasattr(obj, 'dataset_id'):
                    if obj.dataset_id not in datasets_downloads:
                        datasets_downloads[obj.dataset_id] = obj.dataset.downloads_count
                    datasets_downloads[obj.dataset_id] += incr_val
                obj.save()
                self.con.delete(k)

    for dataset_id, counter in datasets_downloads.items():
        es_actions.append({
            '_op_type': 'update',
            '_index': 'datasets',
            '_type': 'dataset',
            '_id': dataset_id,
            'doc': {
                DOWNLOADS_COUNT_PREFIX: counter
            }
        })

    es_bulk(connections.get_connection(), actions=es_actions)

def add_citation_counts(chunk_size=500, request_timeout=40):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}
                       }
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")
    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo("... DONE: {} records updated with success. {} failures.".format(
        success, failed))

def bulk(self, body):
    bulk_actions = self._prepare_bulk(body)
    es_bulk(self.backend, bulk_actions, request_timeout=60, chunk_size=1000)