def run(self):
    """Harvest all records via OAI-PMH and index them into Elasticsearch.

    Records that fail to index are dumped, together with the traceback,
    into a timestamped ``.err`` file under ``self.path``; harvesting then
    continues with the next record.
    """
    timestamp = datetime.utcnow()
    sickle = Sickle('http://invenio.nusl.cz/oai2d/')
    # Parse OAI responses with the MARCXML-aware record class.
    sickle.class_mapping['ListRecords'] = MarcXMLParser
    sickle.class_mapping['GetRecord'] = MarcXMLParser
    oai_logger.info("Loading records")
    records = sickle.ListRecords(metadataPrefix='marcxml')
    # One error file per run, named after the harvest start time
    # (hoisted out of the loop — it is loop-invariant).
    file_name = f'{timestamp.strftime("%Y%m%dT%H%M%S")}.err'
    file_path = os.path.join(self.path, file_name)
    for idx, record in enumerate(records):
        print(f"{idx}. {record.id}")
        oai_logger.info(f"{idx}. {record.id}")
        try:
            current_search_client.index(
                index=self.index,
                id=record.marc_dict["001"],  # MARC field 001 = record id
                body=record.marc_dict
            )
        except Exception:
            # BUG FIX: was a bare ``except:``, which also swallowed
            # SystemExit/KeyboardInterrupt. Best-effort behavior is kept:
            # log the failure and move on.
            exc_traceback = traceback.format_exc()
            print(exc_traceback)
            print("\n\n\n")
            with open(file_path, "a") as f:
                f.write(
                    f"Dictionary: {record.marc_dict}\n\n"
                    f"{exc_traceback}\n\n\n\n")
            continue
def _update_status_in_doc(cls, record, es_item): """Update the status of a given item in the document index. :param record: an item object :param es_item: a dict of the elasticsearch item """ # retrieve the document in the corresponding es index document_pid = extracted_data_from_ref(record.get('document')) doc = next(DocumentsSearch().extra(version=True).filter( 'term', pid=document_pid).scan()) # update the item status in the document data = doc.to_dict() for hold in data.get('holdings', []): for item in hold.get('items', []): if item['pid'] == record.pid: item['status'] = record['status'] break else: continue break # reindex the document with the same version current_search_client.index(index=DocumentsSearch.Meta.index, id=doc.meta.id, body=data, version=doc.meta.version, version_type='external_gte')
def orcid_test(mock_user, request):
    """Orcid test fixture.

    Indexes a fixed author record (id 10) into ``records-authors``,
    appends a matching author (with an ORCID-linked profile ref) to
    literature record 782466, and returns a mocked ORCID API together
    with the modified record. The indexed document is deleted again at
    teardown.
    """
    app = mock_user.app

    def teardown(app):
        # Remove the author document indexed below.
        with app.app_context():
            es.delete(index='records-authors', doc_type='authors', id=10)

    record = {
        "name": {
            "status": "ACTIVE",
            "preferred_name": "Full Name",
            "value": "Full Name"
        },
        "$schema": "http://localhost:5000/schemas/records/authors.json",
        "control_number": "10",
        "self": {
            "$ref": "http://localhost:5000/api/authors/10"
        },
        "ids": [{
            "type": "INSPIRE",
            "value": "INSPIRE-0000000"
        }, {
            "type": "ORCID",
            "value": "0000-0001-9412-8627"
        }],
        "self_recid": 10,
        "earliest_date": "2015-09-23"
    }
    request.addfinalizer(lambda: teardown(app))
    with app.app_context():
        es.index(index='records-authors', doc_type='authors', id=10,
                 body=record)
        # Make the document visible to search immediately.
        es.indices.refresh('records-authors')

    # NOTE: ``record`` is rebound here — the dict above is only used for
    # indexing; from this point on it is the literature DB record.
    record = get_db_record('literature', 782466)
    record['authors'].append({
        u'affiliations': [{
            u'value': u'St. Petersburg, INP'
        }],
        u'curated_relation': True,
        u'full_name': u'Full, Name',
        u'profile': {
            u'__url__': u'http://inspirehep.net/record/00000000'
        },
        u'record': {
            u'$ref': u'http://localhost:5000/api/authors/10'
        }
    })
    mock_orcid_api = OrcidApiMock(1)
    return mock_orcid_api, record
def test_appoint_profile_from_claimed_signature(small_app):
    """Check the module for the case where claimed signature takes
    everything.
    """
    from inspirehep.modules.disambiguation.tasks import disambiguation_clustering, update_authors_recid

    # Unclaimed signature: initially assigned recid "2".
    old_record_id = str(PersistentIdentifier.get("literature", 11883).object_uuid)
    old_record = get_es_record_by_uuid(old_record_id)
    old_author_uuid = old_record["authors"][0]["uuid"]

    # Add phonetic block to the record.
    old_record["authors"][0]["signature_block"] = "HAGp"
    old_record["authors"][0]["recid"] = "2"
    es.index(index="records-hep", doc_type="hep", id=old_record_id, body=old_record)
    es.indices.refresh("records-hep")

    # Claimed signature (curated_relation=True) with recid "314159265".
    record_id = str(PersistentIdentifier.get("literature", 1358492).object_uuid)
    record = get_es_record_by_uuid(record_id)
    author_uuid = record["authors"][0]["uuid"]

    # Add phonetic block to the record.
    record["authors"][0]["signature_block"] = "HAGp"
    record["authors"][0]["recid"] = "314159265"
    record["authors"][0]["curated_relation"] = True
    es.index(index="records-hep", doc_type="hep", id=record_id, body=record)
    es.indices.refresh("records-hep")

    # The mocked Beard response clusters both signatures together; the
    # clustering task is run synchronously via side_effect.
    with patch("celery.current_app.send_task", return_value=_BeardObject(({"2": [old_author_uuid, author_uuid]}, {}))):
        with patch(
            "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
            side_effect=update_authors_recid
        ):
            disambiguation_clustering("HAGp")

    # The claimed recid wins for both signatures in the cluster.
    assert Record.get_record(old_record_id)["authors"][0]["recid"] == "314159265"
    assert Record.get_record(record_id)["authors"][0]["recid"] == "314159265"
def index_record_modification(sender, changes):
    """Example handler for indexing record metadata.

    :param sender: signal sender (unused).
    :param changes: iterable of ``(obj, change)`` pairs, where ``change``
        is one of ``'insert'``, ``'update'`` or ``'delete'``.
    """
    for obj, change in changes:
        if isinstance(obj, RecordMetadata):
            if change in ("insert", "update"):
                current_search_client.index(index="records",
                                            doc_type="record",
                                            id=obj.id,
                                            body=obj.json)
            elif change == "delete":
                # BUG FIX: the original ``change in ("delete")`` was a
                # *substring* test — ``("delete")`` is the string
                # "delete", not a one-element tuple, so e.g. "del" also
                # matched. Use equality instead.
                current_search_client.delete(index="records",
                                             doc_type="record",
                                             id=obj.id)
def index(self, index_name=None, doc_type=None):
    """Index the workflow record into desired index/doc_type."""
    # Resolve index/doc_type either from the explicit arguments or from
    # the configured mapping for this workflow's data type.
    data_type = self["_workflow"]["data_type"]
    config = current_app.config['WORKFLOWS_UI_DATA_TYPES'].get(data_type)
    if not (config or (index_name and doc_type)):
        return
    current_search_client.index(
        id=str(self['id']),
        index=index_name or config.get('search_index'),
        doc_type=doc_type or config.get('search_type'),
        body=self.dumps(),
    )
def fixtures():
    """Example fixtures."""
    # Index sample records; ES >= 7 dropped custom mapping types.
    mapping_type = 'example' if ES_VERSION[0] < 7 else '_doc'
    samples = [
        {'title': 'Public', 'body': 'test 1', 'public': 1},
        {'title': 'Private', 'body': 'test 2', 'public': 0},
    ]
    for sample in samples:
        current_search_client.index(
            index='demo-default-v1.0.0',
            body=sample,
            doc_type=mapping_type
        )
def _new_percolator(spec, search_pattern):
    """Create new percolator associated with the new set."""
    if not (spec and search_pattern):
        return
    query = query_string_parser(search_pattern=search_pattern).to_dict()
    percolator_id = 'oaiset-{}'.format(spec)
    for index in current_search.mappings.keys():
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        doc_type = _get_percolator_doc_type(index)
        _create_percolator_mapping(index, doc_type)
        current_search_client.index(index=index, doc_type=doc_type,
                                    id=percolator_id,
                                    body={'query': query})
def _new_percolator(spec, search_pattern):
    """Create new percolator associated with the new set."""
    if not spec or not search_pattern:
        return
    parsed_query = query_string_parser(search_pattern=search_pattern)
    percolator_body = {'query': parsed_query.to_dict()}
    for index in current_search.mappings.keys():
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        percolator_doc_type = _get_percolator_doc_type(index)
        _create_percolator_mapping(index, percolator_doc_type)
        current_search_client.index(
            index=index,
            doc_type=percolator_doc_type,
            id='oaiset-{}'.format(spec),
            body=percolator_body
        )
def index_documents(docs, bulk=False):
    """Index a list of documents into ES."""
    if not bulk:
        # One round-trip per document.
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
        return
    bulk_index(
        client=current_search_client,
        actions=docs,
        index='relationships',
        doc_type='doc',
    )
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if not bulk:
        # Fall back to one request per document.
        for doc in docs:
            current_search_client.index(index='relationships',
                                        doc_type='doc',
                                        body=doc)
        return
    bulk_index(
        client=current_search_client,
        actions=docs,
        index='relationships',
        doc_type='doc',
        raise_on_error=False,
    )
def index_record_dict(record_dict, doc_type, recid, index=None, parent=None):
    """
    Index a given document

    :param record_dict: [dict] A python dictionary containing
    a JSON-like structure which needs to be indexed
    :param doc_type: [string] type of document. "publication" or "datatable"
    :param index: [string] name of the index. If None a default is used
    :param parent: [int] record id of the potential parent
    :return: [dict] Response dictionary
    """
    # Only forward ``parent`` when it is truthy, mirroring the two-branch
    # call pattern with a single call site.
    extra = {'parent': parent} if parent else {}
    return es.index(index=index, doc_type=doc_type, id=recid,
                    body=record_dict, **extra)
def index(self, index_name=None, doc_type=None):
    """Index the workflow record into desired index/doc_type.

    Uses the explicit ``index_name``/``doc_type`` arguments when given,
    otherwise the configured mapping for this workflow's data type.
    Indexing failures are logged, not raised.
    """
    config = current_app.config['WORKFLOWS_UI_DATA_TYPES'].get(
        self["_workflow"]["data_type"])
    if config or (index_name and doc_type):
        try:
            current_search_client.index(
                id=str(self['id']),
                index=index_name or config.get('search_index'),
                doc_type=doc_type or config.get('search_type'),
                body=self.dumps(),
            )
        except TransportError:
            # BUG FIX: ``logger.exception()`` was called with no message,
            # which raises TypeError (msg is required). Log the message
            # via exception() so the traceback is included as well.
            current_app.logger.exception(
                "Problem while indexing workflow object {0}".format(
                    self.model.id))
def test_appoint_profile_from_claimed_signature(small_app):
    """Check the module for the case where claimed signature takes
    everything.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid)

    # Unclaimed signature: seeded with recid "2".
    old_record_id = str(
        PersistentIdentifier.get("literature", 11883).object_uuid)
    old_record = get_es_record_by_uuid(old_record_id)
    old_author_uuid = old_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    old_record['authors'][0]['signature_block'] = "HAGp"
    old_record['authors'][0]['recid'] = "2"
    es.index(index='records-hep', doc_type='hep', id=old_record_id,
             body=old_record)
    es.indices.refresh('records-hep')

    # Claimed signature (curated_relation=True) with recid "314159265".
    record_id = str(
        PersistentIdentifier.get("literature", 1358492).object_uuid)
    record = get_es_record_by_uuid(record_id)
    author_uuid = record['authors'][0]['uuid']

    # Add phonetic block to the record.
    record['authors'][0]['signature_block'] = "HAGp"
    record['authors'][0]['recid'] = "314159265"
    record['authors'][0]['curated_relation'] = True
    es.index(index='records-hep', doc_type='hep', id=record_id, body=record)
    es.indices.refresh('records-hep')

    # Mocked Beard response clusters both signatures together; clustering
    # runs synchronously through the side_effect patch.
    with patch("celery.current_app.send_task",
               return_value=_BeardObject(({
                   "2": [old_author_uuid, author_uuid]
               }, {}))):
        with patch(
                "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                side_effect=update_authors_recid):
            disambiguation_clustering("HAGp")

    # The claimed recid wins for both signatures in the cluster.
    assert Record.get_record(old_record_id)['authors'][0]['recid'] == \
        "314159265"
    assert Record.get_record(record_id)['authors'][0]['recid'] == \
        "314159265"
def index_record_modification(sender, changes):
    """Example handler for indexing record metadata.

    :param sender: signal sender (unused).
    :param changes: iterable of ``(obj, change)`` pairs, where ``change``
        is one of ``'insert'``, ``'update'`` or ``'delete'``.
    """
    for obj, change in changes:
        if isinstance(obj, RecordMetadata):
            if change in ('insert', 'update'):
                current_search_client.index(
                    index='records',
                    doc_type='record',
                    id=obj.id,
                    body=obj.json,
                )
            elif change == 'delete':
                # BUG FIX: the original ``change in ('delete')`` was a
                # *substring* test — ``('delete')`` is the string
                # 'delete', not a one-element tuple, so e.g. 'del' also
                # matched. Use equality instead.
                current_search_client.delete(
                    index='records',
                    doc_type='record',
                    id=obj.id,
                )
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index a list of documents into ES."""
    if bulk:
        bulk_index(
            client=current_search_client,
            actions=docs,
            index='relationships',
            doc_type='doc',
            raise_on_error=False,
            chunk_size=300,  # TODO: Make configurable
            max_chunk_bytes=(30 * 1024 * 1024),  # TODO: Make configurable
        )
        return
    # Non-bulk path: one request per document.
    for doc in docs:
        current_search_client.index(
            index='relationships', doc_type='doc', body=doc)
def set(self, taxonomy_term: TaxonomyTerm, timestamp=None) -> None:
    """
    Save serialized taxonomy into Elasticsearch. It create new or update
    old taxonomy record.

    :param taxonomy_term: Taxonomy term class from flask-taxonomies
    :type taxonomy_term: TaxonomyTerm
    :param timestamp: Datetime class
    :type timestamp: Datetime class
    :return: None
    :rtype: None
    """
    # Only terms that have a parent are serialized and indexed.
    if not taxonomy_term.parent:
        return
    serialized = get_taxonomy_term(code=taxonomy_term.taxonomy.slug,
                                   slug=taxonomy_term.slug,
                                   timestamp=timestamp)
    current_search_client.index(index=self.index,
                                id=taxonomy_term.id,
                                body=serialized)
def create(cls, data, id_=None, index_refresh='false', **kwargs):
    """Create a new record instance and store it in elasticsearch.

    :param data: Dict with the record metadata.
    :param id_: Specify a UUID to use for the new record, instead of
                automatically generated.
    :param index_refresh: If `true` then refresh the affected shards to
        make this operation visible to search, if `wait_for` then wait
        for a refresh to make this operation visible to search, if
        `false` (the default) then do nothing with refreshes. Valid
        choices: 'true', 'false', 'wait_for'
    :returns: A new :class:`Record` instance.
    """
    # The supplied id becomes the record's pid, not an ES-internal id.
    if id_:
        data['pid'] = id_
    record = cls(data, model=None, **kwargs)
    # Run pre create extensions
    for e in cls._extensions:
        e.pre_create(record)
    # Validation is opt-in via app config.
    if current_app.config.get('RERO_ILS_ENABLE_OPERATION_LOG_VALIDATION'):
        # Validate also encodes the data
        # For backward compatibility we pop them here.
        format_checker = kwargs.pop('format_checker', None)
        validator = kwargs.pop('validator', None)
        if '$schema' not in record:
            record['$schema'] = current_jsonschemas.path_to_url(
                cls._schema)
        record._validate(format_checker=format_checker,
                         validator=validator,
                         use_model=False)
    current_search_client.index(index=cls.get_index(record),
                                body=record.dumps(),
                                id=record['pid'],
                                refresh=index_refresh)
    # Run post create extensions
    for e in cls._extensions:
        e.post_create(record)
    return record
def index(self, index_name=None, doc_type=None):
    """Index the workflow record into desired index/doc_type."""
    # Resolve target index/doc_type: explicit arguments win, otherwise
    # the configured mapping for this workflow's data type is used.
    data_type = self["_workflow"]["data_type"]
    config = current_app.config['WORKFLOWS_UI_DATA_TYPES'].get(data_type)
    if not (config or (index_name and doc_type)):
        return
    try:
        current_search_client.index(
            id=str(self['id']),
            index=index_name or config.get('search_index'),
            doc_type=doc_type or config.get('search_type'),
            body=self.dumps(),
        )
    except TransportError as err:
        # Indexing failures are logged, not propagated.
        current_app.logger.exception(err)
        current_app.logger.error(
            "Problem while indexing workflow object {0}".format(
                self.model.id
            )
        )
def test_single_signature_with_no_profile(small_app):
    """Check the module for the case with a single, new signature."""
    from inspirehep.modules.disambiguation.tasks import disambiguation_clustering, update_authors_recid

    record_id = str(PersistentIdentifier.get("literature", 11883).object_uuid)
    record = get_es_record_by_uuid(record_id)
    author_uuid = record["authors"][0]["uuid"]

    # Add phonetic block to the record.
    record["authors"][0]["signature_block"] = "HAGp"
    es.index(index="records-hep", doc_type="hep", id=record_id, body=record)
    es.indices.refresh("records-hep")

    # Mocked Beard response: no matched clusters, one *new* cluster ("0")
    # containing the single signature.
    with patch("celery.current_app.send_task", return_value=_BeardObject(({}, {"0": [author_uuid]}))):
        with patch(
            "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
            side_effect=update_authors_recid
        ):
            disambiguation_clustering("HAGp")

    # A fresh profile is created and its recid ("1") is assigned.
    assert Record.get_record(record_id)["authors"][0]["recid"] == "1"
def orcid_test(mock_user, request):
    """Orcid test fixture.

    Indexes a fixed author record (id 10) into ``records-authors``,
    appends a matching ORCID-linked author to literature record 782466,
    and returns a mocked ORCID API plus the modified record. The indexed
    document is removed again at teardown.
    """
    app = mock_user.app

    def teardown(app):
        # Remove the author document indexed below.
        with app.app_context():
            es.delete(index='records-authors', doc_type='authors', id=10)

    record = {
        "name": {
            "status": "ACTIVE",
            "preferred_name": "Full Name",
            "value": "Full Name"
        },
        "$schema": "http://localhost:5000/schemas/records/authors.json",
        "control_number": "10",
        "self": {"$ref": "http://localhost:5000/api/authors/10"},
        "ids": [{
            "type": "INSPIRE",
            "value": "INSPIRE-0000000"
        }, {
            "type": "ORCID",
            "value": "0000-0001-9412-8627"
        }],
        "self_recid": 10,
        "earliest_date": "2015-09-23"
    }
    request.addfinalizer(lambda: teardown(app))
    with app.app_context():
        es.index(index='records-authors', doc_type='authors', id=10,
                 body=record)
        # Make the document visible to search immediately.
        es.indices.refresh('records-authors')

    # NOTE: ``record`` is rebound — from here on it is the literature
    # DB record, not the author dict indexed above.
    record = get_db_record('literature', 782466)
    record['authors'].append({u'affiliations': [{u'value':
                                                 u'St. Petersburg, INP'}],
                              u'curated_relation': True,
                              u'full_name': u'Full, Name',
                              u'profile': {
                                  u'__url__':
                                  u'http://inspirehep.net/record/00000000'},
                              u'record': {
                                  u'$ref':
                                  u'http://localhost:5000/api/authors/10'}})
    mock_orcid_api = OrcidApiMock(1)
    return mock_orcid_api, record
def index_record_dict(record_dict, doc_type, recid, index=None, parent=None):
    """
    Index a given document

    :param record_dict: [dict] A python dictionary containing
    a JSON-like structure which needs to be indexed
    :param doc_type: [string] type of document. "publication" or "datatable"
    :param index: [string] name of the index. If None a default is used
    :param parent: [int] record id of the potential parent
    :return: [dict] Response dictionary
    """
    # No (truthy) parent: plain index request.
    if not parent:
        return es.index(index=index, doc_type=doc_type, id=recid,
                        body=record_dict)
    # Parent given: index as a child document.
    return es.index(index=index, doc_type=doc_type, id=recid,
                    body=record_dict, parent=parent)
def test_match_signature_with_existing_profile(small_app):
    """Check the module for the case with signatures and existing profile."""
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid)

    old_record_id = str(PersistentIdentifier.get('lit', 11883).object_uuid)
    old_record = get_es_record_by_uuid(old_record_id)
    old_author_uuid = old_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    old_record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=old_record_id,
             body=old_record)
    es.indices.refresh('records-hep')

    record_id = str(PersistentIdentifier.get('lit', 1358492).object_uuid)
    record = get_es_record_by_uuid(record_id)
    author_uuid = record['authors'][0]['uuid']

    # Add phonetic block to the record.
    record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=record_id, body=record)
    es.indices.refresh('records-hep')

    # Mocked Beard response assigns both signatures to the existing
    # profile cluster "1"; clustering runs synchronously via side_effect.
    with patch("celery.current_app.send_task",
               return_value=_BeardObject(({
                   "1": [old_author_uuid, author_uuid]
               }, {}))):
        with patch(
                "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                side_effect=update_authors_recid):
            disambiguation_clustering("HAGp")

    # Both signatures end up with the existing profile's recid.
    assert InspireRecord.get_record(
        old_record_id)['authors'][0]['recid'] == "1"
    assert InspireRecord.get_record(record_id)['authors'][0]['recid'] == "1"
def index_record_modification(sender, changes):
    """Reset the set of processed records for the next session."""
    pending_index = flask.g.get('invenio_search_records_to_index', dict())
    pending_delete = flask.g.get('invenio_search_records_to_delete', set())
    # Index everything staged for indexing, unless it was also staged
    # for deletion in the same session.
    for rec_id, entry in pending_index.items():
        if rec_id in pending_delete:
            continue
        current_search_client.index(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=rec_id,
            body=entry.body,
            version=entry.version,
            version_type='external_gte',
        )
    for rec_id in pending_delete:
        current_search_client.delete(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=rec_id,
        )
    # Clear both staging containers for the next session.
    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
def _new_percolator(spec, search_pattern):
    """Create new percolator associated with the new set."""
    if not (spec and search_pattern):
        return
    query = query_string_parser(search_pattern=search_pattern).to_dict()
    oai_records_index = current_app.config["OAISERVER_RECORD_INDEX"]
    percolator_id = "oaiset-{}".format(spec)
    for index, mapping_path in current_search.mappings.items():
        # Skip indices/mappings not used by OAI-PMH
        if not index.startswith(oai_records_index):
            continue
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        try:
            percolator_doc_type = _get_percolator_doc_type(index)
            _create_percolator_mapping(index, percolator_doc_type,
                                       mapping_path)
            current_search_client.index(
                index=_build_percolator_index_name(index),
                doc_type=percolator_doc_type,
                id=percolator_id,
                body={"query": query},
            )
        except Exception as e:
            # Best-effort: a failing index must not abort the others.
            current_app.logger.warning(e)
def index_record_modification(sender, changes):
    """Reset the set of processed records for the next session."""
    staged = flask.g.get('invenio_search_records_to_index', dict())
    removed = flask.g.get('invenio_search_records_to_delete', set())
    # Index only the staged records that were not also deleted.
    for key in (k for k in staged if k not in removed):
        entry = staged[key]
        current_search_client.index(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=key,
            body=entry.body,
            version=entry.version,
            version_type='external_gte',
        )
    for key in removed:
        current_search_client.delete(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=key,
        )
    # Reset both staging containers.
    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
def test_match_signature_with_existing_profile(small_app):
    """Check the module for the case with signatures and existing profile."""
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid
    )

    old_record_id = str(PersistentIdentifier.get('lit', 11883).object_uuid)
    old_record = get_es_record_by_uuid(old_record_id)
    old_author_uuid = old_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    old_record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=old_record_id,
             body=old_record)
    es.indices.refresh('records-hep')

    record_id = str(PersistentIdentifier.get('lit', 1358492).object_uuid)
    record = get_es_record_by_uuid(record_id)
    author_uuid = record['authors'][0]['uuid']

    # Add phonetic block to the record.
    record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=record_id, body=record)
    es.indices.refresh('records-hep')

    # Mocked Beard response clusters both signatures into existing
    # profile "1"; clustering runs synchronously via side_effect.
    with patch("celery.current_app.send_task", return_value=_BeardObject(
            ({"1": [old_author_uuid, author_uuid]}, {}))):
        with patch("inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                   side_effect=update_authors_recid):
            disambiguation_clustering("HAGp")

    # Both signatures receive the existing profile's recid.
    assert InspireRecord.get_record(old_record_id)['authors'][0]['recid'] == "1"
    assert InspireRecord.get_record(record_id)['authors'][0]['recid'] == "1"
def test_solve_claim_conflicts(small_app):
    """Check the module for the case where at least two claimed signatures
    are assigned to the same cluster.
    """
    from inspirehep.modules.disambiguation.tasks import disambiguation_clustering, update_authors_recid

    # Claimed signature #1.
    glashow_record_id_claimed = str(PersistentIdentifier.get("literature", 4328).object_uuid)
    glashow_record_claimed = get_es_record_by_uuid(glashow_record_id_claimed)
    glashow_record_uuid_claimed = glashow_record_claimed["authors"][0]["uuid"]

    # Add phonetic block to the record.
    glashow_record_claimed["authors"][0]["signature_block"] = "HAGp"
    glashow_record_claimed["authors"][0]["curated_relation"] = True
    glashow_record_claimed["authors"][0]["recid"] = "3"
    es.index(index="records-hep", doc_type="hep", id=glashow_record_id_claimed, body=glashow_record_claimed)
    es.indices.refresh("records-hep")

    # Claimed signature #2.
    higgs_record_id_claimed = str(PersistentIdentifier.get("literature", 1358492).object_uuid)
    higgs_record_claimed = get_es_record_by_uuid(higgs_record_id_claimed)
    higgs_record_uuid_claimed = higgs_record_claimed["authors"][0]["uuid"]

    # Add phonetic block to the record.
    higgs_record_claimed["authors"][0]["signature_block"] = "HAGp"
    higgs_record_claimed["authors"][0]["curated_relation"] = True
    higgs_record_claimed["authors"][0]["recid"] = "4"
    es.index(index="records-hep", doc_type="hep", id=higgs_record_id_claimed, body=higgs_record_claimed)
    es.indices.refresh("records-hep")

    # Not claimed signature.
    higgs_record_id_not_claimed = str(PersistentIdentifier.get("literature", 11883).object_uuid)
    higgs_record_not_claimed = get_es_record_by_uuid(higgs_record_id_not_claimed)
    higgs_record_uuid_not_claimed = higgs_record_not_claimed["authors"][0]["uuid"]

    # Add phonetic block to the record.
    higgs_record_not_claimed["authors"][0]["signature_block"] = "HAGp"
    es.index(index="records-hep", doc_type="hep", id=higgs_record_id_not_claimed, body=higgs_record_not_claimed)
    es.indices.refresh("records-hep")

    # All three signatures land in one cluster keyed "3"; the conflict
    # resolver is patched to hand the unclaimed signature to the "4"
    # (higgs) claimed profile.
    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(
            ({"3": [glashow_record_uuid_claimed, higgs_record_uuid_claimed, higgs_record_uuid_not_claimed]}, {})
        ),
    ):
        with patch(
            "inspirehep.modules.disambiguation.logic._solve_claims_conflict",
            return_value=_ConflictObject({higgs_record_uuid_claimed: [higgs_record_uuid_not_claimed]}),
        ):
            with patch(
                "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                side_effect=update_authors_recid
            ):
                disambiguation_clustering("HAGp")

    # The unclaimed signature inherits the recid of the claimed profile
    # it was assigned to by the conflict resolution.
    assert Record.get_record(higgs_record_id_not_claimed)["authors"][0]["recid"] == "4"
def index_record_modification(sender, changes):
    """Reset the set of processed records for the next session."""
    to_index = flask.g.get('invenio_search_records_to_index', dict())
    to_delete = flask.g.get('invenio_search_records_to_delete', set())
    es_index = current_app.config["RECORDS_REST_DEFAULT_SEARCH_INDEX"]
    # Index staged records, skipping any that were also deleted.
    for rec_id, pending in to_index.items():
        if rec_id in to_delete:
            continue
        current_search_client.index(
            index=es_index,
            doc_type='testrecord-v1.0.0',
            id=rec_id,
            body=pending.body,
            version=pending.version,
            version_type='external_gte',
        )
    for rec_id in to_delete:
        current_search_client.delete(
            index=es_index,
            doc_type='testrecord-v1.0.0',
            id=rec_id,
        )
    # Clear both staging containers for the next session.
    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
def create_from_kwargs(cls, index_name='', disable_persistent_identifier=False, **kwargs):
    """Build a TestRecordMetadata with a populated JSON body.

    Optionally indexes the record into ``index_name`` and mints a
    persistent identifier for it.
    """
    instance = cls()
    updated_kwargs = copy.deepcopy(kwargs)
    # NOTE(review): pops from ``kwargs`` (already deep-copied into
    # ``updated_kwargs``), so a caller-supplied 'id' stays in
    # updated_kwargs untouched; only a *missing* id is generated here.
    if not kwargs.pop('id', None):
        updated_kwargs['id'] = uuid.uuid4()
    json_ = copy.deepcopy(cls.JSON_SKELETON)
    json_.update(kwargs.pop('json', {}))
    # Literature records get a random title if none was supplied.
    if kwargs.get('pid_type', 'lit') == 'lit' and 'titles' not in json_:
        json_.update({
            'titles': [
                {
                    'title': generate_random_string(60)
                }
            ]
        })
    # Either allocate a fresh recid or reserve the supplied one.
    if 'control_number' not in json_:
        json_['control_number'] = get_next_free_recid()
    else:
        reserve_recid(json_['control_number'])
    updated_kwargs['json'] = json_
    instance.record_metadata = super(TestRecordMetadata, cls)\
        .create_from_kwargs(updated_kwargs)
    if index_name:
        # doc_type is derived from the index name suffix (e.g.
        # 'records-hep' -> 'hep').
        instance.es_index_result = es.index(
            index=index_name,
            doc_type=index_name.split('-')[-1],
            body=instance.record_metadata.json,
            params={}
        )
        instance.es_refresh_result = es.indices.refresh(index_name)
    if not disable_persistent_identifier:
        instance.persistent_identifier = TestPersistentIdentifier\
            .create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get('control_number'),
                **kwargs).persistent_identifier
    instance.inspire_record = InspireRecord(instance.record_metadata.json,
                                            model=RecordMetadata)
    return instance
def create_from_kwargs(cls, index_name='', disable_persistent_identifier=False, **kwargs):
    """Build a TestRecordMetadata with a populated JSON body.

    Optionally indexes the record into ``index_name`` and mints a
    persistent identifier for it.
    """
    instance = cls()
    updated_kwargs = copy.deepcopy(kwargs)
    # NOTE(review): pops from ``kwargs`` (already deep-copied into
    # ``updated_kwargs``), so a caller-supplied 'id' stays in
    # updated_kwargs untouched; only a *missing* id is generated here.
    if not kwargs.pop('id', None):
        updated_kwargs['id'] = uuid.uuid4()
    json_ = copy.deepcopy(cls.JSON_SKELETON)
    json_.update(kwargs.pop('json', {}))
    # Literature records get a random title if none was supplied.
    if kwargs.get('pid_type', 'lit') == 'lit' and 'titles' not in json_:
        json_.update({
            'titles': [
                {
                    'title': generate_random_string(60)
                }
            ]
        })
    # Either allocate a fresh recid or reserve the supplied one.
    if 'control_number' not in json_:
        json_['control_number'] = get_next_free_recid()
    else:
        reserve_recid(json_['control_number'])
    updated_kwargs['json'] = json_
    instance.record_metadata = super(TestRecordMetadata, cls)\
        .create_from_kwargs(updated_kwargs)
    if index_name:
        instance.es_index_result = es.index(
            index=index_name,
            body=instance.record_metadata.json,
            params={}
        )
        instance.es_refresh_result = es.indices.refresh(index_name)
    if not disable_persistent_identifier:
        instance.persistent_identifier = TestPersistentIdentifier\
            .create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get('control_number'),
                **kwargs).persistent_identifier
    instance.inspire_record = InspireRecord(instance.record_metadata.json,
                                            model=RecordMetadata)
    return instance
def test_count_phonetic_block_dispatched(small_app):
    """Count if two phonetic blocks were dispatched."""
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_daemon,
    )

    # Check if the queue has three records.
    assert DisambiguationRecord.query.count() == 3

    # Signature #1.
    glashow_record_id = str(
        PersistentIdentifier.get("literature", 4328).object_uuid)
    glashow_record = get_es_record_by_uuid(glashow_record_id)

    # Add phonetic block to the record.
    glashow_record['authors'][0]['signature_block'] = "GLASs"
    es.index(index='records-hep', doc_type='hep',
             id=glashow_record_id, body=glashow_record)
    es.indices.refresh('records-hep')

    # Signature #2.
    higgs_record_id_first = str(
        PersistentIdentifier.get("literature", 1358492).object_uuid)
    higgs_record_first = get_es_record_by_uuid(higgs_record_id_first)

    # Add phonetic block to the record.
    higgs_record_first['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_first, body=higgs_record_first)
    es.indices.refresh('records-hep')

    # Signature #3.
    higgs_record_id_second = str(
        PersistentIdentifier.get("literature", 11883).object_uuid)
    higgs_record_second = get_es_record_by_uuid(higgs_record_id_second)

    # Add phonetic block to the record.
    higgs_record_second['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_second, body=higgs_record_second)
    es.indices.refresh('records-hep')

    # Three queued records but only two *distinct* phonetic blocks
    # ("GLASs" and "HAGp"), so exactly two clustering tasks go out.
    with patch("celery.current_app.send_task") as send_to_clustering:
        disambiguation_daemon()

        assert send_to_clustering.call_count == 2
def create_from_kwargs(cls, index_name="", disable_persistent_identifier=False, **kwargs):
    """Build a TestRecordMetadata with a populated JSON body.

    Optionally indexes the record into ``index_name`` and mints a
    persistent identifier for it.
    """
    instance = cls()
    updated_kwargs = copy.deepcopy(kwargs)
    # NOTE(review): pops from ``kwargs`` (already deep-copied into
    # ``updated_kwargs``), so a caller-supplied "id" stays in
    # updated_kwargs untouched; only a *missing* id is generated here.
    if not kwargs.pop("id", None):
        updated_kwargs["id"] = uuid.uuid4()
    json_ = copy.deepcopy(cls.JSON_SKELETON)
    json_.update(kwargs.pop("json", {}))
    # Literature records get a random title if none was supplied.
    if kwargs.get("pid_type", "lit") == "lit" and "titles" not in json_:
        json_.update({"titles": [{"title": generate_random_string(60)}]})
    # Either allocate a fresh recid or reserve the supplied one.
    if "control_number" not in json_:
        json_["control_number"] = get_next_free_recid()
    else:
        reserve_recid(json_["control_number"])
    updated_kwargs["json"] = json_
    instance.record_metadata = super(
        TestRecordMetadata, cls).create_from_kwargs(updated_kwargs)
    if index_name:
        instance.es_index_result = es.index(
            index=index_name,
            body=instance.record_metadata.json,
            params={})
        instance.es_refresh_result = es.indices.refresh(index_name)
    if not disable_persistent_identifier:
        instance.persistent_identifier = (
            TestPersistentIdentifier.create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get(
                    "control_number"),
                **kwargs).persistent_identifier)
    instance.inspire_record = InspireRecord(instance.record_metadata.json,
                                            model=RecordMetadata)
    return instance
def create_from_kwargs(cls, index=True, has_pid=True, **kwargs):
    """Build a TestRecordMetadata with a populated JSON body.

    Indexes into 'records-hep' when ``index`` is true and mints a
    persistent identifier when ``has_pid`` is true.
    """
    instance = cls()
    updated_kwargs = copy.deepcopy(kwargs)
    # NOTE(review): pops from ``kwargs`` (already deep-copied into
    # ``updated_kwargs``), so a caller-supplied 'id' stays in
    # updated_kwargs untouched; only a *missing* id is generated here.
    if not kwargs.pop('id', None):
        updated_kwargs['id'] = uuid.uuid4()
    json_ = copy.deepcopy(cls.JSON_SKELETON)
    json_.update(kwargs.pop('json', {}))
    if 'titles' not in json_:
        json_.update({
            'titles': [
                {
                    'title': generate_random_string(60)
                }
            ]
        })
    if 'control_number' not in json_:
        # Random multiple of 5 in [5, 45] as a stand-in control number.
        json_['control_number'] = random.randint(1, 9) * 5
    updated_kwargs['json'] = json_
    instance.record_metadata = super(TestRecordMetadata, cls)\
        .create_from_kwargs(updated_kwargs)
    if index:
        instance.es_index_result = es.index(
            index='records-hep',
            doc_type='hep',
            body=instance.record_metadata.json,
            params={}
        )
        instance.es_refresh_result = es.indices.refresh('records-hep')
    if has_pid:
        instance.persistent_identifier = TestPersistentIdentifier\
            .create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get('control_number')
            ).persistent_identifier
    return instance
def update(self):
    """Update any internal representation / index for the acl.

    Writes the ACL's record selector and type into the percolator-style
    document of every index derived from ``self.schemas``, flushing each
    index afterwards even if the write fails.
    """
    body = {
        '__acl_record_selector': self.record_selector,
        '__acl_record_type': self.type
    }
    # BUG FIX: the original guard was
    # ``if logger.isEnabledFor(logging.DEBUG) <= logging.DEBUG:`` —
    # comparing a bool against the int 10 (logging.DEBUG), which is
    # always True, so the debug dump was serialized unconditionally.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('get_material_acls: query %s',
                     json.dumps(body, indent=4, ensure_ascii=False))
    schema_indices = [schema_to_index(x)[0] for x in self.schemas]
    acl_index_names = [self.get_acl_index_name(x) for x in schema_indices]
    for acl_idx_name in acl_index_names:
        try:
            resp = current_search_client.index(
                index=acl_idx_name,
                **add_doc_type(
                    current_app.config['INVENIO_EXPLICIT_ACLS_DOCTYPE_NAME']),
                id=self.id,
                body=body,
                refresh='wait_for'
            )
            # NOTE(review): assert is stripped under ``python -O`` — if
            # this check matters in production, raise explicitly.
            assert resp['result'] in ('created', 'updated')
        finally:
            # Always flush so readers see a consistent index state.
            current_search_client.indices.flush(index=acl_idx_name)
def test_count_phonetic_block_dispatched(small_app):
    """Count if two phonetic blocks were dispatched."""
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_daemon,
    )

    # Check if the queue has three records.
    assert DisambiguationRecord.query.count() == 3

    # Signature #1.
    glashow_record_id = str(PersistentIdentifier.get(
        "literature", 4328).object_uuid)
    glashow_record = get_es_record_by_uuid(glashow_record_id)

    # Add phonetic block to the record.
    glashow_record['authors'][0]['signature_block'] = "GLASs"
    es.index(index='records-hep', doc_type='hep',
             id=glashow_record_id, body=glashow_record)
    es.indices.refresh('records-hep')

    # Signature #2.
    higgs_record_id_first = str(PersistentIdentifier.get(
        "literature", 1358492).object_uuid)
    higgs_record_first = get_es_record_by_uuid(higgs_record_id_first)

    # Add phonetic block to the record.
    higgs_record_first['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_first, body=higgs_record_first)
    es.indices.refresh('records-hep')

    # Signature #3.
    higgs_record_id_second = str(PersistentIdentifier.get(
        "literature", 11883).object_uuid)
    higgs_record_second = get_es_record_by_uuid(higgs_record_id_second)

    # Add phonetic block to the record.
    higgs_record_second['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_second, body=higgs_record_second)
    es.indices.refresh('records-hep')

    # Three queued records but only two *distinct* phonetic blocks
    # ("GLASs" and "HAGp"), so exactly two clustering tasks go out.
    with patch("celery.current_app.send_task") as send_to_clustering:
        disambiguation_daemon()

        assert send_to_clustering.call_count == 2
def index_documents(docs):
    """Index a list of documents into ES."""
    # Issue one index request per document against the fixed target.
    for document in docs:
        current_search_client.index(
            index='relationships',
            doc_type='doc',
            body=document,
        )
def test_solve_claim_conflicts(small_app):
    """Check the module for the case where at least two claimed signatures
    are assigned to the same cluster.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid)

    # Claimed signature #1.
    glashow_record_id_claimed = str(
        PersistentIdentifier.get('lit', 4328).object_uuid)
    glashow_record_claimed = get_es_record_by_uuid(glashow_record_id_claimed)
    glashow_record_uuid_claimed = glashow_record_claimed['authors'][0]['uuid']

    # Add phonetic block to the record.
    glashow_record_claimed['authors'][0]['signature_block'] = "HAGp"
    glashow_record_claimed['authors'][0]['curated_relation'] = True
    glashow_record_claimed['authors'][0]['recid'] = "3"
    es.index(index='records-hep', doc_type='hep',
             id=glashow_record_id_claimed, body=glashow_record_claimed)
    es.indices.refresh('records-hep')

    # Claimed signature #2.
    higgs_record_id_claimed = str(
        PersistentIdentifier.get('lit', 1358492).object_uuid)
    higgs_record_claimed = get_es_record_by_uuid(higgs_record_id_claimed)
    higgs_record_uuid_claimed = higgs_record_claimed['authors'][0]['uuid']

    # Add phonetic block to the record.
    higgs_record_claimed['authors'][0]['signature_block'] = "HAGp"
    higgs_record_claimed['authors'][0]['curated_relation'] = True
    higgs_record_claimed['authors'][0]['recid'] = "4"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_claimed, body=higgs_record_claimed)
    es.indices.refresh('records-hep')

    # Not claimed signature.
    higgs_record_id_not_claimed = str(
        PersistentIdentifier.get('lit', 11883).object_uuid)
    higgs_record_not_claimed = get_es_record_by_uuid(
        higgs_record_id_not_claimed)
    higgs_record_uuid_not_claimed = higgs_record_not_claimed['authors'][0][
        'uuid']

    # Add phonetic block to the record.
    higgs_record_not_claimed['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep',
             id=higgs_record_id_not_claimed, body=higgs_record_not_claimed)
    es.indices.refresh('records-hep')

    # All three signatures land in one cluster keyed "3"; the conflict
    # resolver is patched to hand the unclaimed signature to the "4"
    # (higgs) claimed profile.
    with patch("celery.current_app.send_task",
               return_value=_BeardObject(({
                   "3": [
                       glashow_record_uuid_claimed,
                       higgs_record_uuid_claimed,
                       higgs_record_uuid_not_claimed
                   ]
               }, {}))):
        with patch(
                "inspirehep.modules.disambiguation.logic._solve_claims_conflict",
                return_value=_ConflictObject({
                    higgs_record_uuid_claimed: [higgs_record_uuid_not_claimed]
                })):
            with patch(
                    "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                    side_effect=update_authors_recid):
                disambiguation_clustering("HAGp")

    # The unclaimed signature inherits the recid of the claimed profile
    # it was assigned to by the conflict resolution.
    assert InspireRecord.get_record(
        higgs_record_id_not_claimed)['authors'][0]['recid'] == "4"