def test_replace_lower_priority(importer_test_data):
    document_cls = current_app_ils.document_record_cls
    eitem_cls = current_app_ils.eitem_record_cls
    eitem_search_cls = current_app_ils.eitem_search_cls

    # setup
    matched_document = document_cls.get_record_by_pid("docid-6")
    current_import_eitem = {
        "urls": [
            {
                "description": "Protected URL",
                "value": "http://protected-cds-ils.ch/",
                "login_required": True,
            },
            {
                "description": "Another open URL",
                "value": "http://cds-ils.ch/",
                "login_required": True,
            },
        ]
    }
    metadata_provider = "springer"
    IS_PROVIDER_PRIORITY_SENSITIVE = True
    EITEM_OPEN_ACCESS = False
    EITEM_URLS_LOGIN_REQUIRED = True

    eitem_importer_preview = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    eitem_importer = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    preview_summary = eitem_importer_preview.preview_import(matched_document)

    # make sure the ebl eitem exists before the import
    eitem_cls.get_record_by_pid("eitemid-6")

    eitem_importer.update_eitems(matched_document)
    summary = eitem_importer.summary()
    current_search.flush_and_refresh(index="*")

    assert len(summary["deleted_eitems"]) == 1

    # check that the replacement is reported in the import summary
    assert summary["deleted_eitems"][0]["pid"] == "eitemid-6"
    assert summary["eitem"]["document_pid"] == "docid-6"

    # check that the replaced eitem was deleted
    with pytest.raises(PIDDeletedError):
        eitem_cls.get_record_by_pid("eitemid-6")

    # check that it was removed from the index
    search = eitem_search_cls().search_by_document_pid("docid-6")
    assert search.count() == 0

    # check that the preview equals the report;
    # output_pid should be the only differing field
    summary["output_pid"] = "preview-doc-pid"
    assert preview_summary == summary

def test_import_equal_priority(importer_test_data):
    document_cls = current_app_ils.document_record_cls
    eitem_cls = current_app_ils.eitem_record_cls

    # setup
    matched_document = document_cls.get_record_by_pid("docid-6A")
    current_import_eitem = {
        "urls": [
            {
                "description": "Protected URL",
                "value": "http://protected-cds-ils.ch/",
                "login_required": True,
            },
            {
                "description": "Another open URL",
                "value": "http://cds-ils.ch/",
                "login_required": True,
            },
        ]
    }
    metadata_provider = "ebl"
    IS_PROVIDER_PRIORITY_SENSITIVE = False
    EITEM_OPEN_ACCESS = False
    EITEM_URLS_LOGIN_REQUIRED = True

    eitem_importer = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    preview_eitem_importer = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    preview_summary = preview_eitem_importer.preview_import(matched_document)

    eitem_importer.update_eitems(matched_document)
    summary = eitem_importer.summary()

    assert len(summary["deleted_eitems"]) == 0

    # check that the new eitem is reported in the import summary
    assert summary["eitem"]["document_pid"] == "docid-6A"

    # check that the existing safari eitem was not deleted
    eitem_cls.get_record_by_pid("eitemid-6A")

    # check that the new record was added
    eitem_cls.get_record_by_pid(summary["eitem"]["pid"])

    # check that the preview equals the report;
    # output_pid should be the only differing field
    summary["output_pid"] = "preview-doc-pid"
    assert preview_summary == summary

def test_do_not_import_lower_priority(importer_test_data):
    document_cls = current_app_ils.document_record_cls
    eitem_cls = current_app_ils.eitem_record_cls

    # setup
    matched_document = document_cls.get_record_by_pid("docid-7")
    current_import_eitem = {
        "urls": [
            {
                "description": "Protected URL",
                "value": "http://protected-cds-ils.ch/",
                "login_required": True,
            },
            {
                "description": "Another open URL",
                "value": "http://cds-ils.ch/",
                "login_required": True,
            },
        ]
    }
    metadata_provider = "ebl"
    IS_PROVIDER_PRIORITY_SENSITIVE = False
    EITEM_OPEN_ACCESS = False
    EITEM_URLS_LOGIN_REQUIRED = True

    eitem_importer = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    preview_eitem_importer = EItemImporter(
        matched_document,
        current_import_eitem,
        metadata_provider,
        IS_PROVIDER_PRIORITY_SENSITIVE,
        EITEM_OPEN_ACCESS,
        EITEM_URLS_LOGIN_REQUIRED,
    )
    preview_summary = preview_eitem_importer.preview_import(matched_document)

    eitem_importer.update_eitems(matched_document)
    current_search.flush_and_refresh(index="*")
    summary = eitem_importer.summary()

    assert len(summary["deleted_eitems"]) == 0

    # check that nothing was imported
    assert summary["eitem"] is None
    assert summary["action"] == "none"

    # check that the higher-priority record was not deleted
    eitem_cls.get_record_by_pid("eitemid-7")

    # check that the preview equals the report;
    # output_pid should be the only differing field
    summary["output_pid"] = "preview-doc-pid"
    assert preview_summary == summary

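# The three tests above pin down how an incoming eitem is reconciled with an
# existing eitem from another provider. The following is a hypothetical
# distillation of the asserted outcomes; it assumes a lower number means a
# higher-priority provider and is only an illustration, not the actual
# EItemImporter logic:
def expected_outcome(incoming_priority, existing_priority):
    """Summarize the outcomes asserted by the three tests above."""
    if incoming_priority < existing_priority:
        # incoming provider outranks the existing one: the old eitem is
        # deleted and replaced (test_replace_lower_priority)
        return "replace"
    if incoming_priority == existing_priority:
        # equal rank: both eitems are kept (test_import_equal_priority)
        return "add"
    # the existing eitem outranks the incoming one: nothing is imported
    # (test_do_not_import_lower_priority)
    return "none"
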
class Importer(object):
    """Importer class."""

    UPDATE_DOCUMENT_FIELDS = ("identifiers",)
    IS_PROVIDER_PRIORITY_SENSITIVE = False
    EITEM_OPEN_ACCESS = True
    EITEM_URLS_LOGIN_REQUIRED = True

    HELPER_METADATA_FIELDS = (
        "_eitem",
        "agency_code",
        "_serial",
        "provider_recid",
    )

    def __init__(self, json_data, metadata_provider):
        """Constructor."""
        self.json_data = json_data
        self.metadata_provider = metadata_provider
        priority = current_app.config["CDS_ILS_IMPORTER_PROVIDERS"][
            metadata_provider
        ]["priority"]
        self.document_importer = DocumentImporter(
            json_data,
            self.HELPER_METADATA_FIELDS,
            metadata_provider,
            self.UPDATE_DOCUMENT_FIELDS,
        )
        self.eitem_importer = EItemImporter(
            json_data,
            metadata_provider,
            priority,
            self.IS_PROVIDER_PRIORITY_SENSITIVE,
            self.EITEM_OPEN_ACCESS,
            self.EITEM_URLS_LOGIN_REQUIRED,
        )
        series_json = json_data.get("_serial", None)
        self.series_importer = SeriesImporter(series_json, metadata_provider)

        self.ambiguous_matches = []
        self.created = None
        self.updated = None
        self.series_list = []
        self.fuzzy_matches = []

    def _validate_provider(self):
        """Check that the chosen provider matches the import data."""
        assert (
            self.json_data["agency_code"]
            == current_app.config["CDS_ILS_IMPORTER_PROVIDERS"][
                self.metadata_provider
            ]["agency_code"]
        )

    def _match_document(self):
        """Search the catalogue for an existing document."""
        document_class = current_app_ils.document_record_cls
        matching_pids = self.document_importer.search_for_matching_documents()
        if len(matching_pids) == 1:
            return document_class.get_record_by_pid(matching_pids[0])
        self.ambiguous_matches = matching_pids
        fuzzy_results = self.document_importer.fuzzy_match_documents()
        self.fuzzy_matches = [x.pid for x in fuzzy_results]

    def import_summary(self):
        """Provide the import summary."""
        return {
            "created": self.created,
            "updated": self.updated,
            "ambiguous_documents": self.ambiguous_matches,
            "fuzzy": self.fuzzy_matches,
            "series": self.series_list,
            "created_eitem": self.eitem_importer.created,
            "updated_eitem": self.eitem_importer.updated,
            "deleted_eitem_list": self.eitem_importer.deleted_list,
            "ambiguous_eitem_list": self.eitem_importer.ambiguous_list,
        }

    def update_records(self, matched_document):
        """Update document, eitem and series records."""
        self.document_importer.update_document(matched_document)
        self.eitem_importer.update_eitems(matched_document)
        self.series_list = self.series_importer.import_series(matched_document)
        self.updated = matched_document

    def delete_records(self, matched_document):
        """Delete eitem records."""
        self.eitem_importer.delete_eitems(matched_document)
        if self.eitem_importer.deleted_list:
            self.updated = matched_document

    def index_all_records(self):
        """Index imported records."""
        document_indexer = current_app_ils.document_indexer
        series_indexer = current_app_ils.series_indexer
        eitem_indexer = current_app_ils.eitem_indexer

        eitem = self.eitem_importer.updated or self.eitem_importer.created
        if eitem:
            eitem_indexer.index(eitem)
        document_indexer.index(self.created or self.updated)
        for series in self.series_list:
            series_indexer.index(series)

    def import_record(self):
        """Import the record."""
        self._validate_provider()

        # exact match found: update the existing records
        matched_document = self._match_document()
        if matched_document:
            self.update_records(matched_document)
            self.index_all_records()
            return self.import_summary()

        # multiple or fuzzy matches found: do not create a new document;
        # manual intervention is required to avoid duplicates
        if self.ambiguous_matches or self.fuzzy_matches:
            return self.import_summary()

        document = self.document_importer.create_document()
        if document:
            self.eitem_importer.create_eitem(document)
            self.created = document
            self.series_list = self.series_importer.import_series(document)
            self.index_all_records()
        return self.import_summary()

    def delete_record(self):
        """Delete the eitems of the record."""
        self._validate_provider()

        # exact match found: delete its eitems
        matched_document = self._match_document()
        if matched_document:
            self.delete_records(matched_document)
        return self.import_summary()

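# A minimal usage sketch of the Importer above. It must run inside the
# application context so that current_app.config resolves; the payload values
# and the "springer" provider key are hypothetical, while the field names
# follow HELPER_METADATA_FIELDS:
json_data = {
    "agency_code": "DE-He213",  # hypothetical; must match the provider config
    "provider_recid": "123456",  # hypothetical provider record id
    "_eitem": {"urls": []},
    "_serial": None,
}
importer = Importer(json_data, "springer")
summary = importer.import_record()
# on success exactly one of "created"/"updated" is set; non-empty
# "ambiguous_documents" or "fuzzy" lists signal that manual review is needed
print(summary["created"], summary["ambiguous_documents"], summary["fuzzy"])
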
class Importer(object):
    """Importer class."""

    UPDATE_DOCUMENT_FIELDS = ("identifiers", "alternative_identifiers")
    IS_PROVIDER_PRIORITY_SENSITIVE = False
    EITEM_OPEN_ACCESS = True
    EITEM_URLS_LOGIN_REQUIRED = True

    HELPER_METADATA_FIELDS = (
        "_eitem",
        "agency_code",
        "_serial",
        "provider_recid",
        "_migration",
    )

    def __init__(self, json_data, metadata_provider):
        """Constructor."""
        self.json_data = json_data
        self.metadata_provider = metadata_provider
        eitem_json_data = self._extract_eitems_json()

        document_importer_class = self.get_document_importer(metadata_provider)
        self.document_importer = document_importer_class(
            json_data,
            self.HELPER_METADATA_FIELDS,
            metadata_provider,
            self.UPDATE_DOCUMENT_FIELDS,
        )
        self.eitem_importer = EItemImporter(
            json_data,
            eitem_json_data,
            metadata_provider,
            self.IS_PROVIDER_PRIORITY_SENSITIVE,
            self.EITEM_OPEN_ACCESS,
            self.EITEM_URLS_LOGIN_REQUIRED,
        )
        series_json = json_data.get("_serial", None)
        self.series_importer = SeriesImporter(series_json, metadata_provider)

    def get_document_importer(self, provider, default=DocumentImporter):
        """Determine which document importer to use."""
        try:
            return pkg_resources.load_entry_point(
                "cds-ils", "cds_ils.document_importers", provider
            )
        except Exception:
            return default

    def _validate_provider(self):
        """Check that the chosen provider matches the import data."""
        agency_code = self.json_data.get("agency_code")
        config_agency_code = current_app.config["CDS_ILS_IMPORTER_PROVIDERS"][
            self.metadata_provider
        ]["agency_code"]
        if not agency_code:
            raise UnknownProvider
        if agency_code != config_agency_code:
            raise InvalidProvider

    def _extract_eitems_json(self):
        """Extract the eitems JSON from the pre-processed JSON."""
        return self.json_data["_eitem"]

    def _match_document(self):
        """Search the catalogue for an existing document."""
        document_importer = self.document_importer
        not_validated_matches = (
            document_importer.search_for_matching_documents()
        )
        exact_match, partial_matching_pids = (
            document_importer.validate_found_matches(not_validated_matches)
        )
        return exact_match, partial_matching_pids

    def delete_eitem(self, matched_document):
        """Delete eitem records."""
        self.eitem_importer.delete_eitems(matched_document)
        return self.eitem_importer.summary()

    def index_records(self, document, eitem, series_list):
        """Index imported records."""
        # use the general indexer instead of the type-dedicated classes
        # to avoid version mismatches on references
        record_indexer = RecordIndexer()
        document_class = current_app_ils.document_record_cls
        series_class = current_app_ils.series_record_cls
        eitem_class = current_app_ils.eitem_record_cls

        if eitem["output_pid"]:
            eitem = eitem_class.get_record_by_pid(eitem["output_pid"])
            record_indexer.index(eitem)

        document = document_class.get_record_by_pid(document["pid"])
        record_indexer.index(document)

        for series in series_list:
            series_record = series_class.get_record_by_pid(
                series["series_record"]["pid"]
            )
            record_indexer.index(series_record)

    def find_partial_matches(self, pids_list=None, exact_match=None):
        """Get all partial matches."""
        if pids_list is None:
            pids_list = []
        # ambiguous = matching failed
        # (inconsistent identifier/title pairs, duplicates etc.)
        ambiguous_matches = [
            {"pid": match, "type": "ambiguous"} for match in pids_list
        ]
        # fuzzy = matching similar titles and authors to spot typos
        try:
            fuzzy_results = self.document_importer.fuzzy_match_documents()
            fuzzy_matches = [
                {"pid": match.pid, "type": "similar"}
                for match in fuzzy_results
                if match.pid != exact_match
            ]
        except TransportError:
            raise SimilarityMatchUnavailable
        return fuzzy_matches + ambiguous_matches

    def update_exact_match(self, exact_match):
        """Update the exactly matched record."""
        document_class = current_app_ils.document_record_cls
        matched_document = document_class.get_record_by_pid(exact_match)
        self.document_importer.update_document(matched_document)
        self.eitem_importer.update_eitems(matched_document)
        eitem = self.eitem_importer.summary()
        series = self.series_importer.import_series(matched_document)
        return matched_document, eitem, series

    def import_record(self):
        """Import the record."""
        document, eitem, series = None, None, None
        action = None

        self._validate_provider()

        exact_match, partial_matches = self._match_document()

        # multiple or fuzzy matches found: do not create a new document;
        # manual intervention is required to avoid duplicates
        partial_matches = self.find_partial_matches(
            partial_matches, exact_match
        )

        # exact match found: update the existing records
        if exact_match:
            document, eitem, series = self.update_exact_match(exact_match)
            self.index_records(document, eitem, series)
            return self.report(
                document=document,
                action="update",
                partial_matches=partial_matches,
                eitem=eitem,
                series=series,
            )

        document = self.document_importer.create_document()
        if document:
            action = "create"
            self.eitem_importer.create_eitem(document)
            eitem = self.eitem_importer.summary()
            series = self.series_importer.import_series(document)
            self.index_records(document, eitem, series)

        return self.report(
            document=document,
            action=action,
            partial_matches=partial_matches,
            eitem=eitem,
            series=series,
        )

    def delete_record(self):
        """Delete the eitems of the record."""
        document_indexer = current_app_ils.document_indexer
        series_class = current_app_ils.series_record_cls
        document_class = current_app_ils.document_record_cls

        self._validate_provider()

        exact_match, partial_matches = self._match_document()
        partial_matches = self.find_partial_matches(
            partial_matches, exact_match
        )

        if exact_match:
            matched_document = document_class.get_record_by_pid(exact_match)
            eitem = self.delete_eitem(matched_document)
            db.session.commit()
            current_search.flush_and_refresh(index="*")

            document_has_only_serial_relations = (
                len(matched_document.relations.keys())
                and "serial" in matched_document.relations.keys()
            )

            if (
                not matched_document.has_references()
                or document_has_only_serial_relations
            ):
                # remove serial relations
                rr = RecordRelationsParentChild()
                serial_relations = matched_document.relations.get("serial", [])
                relation_type = Relation.get_relation_by_name("serial")
                for relation in serial_relations:
                    serial = series_class.get_record_by_pid(
                        relation["pid_value"]
                    )
                    rr.remove(serial, matched_document, relation_type)

                pid = matched_document.pid

                # will fail if any relations / references are still present
                matched_document.delete()

                # mark all PIDs as DELETED
                all_pids = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == pid.object_type,
                    PersistentIdentifier.object_uuid == pid.object_uuid,
                ).all()
                for rec_pid in all_pids:
                    if not rec_pid.is_deleted():
                        rec_pid.delete()

                db.session.commit()
                document_indexer.delete(matched_document)

            return self.report(
                document=matched_document,
                action="delete",
                partial_matches=partial_matches,
                eitem=eitem,
            )
        return self.report(partial_matches=partial_matches)

    def report(self, document=None, action="none", partial_matches=None,
               eitem=None, series=None):
        """Generate the import report."""
        doc_json = {}
        doc_pid = None
        if document:
            doc_json = document.dumps()
            doc_pid = document["pid"]
        return {
            "output_pid": doc_pid,
            "action": action,
            "partial_matches": partial_matches,
            "eitem": eitem,
            "series": series,
            "raw_json": self.json_data,
            "document_json": doc_json,
        }

    def preview_delete(self):
        """Preview deleting a record."""
        document_class = current_app_ils.document_record_cls
        document, eitem = None, None
        self._validate_provider()
        action = "none"

        # exact match found: preview the deletion
        exact_match, partial_matches = self._match_document()
        partial_matches = self.find_partial_matches(
            partial_matches, exact_match
        )

        if exact_match:
            document = document_class.get_record_by_pid(exact_match)
            eitem = self.eitem_importer.preview_delete(document)
            self.preview_delete_document(document)
            action = "delete"

        return self.report(
            document=document,
            action=action,
            partial_matches=partial_matches,
            eitem=eitem,
        )

    def preview_import(self):
        """Preview the record import."""
        document_class = current_app_ils.document_record_cls
        self._validate_provider()

        exact_match, partial_matches = self._match_document()

        # multiple or fuzzy matches found: do not create a new document;
        # manual intervention is required to avoid duplicates
        partial_matches = self.find_partial_matches(
            partial_matches, exact_match
        )

        if exact_match:
            document = document_class.get_record_by_pid(exact_match)
            document = self.document_importer.preview_document_update(document)
            action = "update"
        else:
            document = self.document_importer.preview_document_import()
            action = "create"

        eitem = self.eitem_importer.preview_import(document)
        series = self.series_importer.preview_import_series()

        if partial_matches:
            action = "error"

        return self.report(
            document=document,
            action=action,
            eitem=eitem,
            series=series,
            partial_matches=partial_matches,
        )

    def preview_delete_document(self, document):
        """Check that the Document record could be deleted."""
        loan_search_res = document.search_loan_references()
        item_search_res = document.search_item_references()
        req_search_res = document.search_doc_req_references()
        orders_refs_search = document.search_order_references()
        brw_req_refs_search = document.search_brw_req_references()

        if loan_search_res.count():
            raise DocumentHasReferencesError(
                document=document,
                ref_type="Loan",
                refs=loan_search_res,
            )
        if item_search_res.count():
            raise DocumentHasReferencesError(
                document=document,
                ref_type="Item",
                refs=item_search_res,
            )
        if req_search_res.count():
            raise DocumentHasReferencesError(
                document=document,
                ref_type="DocumentRequest",
                refs=req_search_res,
            )
        if orders_refs_search.count():
            raise DocumentHasReferencesError(
                document=document,
                ref_type="AcquisitionOrder",
                refs=orders_refs_search,
            )
        if brw_req_refs_search.count():
            raise DocumentHasReferencesError(
                document=document,
                ref_type="BorrowingRequest",
                refs=brw_req_refs_search,
            )

        related_refs = set()
        for _, related_objects in document.relations.items():
            for obj in related_objects:
                if obj["record_metadata"].get("mode_of_issuance") != "SERIAL":
                    related_refs.add("{pid_value}:{pid_type}".format(**obj))
        if related_refs:
            raise RecordHasReferencesError(
                record_type=document.__class__.__name__,
                record_id=document["pid"],
                ref_type="related",
                ref_ids=sorted(ref for ref in related_refs),
            )
