def match_existing_release_fuzzy(
        self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]:
    """
    This helper function uses fuzzycat (and elasticsearch) to look for
    existing release entities with similar metadata.

    Returns None if there was no match of any kind, or a single tuple
    (status: str, reason: str, existing: ReleaseEntity) if there was a match.

    Status string is one of the fuzzycat.common.Status, with "strongest
    match" in this sorted order:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    Eg, if there is any EXACT match that is always returned; an AMBIGUOUS
    result is only returned if all the candidate matches were ambiguous.
    """

    # this map is used to establish the priority order of verified matches
    STATUS_SORT = {
        fuzzycat.common.Status.TODO: 0,
        fuzzycat.common.Status.EXACT: 10,
        fuzzycat.common.Status.STRONG: 20,
        fuzzycat.common.Status.WEAK: 30,
        fuzzycat.common.Status.AMBIGUOUS: 40,
        fuzzycat.common.Status.DIFFERENT: 60,
    }

    # TODO: the size here is a first guess; what should it really be?
    candidates = match_release_fuzzy(release, size=10, es=self.es_client)
    if not candidates:
        return None

    release_dict = entity_to_dict(release, api_client=self.api.api_client)
    verified = [(
        fuzzycat.verify.verify(
            release_dict, entity_to_dict(c, api_client=self.api.api_client)),
        c,
    ) for c in candidates]

    # choose the "closest" match
    closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
    if closest[0].status == fuzzycat.common.Status.DIFFERENT:
        return None
    elif closest[0].status == fuzzycat.common.Status.TODO:
        raise NotImplementedError("fuzzycat verify hit a Status.TODO")
    else:
        return (closest[0].status.name, closest[0].reason.value, closest[1])
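# Illustrative sketch, not part of the original module: how the STATUS_SORT
# priority map selects the "strongest" verdict among candidates. Assumes only
# that fuzzycat.common.Status is importable; the values mirror the map above.
def _example_strongest_status() -> None:
    import fuzzycat.common

    STATUS_SORT = {
        fuzzycat.common.Status.TODO: 0,
        fuzzycat.common.Status.EXACT: 10,
        fuzzycat.common.Status.STRONG: 20,
        fuzzycat.common.Status.WEAK: 30,
        fuzzycat.common.Status.AMBIGUOUS: 40,
        fuzzycat.common.Status.DIFFERENT: 60,
    }
    # mixed candidate verdicts: any EXACT always wins the sort
    verdicts = [
        fuzzycat.common.Status.AMBIGUOUS,
        fuzzycat.common.Status.EXACT,
        fuzzycat.common.Status.WEAK,
    ]
    strongest = min(verdicts, key=STATUS_SORT.__getitem__)
    assert strongest == fuzzycat.common.Status.EXACT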
def try_update(self, re):
    """
    When debug is true, write the release entity to stdout instead of the
    database. Note that this can hide schema mismatch bugs.
    """
    if self.debug is True:
        print(json.dumps(entity_to_dict(re, api_client=None)))
        return False

    # lookup existing DOI (don't need to try other ext idents; datacite
    # records are DOI-based)
    existing = None
    try:
        existing = self.api.lookup_release(doi=re.ext_ids.doi)
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise err
        # doesn't exist; proceed with the insert/update
        return True

    # eventually we'll want to support "updates", but for now just skip if
    # entity already exists
    if existing:
        self.counts['exists'] += 1
        return False

    return True
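# Illustrative sketch of the 404-tolerant lookup idiom used in try_update();
# `api` is assumed to be a fatcat_openapi_client DefaultApi instance, and the
# function name is hypothetical.
def _example_lookup_release(api, doi: str):
    import fatcat_openapi_client

    try:
        return api.lookup_release(doi=doi)
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise
        return None  # not found; the caller may create a new entity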
def insert_batch(self, batch):
    print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
    if self.insert_log_file:
        with open(self.insert_log_file, 'a') as f:
            for doc in batch:
                json.dump(entity_to_dict(doc, api_client=None), f)
                f.write('\n')
    self.api.create_release_auto_batch(
        fatcat_openapi_client.ReleaseAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))
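# Illustrative sketch, assuming insert_log_file is newline-delimited JSON
# (one entity per line, as written above): reading a log back for replay or
# inspection. The helper name is hypothetical.
def _example_read_insert_log(path: str) -> list:
    import json

    with open(path, 'r') as f:
        return [json.loads(line) for line in f if line.strip()]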
def push_record(self, record: Dict[str, Any]) -> None:
    """
    Intended to be called by a "pusher" class (which could be pulling from
    JSON file, Kafka, whatever). Input is expected to be an entity in
    JSON-like dict form.

    Returns nothing.
    """
    self.counts["lines"] += 1
    if not record:
        self.counts["skip-null"] += 1
        return

    entity = entity_from_dict(record, self.entity_type, api_client=self.ac)

    if entity.state != "active":
        self.counts["skip-inactive"] += 1
        return

    cleaned = self.clean_entity(copy.deepcopy(entity))
    if entity == cleaned:
        self.counts["skip-clean"] += 1
        return
    else:
        self.counts["cleaned"] += 1

    if self.dry_run_mode:
        entity_dict = entity_to_dict(entity, api_client=self.ac)
        print(json.dumps(entity_dict))
        return

    if entity.ident in self._idents_inflight:
        raise ValueError(
            "Entity already part of in-process update: {}".format(entity.ident))

    updated = self.try_update(cleaned)
    if updated:
        self.counts["updated"] += updated
        self._edit_count += updated
        self._idents_inflight.append(entity.ident)

    if self._edit_count >= self.edit_batch_size:
        self.api.accept_editgroup(self._editgroup_id)
        self._editgroup_id = None
        self._edit_count = 0
        self._idents_inflight = []
    return
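# Illustrative sketch of a minimal "pusher" loop feeding push_record() from a
# JSON-lines file; `cleaner` stands in for whatever worker instance owns
# push_record() and is a hypothetical name.
def _example_push_from_file(cleaner, path: str) -> None:
    import json
    import sys

    with open(path, 'r') as f:
        for line in f:
            if line.strip():
                cleaner.push_record(json.loads(line))
    print(cleaner.counts, file=sys.stderr)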
def test_datacite_conversions(datacite_importer):
    """
    Tests conversion from Datacite JSON to the release entity JSON
    representation. The fixture count (35) is hardcoded for now.
    """
    datacite_importer.debug = True
    for i in range(35):
        src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
        dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
        with open(src, "r") as f:
            re = datacite_importer.parse_record(json.load(f))
        result = entity_to_dict(re)
        with open(dst, "r") as f:
            expected = json.loads(f.read())
        assert result == expected, "output mismatch in {}".format(dst)
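# Hypothetical helper, not part of the test suite: if the entity schema
# changes, the expected fixture files could be regenerated by re-serializing
# parser output like this (assumes the same fixture layout as the test above).
def _example_regen_datacite_fixture(datacite_importer, i: int) -> None:
    import json

    src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
    dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
    with open(src, "r") as f:
        re = datacite_importer.parse_record(json.load(f))
    with open(dst, "w") as f:
        json.dump(entity_to_dict(re), f, indent=4, sort_keys=True)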
def parse_record(self, xml_elem: Any) -> Optional[ReleaseEntity]:
    """
    - title => may contain <i>, <sub>, <sup>, <tt>
    - journal (abbrev?)
    - volume, pages, number (number -> issue)
    - publisher
    - year => for conferences, year of conference, not of publication
    - month
    - crossref (from inproceedings to specific proceedings volume)
    - booktitle => for inproceedings, this is the name of the conference or
      workshop, possibly as an acronym
    - isbn
    """

    dblp_key = xml_elem.get("key")
    if not dblp_key:
        self.counts["skip-empty-key"] += 1
        return False
    dblp_key_type = dblp_key.split("/")[0]

    # dblp_prefix may be used for container lookup
    dblp_prefix = None
    if dblp_key_type in ("journals", "conf"):
        dblp_prefix = "/".join(dblp_key.split("/")[:2])
    elif dblp_key_type in ("series", "reference", "tr", "books"):
        dblp_prefix = "/".join(dblp_key.split("/")[:-1])

    publtype = xml_elem.get("publtype") or None

    dblp_type = xml_elem.name
    if dblp_type not in self.ELEMENT_TYPES:
        self.counts[f"skip-dblp-type:{dblp_type}"] += 1
        return False

    if dblp_key_type in ("homepages", "persons", "dblpnote"):
        self.counts["skip-key-type"] += 1
        return False

    if dblp_key.startswith("journals/corr/"):
        self.counts["skip-arxiv-corr"] += 1
        return False

    title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
    if not title:
        self.counts["skip-title"] += 1
        return False
    if title.endswith("."):
        title = title[:-1]

    release_type = None
    release_stage = "published"
    withdrawn_status = None

    # primary release_type detection: type of XML element, then prefix of
    # key for granularity
    if dblp_type == "article":
        release_type = "article"
        if dblp_key_type == "journals" and publtype != "informal":
            release_type = "article-journal"
        elif dblp_key_type == "tr":
            release_type = "report"
        elif title.startswith("Review:"):
            release_type = "review"
    elif dblp_type == "inproceedings":
        release_type = "paper-conference"
    elif dblp_type == "book":
        release_type = "book"
    elif dblp_type == "incollection":
        # XXX: part vs. chapter?
        release_type = "chapter"
    elif dblp_type == "data":
        release_type = "dataset"
    elif dblp_type in ("mastersthesis", "phdthesis"):
        release_type = "thesis"

    # overrides/extensions of the above
    if publtype == "informal":
        # for conferences, seems to indicate peer-review status
        # for journals, seems to indicate things like book reviews; split out above
        pass
    elif publtype == "encyclopedia":
        release_type = "entry-encyclopedia"
    elif publtype == "edited":
        # XXX: article?
        release_type = "editorial"
    elif publtype == "data":
        release_type = "dataset"
    elif publtype == "software":
        release_type = "software"
    elif publtype == "withdrawn":
        withdrawn_status = "withdrawn"
    elif publtype == "survey":
        # XXX: flag as a review/survey article?
        pass

    # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)

    container_name = None
    booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
    series = clean_str(xml_elem.series and xml_elem.series.text)
    if xml_elem.journal:
        container_name = clean_str(xml_elem.journal.text)

    container_id = None
    if dblp_prefix:
        container_id = self.lookup_dblp_prefix(dblp_prefix)
        # note: we will skip later if prefix wasn't found

    publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
    volume = clean_str(xml_elem.volume and xml_elem.volume.text)
    issue = clean_str(xml_elem.number and xml_elem.number.text)
    pages = clean_str(xml_elem.pages and xml_elem.pages.text)
    release_year_str = clean_str(xml_elem.year and xml_elem.year.text)
    if release_year_str and release_year_str.isdigit():
        release_year: Optional[int] = int(release_year_str)
    else:
        release_year = None
    release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
    isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
    part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)

    # block bogus far-future years/dates
    if release_year is not None and (release_year > (self.this_year + 5)
                                     or release_year < 1000):
        release_month = None
        release_year = None

    contribs = self.dblp_contribs(xml_elem)
    ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
    if isbn:
        ext_ids.isbn13 = isbn
    if ext_ids.doi:
        self.counts["has-doi"] += 1

    # dblp-specific extra
    dblp_extra = dict(type=dblp_type)
    note = clean_str(xml_elem.note and xml_elem.note.text)
    if note and "base-search.net" not in note:
        dblp_extra["note"] = note
    if part_of_key:
        dblp_extra["part_of_key"] = part_of_key

    # generic extra
    extra: Dict[str, Any] = dict()
    if not container_id and container_name:
        extra["container_name"] = container_name

    if series and (dblp_key_type == "series" or dblp_type == "book"):
        extra["series-title"] = series
    elif series:
        dblp_extra["series"] = series

    if booktitle and dblp_key_type == "series":
        extra["container-title"] = booktitle
    elif booktitle and dblp_key_type == "conf":
        extra["event"] = booktitle
    elif booktitle:
        dblp_extra["booktitle"] = booktitle

    if release_year and release_month:
        # TODO: release_month schema migration
        extra["release_month"] = release_month

    if dblp_extra:
        extra["dblp"] = dblp_extra

    re = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type=release_type,
        release_stage=release_stage,
        withdrawn_status=withdrawn_status,
        title=title,
        release_year=release_year,
        # release_date,
        publisher=publisher,
        ext_ids=ext_ids,
        contribs=contribs or None,
        volume=volume,
        issue=issue,
        pages=pages,
        extra=extra or None,
    )

    re = self.biblio_hacks(re)

    if self.dump_json_mode:
        re_dict = entity_to_dict(re, api_client=self.api.api_client)
        re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
        re_dict["_dblp_prefix"] = dblp_prefix
        print(json.dumps(re_dict, sort_keys=True))
        return False

    if not re.container_id:
        self.counts["skip-dblp-container-missing"] += 1
        return False

    return re
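# Illustrative sketch, not part of the module: parse_record() expects a
# bs4/BeautifulSoup element for a single dblp record (as produced by parsing
# dblp.xml with the lxml-backed "xml" parser); the element API used above
# (.get(), .name, .title.stripped_strings) works like this. The XML snippet
# is invented for demonstration.
def _example_dblp_elem() -> None:
    from bs4 import BeautifulSoup

    snippet = """
    <article key="journals/example/Doe21" publtype="informal">
      <title>An Example Title.</title>
      <journal>Example Journal</journal>
      <year>2021</year>
    </article>
    """
    elem = BeautifulSoup(snippet, "xml").find("article")
    assert elem.get("key") == "journals/example/Doe21"
    assert elem.name == "article"
    assert " ".join(elem.title.stripped_strings) == "An Example Title."
    # a configured importer instance would then call: importer.parse_record(elem)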