def biblio_to_release(biblio: dict) -> ReleaseEntity:
    """
    Helper for close_fuzzy_biblio_matches() et al

    Builds a partial ReleaseEntity from a loosely-structured bibliographic
    dict. All keys are optional. Keys read:

    - authors: iterable of dict-like objects with 'name', 'given_name' and
      'surname' keys
    - author_names: iterable of raw name values (only used when 'authors' is
      missing or empty)
    - first_author: a single raw name (only used when both of the above are
      missing or empty)
    - title, volume, issue, publisher, release_stage, release_type
    - pages, falling back to first_page
    - doi (passed through clean_doi()), pmid, pmcid, arxiv_id
    - journal, falling back to conference: stored as extra['container_name']
    - year: an int, or a str whose first four characters are digits
    - date: a str whose first four characters are digits; only consulted
      when 'year' is missing or falsy

    Returns the populated ReleaseEntity; release_year is only set when a year
    could be extracted.
    """
    # Contributors come from the most detailed source available.
    contribs = []
    if biblio.get('authors'):
        contribs = [
            ReleaseContrib(
                raw_name=a.get('name'),
                given_name=a.get('given_name'),
                surname=a.get('surname'),
            ) for a in biblio['authors']
        ]
    elif biblio.get('author_names'):
        contribs = [ReleaseContrib(raw_name=a) for a in biblio['author_names']]
    elif biblio.get('first_author'):
        contribs = [ReleaseContrib(raw_name=biblio['first_author'])]

    release = ReleaseEntity(
        title=biblio.get("title"),
        # previously the contribs list was built but never attached to the
        # entity, so author information was silently discarded
        contribs=contribs,
        ext_ids=ReleaseExtIds(
            doi=clean_doi(biblio.get("doi")),
            pmid=biblio.get("pmid"),
            pmcid=biblio.get("pmcid"),
            arxiv=biblio.get("arxiv_id"),
        ),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        pages=biblio.get("pages") or biblio.get("first_page"),
        publisher=biblio.get("publisher"),
        release_stage=biblio.get("release_stage"),
        release_type=biblio.get("release_type"),
        extra=dict(),
    )

    # journal takes precedence over conference as the container name
    if biblio.get('journal'):
        release.extra['container_name'] = biblio['journal']
    elif biblio.get('conference'):
        release.extra['container_name'] = biblio['conference']

    # 'year' wins over 'date'; note that an unparseable 'year' does NOT fall
    # through to 'date'
    if biblio.get('year'):
        year = biblio['year']
        if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit():
            release.release_year = int(year[0:4])
        elif isinstance(year, int):
            release.release_year = year
    elif biblio.get('date'):
        date = biblio['date']
        if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit():
            release.release_year = int(date[0:4])
    return release
def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Converts the dict produced by transform_grobid_ref_xml() into a partial
    ReleaseEntity object (for use with fuzzycat).

    Reads the 'authors', 'title', 'volume', 'issue', 'pages', 'doi', 'pmid',
    'pmcid', 'arxiv_id', 'journal' and 'date' keys; all are optional.
    """
    authors = [
        ReleaseContrib(
            raw_name=person.get("name"),
            given_name=person.get("given_name"),
            surname=person.get("surname"),
        ) for person in (ref.get("authors") or [])
    ]
    identifiers = ReleaseExtIds(
        doi=clean_doi(ref.get("doi")),
        pmid=ref.get("pmid"),
        pmcid=ref.get("pmcid"),
        arxiv=ref.get("arxiv_id"),
    )
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=authors,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=identifiers,
    )

    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}

    # only the leading four characters are used, and only if they are digits
    raw_date = ref.get("date")
    if raw_date and len(raw_date) >= 4 and raw_date[:4].isdigit():
        release.release_year = int(raw_date[:4])
    # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release
def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
    """
    Builds a partial ReleaseEntity from a GrobidBiblio reference object.

    Copies authors, title, volume, issue, pages and external identifiers
    across. The journal (if any) is stored as extra["container_name"], and
    release_year is set only when the date is exactly four digits.
    """
    authors = [
        ReleaseContrib(
            raw_name=person.full_name,
            given_name=person.given_name,
            surname=person.surname,
        )
        for person in (ref.authors or [])
    ]
    identifiers = ReleaseExtIds(
        doi=ref.doi,
        pmid=ref.pmid,
        pmcid=ref.pmcid,
        arxiv=ref.arxiv_id,
    )
    release = ReleaseEntity(
        title=ref.title,
        contribs=authors,
        volume=ref.volume,
        issue=ref.issue,
        pages=ref.pages,
        ext_ids=identifiers,
    )

    if ref.journal:
        release.extra = {"container_name": ref.journal}

    # anything other than a bare four-digit year is ignored
    if ref.date and len(ref.date) == 4 and ref.date.isdigit():
        release.release_year = int(ref.date)
    return release
def ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Builds a partial ReleaseEntity from a reference dict.

    Reads the 'authors', 'title', 'volume', 'issue', 'pages', 'doi', 'pmid',
    'pmcid', 'arxiv_id', 'journal' and 'date' keys; all are optional. The
    journal (if any) is stored as extra["container_name"], and release_year is
    set only when the date is exactly four digits.
    """
    authors = [
        ReleaseContrib(
            raw_name=person.get("name"),
            given_name=person.get("given_name"),
            surname=person.get("surname"),
        ) for person in (ref.get("authors") or [])
    ]
    identifiers = ReleaseExtIds(
        doi=ref.get("doi"),
        pmid=ref.get("pmid"),
        pmcid=ref.get("pmcid"),
        arxiv=ref.get("arxiv_id"),
    )
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=authors,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=identifiers,
    )

    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}

    # anything other than a bare four-digit year is ignored
    raw_date = ref.get("date")
    if raw_date and len(raw_date) == 4 and raw_date.isdigit():
        release.release_year = int(raw_date)
    return release
def test_fuzzy_match_different(entity_importer, mocker) -> None:
    """
    Exercises match_existing_release_fuzzy() with match_release_fuzzy patched
    out, checking the "STRONG", "EXACT" and no-match (None) outcomes.
    """
    query = ReleaseEntity(
        title="example title: novel work",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"),
    )
    # same contributor; title differs from the query only in case and a
    # trailing "?"
    similar = ReleaseEntity(
        title="Example Title: Novel Work?",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(),
    )
    unrelated = ReleaseEntity(
        title="entirely different",
        contribs=[ReleaseContrib(raw_name="king tut")],
        ext_ids=ReleaseExtIds(),
    )

    fuzzy_mock = mocker.patch("fatcat_tools.importers.common.match_release_fuzzy")

    # each side_effect list holds the single candidate list handed back by the
    # next call to the patched function
    fuzzy_mock.side_effect = [[unrelated, similar, unrelated, similar]]
    outcome = entity_importer.match_existing_release_fuzzy(query)
    assert (outcome[0], outcome[2]) == ("STRONG", similar)

    # the query entity itself appears among the candidates
    fuzzy_mock.side_effect = [[similar, similar, unrelated, query]]
    outcome = entity_importer.match_existing_release_fuzzy(query)
    assert (outcome[0], outcome[2]) == ("EXACT", query)

    # only an unrelated candidate
    fuzzy_mock.side_effect = [[unrelated]]
    outcome = entity_importer.match_existing_release_fuzzy(query)
    assert outcome is None

    # no candidates at all
    fuzzy_mock.side_effect = [[]]
    outcome = entity_importer.match_existing_release_fuzzy(query)
    assert outcome is None
def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]:
    """
    Turns a list of contributor dicts into ReleaseContrib objects, all with
    role `ctype` (one of "author", "editor", "translator").

    Keys read from each dict: "ORCID", "given", "family", "name",
    "affiliation" and "sequence". Only authors get a positional index.

    NOTE: uses `self` (for lookup_orcid) from the enclosing scope.
    """
    results = []
    for position, person in enumerate(obj_list):
        orcid_creator = None
        if "ORCID" in person:
            # only the last "/"-separated segment of the value is looked up
            orcid_creator = self.lookup_orcid(person["ORCID"].split("/")[-1])

        # Sorry humans :(
        given = person.get("given")
        family = person.get("family")
        if given and family:
            display_name: Optional[str] = "{} {}".format(given, family)
        elif family:
            display_name = family
        else:
            # TODO: can end up empty
            display_name = person.get("name") or given

        extras: Dict[str, Any] = {}
        index: Optional[int] = position if ctype == "author" else None

        # TODO: currently requiring a "name" in all affiliations. Could
        # add ROR support (via identifier) in the near future
        named_affiliations = [
            aff for aff in (person.get("affiliation") or []) if "name" in aff
        ]
        first_affiliation = None
        if named_affiliations:
            primary, *others = named_affiliations
            first_affiliation = primary["name"]
            if others:
                # note: affiliation => more_affiliations
                extras["more_affiliations"] = [clean_str(aff["name"]) for aff in others]

        sequence = person.get("sequence")
        if sequence and sequence != "additional":
            extras["seq"] = clean_str(sequence)

        assert ctype in ("author", "editor", "translator")

        # TODO: what if 'raw_name' is None?
        display_name = clean_str(display_name)
        results.append(
            ReleaseContrib(
                creator_id=orcid_creator,
                index=index,
                raw_name=display_name,
                given_name=clean_str(given),
                surname=clean_str(family),
                raw_affiliation=clean_str(first_affiliation),
                role=ctype,
                extra=extras or None,
            ))
    return results
def update_entity(self, re: ReleaseEntity) -> None:
    """
    Copies this form's values onto an existing release entity, mutating it in
    place.

    The form must already have been validated before this is called.
    """

    def _blank_to_none(value):
        # blank strings from the form are stored as None on the entity
        return None if value == "" else value

    for field_name in RELEASE_SIMPLE_ATTRS:
        value = _blank_to_none(getattr(self, field_name).data)
        setattr(re, field_name, value)

    for field_name in RELEASE_EXTID_ATTRS:
        value = _blank_to_none(getattr(self, field_name).data)
        setattr(re.ext_ids, field_name, value)

    chosen_date = self.release_date.data
    if chosen_date:
        re.release_year = chosen_date.year

    # Contribs are rebuilt from scratch, but rows that point back at an
    # existing contrib (via prev_index) reuse that object so that metadata
    # the form does not expose (eg, affiliation and extra) is preserved.
    # TODO: this may be broken; either way needs tests
    previous = re.contribs.copy() if re.contribs else []
    re.contribs = []
    for row in self.contribs:
        if row.prev_index.data in ("", None):
            contrib = ReleaseContrib(
                role=row.role.data or None,
                raw_name=row.raw_name.data or None,
            )
        else:
            contrib = previous[int(row.prev_index.data)]
            contrib.role = row.role.data or None
            contrib.raw_name = row.raw_name.data or None
        re.contribs.append(contrib)

    description = self.edit_description.data
    if description:
        re.edit_extra = {"description": description}
def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
    """
    Converts JALC person elements into a list of ReleaseContrib objects.

    JALC DC names are mostly either japanese or english. Two patterns are
    common: a list alternating between the two languages (the names are then
    translations of each other), or a list entirely in one language. Dublin
    core is a projection that throws away a lot of context, so other cases
    are hard to disambiguate; some records also mix in Korean and other
    languages.

    This crude method only tries to get the two common patterns right. Sorry
    humans!

    Edge cases for this function:

    - 10.11316/jpsgaiyo.56.1.4.0_757_3: all english, some japanese, works
    - 10.14988/pa.2017.0000013531: complex, not japanese/english, mixed
    - 10.15036/arerugi.62.1407_1: one japanese, two english; fails
    - 10.14988/pa.2017.0000007327: ambiguous; translator in jpn/eng
    """

    def _child_text(node: Any, tag: str) -> Any:
        # cleaned text of the named child element, or None when it is absent
        found = node.find(tag) or None
        if found:
            found = clean_str(found.get_text().replace("\n", " "))
        return found

    # first pass: language-agnostic contribs, each tagged with a language guess
    tagged = []
    for raw in raw_persons:
        name = _child_text(raw, "name")
        surname = _child_text(raw, "familyName")
        given_name = _child_text(raw, "givenName")
        lang = "ja" if is_cjk(name) else "en"
        if lang == "en" and surname and given_name:
            # english names order is flipped
            name = "{} {}".format(given_name, surname)
        rc = ReleaseContrib(
            raw_name=name,
            surname=surname,
            given_name=given_name,
            role="author",
        )
        # add an extra hint field; won't end up in serialized object
        rc._lang = lang
        tagged.append(rc)

    if not tagged:
        return []

    seen_langs = {person._lang for person in tagged}
    if seen_langs == {"en"} or seen_langs == {"ja"}:
        # all english names, or all japanese names
        return tagged

    # for debugging: comparing the number of 'en' and 'ja' entries here
    # surfaces records where the two languages are not evenly paired

    # second pass: fold each name in the "other" language into the entry
    # just before it, treating the pair as translations of one person
    start_lang = tagged[0]._lang
    merged = []
    for person in tagged:
        if person._lang == start_lang:
            merged.append(person)
            continue

        prior = merged[-1]
        lang_pair = (person._lang, prior._lang)
        if lang_pair == ("en", "ja"):
            eng, jpn = person, prior
        elif lang_pair == ("ja", "en"):
            eng, jpn = prior, person
        else:
            # give up and just add as another author
            merged.append(person)
            continue

        # the english entry is kept; the japanese one rides along in extra
        eng.extra = {
            "original_name": {
                "lang": jpn._lang,
                "raw_name": jpn.raw_name,
                "given_name": jpn.given_name,
                "surname": jpn.surname,
            },
        }
        merged[-1] = eng
    return merged