def doaj_contribs(
        self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
    """
    Convert DOAJ `bibjson.author` entries into release contributors.

    Each entry is expected to look like:

        {
            affiliation (string, optional),
            name (string),
            orcid_id (string, optional)
        }

    Entries without a name are dropped and do not consume an index, so the
    returned contributors are numbered 0..n-1 without gaps.
    """
    contribs = []
    for author in authors:
        raw_name = author.get("name")
        if not raw_name:
            continue

        # only attempt a creator lookup when the ORCID survives cleaning
        orcid = clean_orcid(author.get("orcid_id"))
        creator_id = self.lookup_orcid(orcid) if orcid else None

        contribs.append(
            fatcat_openapi_client.ReleaseContrib(
                raw_name=raw_name,
                role="author",
                # position among the authors kept so far
                index=len(contribs),
                creator_id=creator_id,
                raw_affiliation=clean_str(author.get("affiliation")),
            ))
    return contribs
def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
    """
    Build one release contributor from a dblp `<author>` element.

    In the future, might try to implement creator key-ification and lookup
    here.

    Example rows:

        <author>Michael H. Böhlen</author>
        <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
        <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>

    A trailing all-digit token in the name is removed. When the element
    carries an ORCID that cleans successfully, a creator lookup is attempted;
    if the lookup finds nothing, the ORCID is kept in `extra` instead.
    """
    creator_id = None
    extra = None
    raw_name = clean_str(elem.text)

    # remove number in author name, if present. The name may be None or empty
    # after cleaning (NOTE(review): assumes clean_str can return None, as its
    # use on optional values elsewhere suggests), so only index into the
    # tokens when there are any.
    name_parts = raw_name.split() if raw_name else []
    if name_parts and name_parts[-1].isdigit():
        raw_name = ' '.join(name_parts[:-1])

    if elem.get('orcid'):
        orcid = clean_orcid(elem['orcid'])
        if orcid:
            creator_id = self.lookup_orcid(orcid)
            if not creator_id:
                # no known creator: keep the identifier so it is not lost
                extra = dict(orcid=orcid)

    return fatcat_openapi_client.ReleaseContrib(
        raw_name=raw_name,
        creator_id=creator_id,
        extra=extra,
    )
def test_contributor_list_contains_contributor():
    """Table-driven check of `contributor_list_contains_contributor`."""
    Case = collections.namedtuple("Case", ["contrib_list", "contrib", "want"])
    cases = (
        # an empty list contains nobody
        Case(
            contrib_list=[],
            contrib=fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"),
            want=False,
        ),
    )
    for contrib_list, contrib, want in cases:
        assert contributor_list_contains_contributor(contrib_list, contrib) == want
def do_contribs(obj_list, ctype):
    """
    Turn a list of person dicts into ReleaseContrib objects with role `ctype`.

    `ctype` must be one of "author", "editor" or "translator" (asserted per
    item). Only authors receive an `index`; it is the item's position in
    `obj_list`. The first affiliation becomes `raw_affiliation`, any further
    ones go to `extra['more_affiliations']`.
    """
    contribs = []
    for position, person in enumerate(obj_list):
        creator_id = None
        if 'ORCID' in person:
            # keep only the text after the last '/' before looking it up
            creator_id = self.lookup_orcid(person['ORCID'].rsplit('/', 1)[-1])

        given = person.get('given')
        family = person.get('family')
        # Sorry humans :(
        if given and family:
            raw_name = "{} {}".format(given, family)
        elif family:
            raw_name = family
        else:
            # TODO: can end up empty
            raw_name = person.get('name') or given

        index = position if ctype == "author" else None

        extra = dict()
        raw_affiliation = None
        affiliations = person.get('affiliation')
        if affiliations:
            raw_affiliation = affiliations[0]['name']
            if len(affiliations) > 1:
                # note: affiliation => more_affiliations
                extra['more_affiliations'] = [
                    clean(aff['name']) for aff in affiliations[1:]
                ]

        sequence = person.get('sequence')
        if sequence and sequence != "additional":
            extra['seq'] = clean(sequence)

        assert ctype in ("author", "editor", "translator")
        contribs.append(
            fatcat_openapi_client.ReleaseContrib(
                creator_id=creator_id,
                index=index,
                raw_name=clean(raw_name),
                given_name=clean(given),
                surname=clean(family),
                raw_affiliation=clean(raw_affiliation),
                role=ctype,
                # an empty extra dict is stored as None
                extra=extra or None))
    return contribs
def parse_grobid_json(self, obj):
    """
    Convert a GROBID metadata dict into a ReleaseEntity.

    Reads the keys 'title', 'abstract', 'authors', 'citations', 'date',
    'doi' and 'journal' from `obj`. Returns None when the title is missing
    or shorter than two characters after cleaning.

    A missing or None 'journal' is treated as "no journal information"
    rather than raising.
    """
    if not obj.get('title'):
        return None

    extra_grobid = dict()

    # only keep abstracts of a plausible length
    abstract = obj.get('abstract')
    if abstract and 10 < len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [
            fatcat_openapi_client.ReleaseAbstract(mimetype="text/plain",
                                                  content=clean(abstract))
        ]
    else:
        abstracts = None

    contribs = [
        fatcat_openapi_client.ReleaseContrib(
            index=i,
            raw_name=clean(a['name']),
            given_name=clean(a.get('given_name')),
            surname=clean(a.get('surname')),
            role="author",
            extra=None) for i, a in enumerate(obj.get('authors') or [])
    ]

    refs = []
    for raw in obj.get('citations') or []:
        cite_extra = dict()
        year = None
        if raw.get('date'):
            # best effort: an unparsable date just leaves the year unset
            try:
                year = int(raw['date'].strip()[:4])
            except (ValueError, TypeError, AttributeError):
                year = None
        for key in ('volume', 'url', 'issue', 'publisher'):
            if raw.get(key):
                cite_extra[key] = clean(raw[key])
        if raw.get('authors'):
            cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]

        refs.append(
            fatcat_openapi_client.ReleaseRef(key=clean(raw.get('id')),
                                             year=year,
                                             title=clean(raw.get('title')),
                                             extra=cite_extra or None))

    release_date = None
    release_year = None
    if obj.get('date'):
        # only returns year, ever?
        release_year = int(obj['date'][:4])

    # 'journal' may be absent or None; use an empty dict so the lookups
    # below are uniform
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi']
    if journal.get('name'):
        extra['container_name'] = clean(journal['name'])

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    if extra_grobid:
        extra['grobid'] = extra_grobid
    if self.longtail_oa:
        extra['longtail_oa'] = True
    if not extra:
        extra = None

    title = clean(obj['title'], force_xml=True)
    if not title or len(title) < 2:
        return None

    release = fatcat_openapi_client.ReleaseEntity(
        title=title,
        release_type="article-journal",
        release_date=release_date,
        release_year=release_year,
        contribs=contribs,
        refs=refs,
        publisher=clean(journal.get('publisher')),
        volume=clean(journal.get('volume')),
        issue=clean(journal.get('issue')),
        abstracts=abstracts,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(),
        extra=extra)
    return release
def parse_record(self, a):
    """
    Convert one PubMed article element into a ReleaseEntity.

    `a` is navigated with tag-style access (child attributes, `.find()`,
    `.find_all()`, `.string`); presumably a parsed `<PubmedArticle>`
    element -- confirm against the caller.

    Returns None when the article has no usable title after cleaning.

    Side effects visible here: may create a container through
    `self.create_container` (recording it in `self._issnl_id_map`), emits
    warnings for unmapped languages, and prints skipped dates to stderr.
    """
    medline = a.MedlineCitation
    # PubmedData isn't required by DTD, but seems to always be present
    pubmed = a.PubmedData
    extra = dict()
    extra_pubmed = dict()

    identifiers = pubmed.ArticleIdList
    pmid = medline.PMID.string.strip()
    doi = identifiers.find("ArticleId", IdType="doi")
    if doi and doi.string:
        doi = clean_doi(doi.string)
    else:
        doi = None

    pmcid = identifiers.find("ArticleId", IdType="pmc")
    if pmcid:
        pmcid = clean_pmcid(pmcid.string.strip().upper())

    # the first publication type with a known mapping decides release_type
    release_type = None
    pub_types = []
    for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
        pub_types.append(pub_type.string)
        if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
            release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
            break
    if pub_types:
        extra_pubmed['pub_types'] = pub_types
    if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
        release_type = "retraction"
        retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
        if retraction_of:
            if retraction_of.RefSource:
                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
            if retraction_of.PMID:
                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

    # everything in medline is published
    release_stage = "published"
    if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
        release_stage = "updated"
    if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
        release_stage = "retraction"

    withdrawn_status = None
    if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
        withdrawn_status = "retracted"
    elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
        withdrawn_status = "concern"

    pages = medline.find('MedlinePgn')
    if pages:
        pages = pages.string

    title = medline.Article.ArticleTitle.string  # always present
    if title:
        if title.endswith('.'):
            title = title[:-1]
        # this hides some "special" titles, but the vast majority are
        # translations; translations don't always include the original_title
        if title.startswith('[') and title.endswith(']'):
            title = title[1:-1]
    else:
        # will filter out later
        title = None

    original_title = medline.Article.find("VernacularTitle", recurse=False)
    if original_title:
        original_title = original_title.string or None
        if original_title and original_title.endswith('.'):
            original_title = original_title[:-1]

    # TODO: happening in alpha order, not handling multi-language well.
    language = medline.Article.Language
    if language:
        language = language.string
        if language in ("und", "un"):
            # "undetermined"
            language = None
        else:
            language = LANG_MAP_MARC.get(language)
            if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
                warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))

    ### Journal/Issue Metadata
    # MedlineJournalInfo is always present
    issnl = None
    container_id = None
    container_name = None
    container_extra = dict()
    mji = medline.MedlineJournalInfo
    if mji.find("Country"):
        country_name = mji.Country.string.strip()
        country_code = COUNTRY_NAME_MAP.get(country_name)
        if country_code:
            container_extra['country'] = country_code
        elif country_name:
            container_extra['country_name'] = country_name
    if mji.find("ISSNLinking"):
        issnl = mji.ISSNLinking.string

    journal = medline.Article.Journal
    issnp = journal.find("ISSN", IssnType="Print")
    if issnp:
        container_extra['issnp'] = issnp.string
        if not issnl:
            # Fall back to resolving the linking ISSN from the print ISSN.
            # The result was previously stored under a misspelled name (and
            # the tag, not its text, was passed), so the fallback never
            # took effect.
            issnl = self.issn2issnl(issnp.string)

    if issnl:
        container_id = self.lookup_issnl(issnl)

    # prefer the article-level date, then the journal/issue level ones
    pub_date = medline.Article.find('ArticleDate')
    if not pub_date:
        pub_date = journal.PubDate
    if not pub_date:
        pub_date = journal.JournalIssue.PubDate
    release_date = None
    release_year = None
    if pub_date.Year:
        release_year = int(pub_date.Year.string)
        if pub_date.find("Day") and pub_date.find("Month"):
            try:
                release_date = datetime.date(
                    release_year,
                    MONTH_ABBR_MAP[pub_date.Month.string],
                    int(pub_date.Day.string))
                release_date = release_date.isoformat()
            except ValueError as ve:
                print("bad date, skipping: {}".format(ve), file=sys.stderr)
                release_date = None
    elif pub_date.MedlineDate:
        # free-form date; only a leading four-digit year is used
        medline_date = pub_date.MedlineDate.string.strip()
        if len(medline_date) >= 4 and medline_date[:4].isdigit():
            release_year = int(medline_date[:4])
            if release_year < 1300 or release_year > 2040:
                print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
                release_year = None
        else:
            print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)

    if journal.find("Title"):
        container_name = journal.Title.string

    if (container_id is None and self.create_containers and (issnl is not None)
            and container_name):
        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        ce = fatcat_openapi_client.ContainerEntity(
            name=container_name,
            container_type='journal',
            #NOTE: publisher not included
            issnl=issnl,
            extra=(container_extra or None))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        self._issnl_id_map[issnl] = container_id

    ji = journal.JournalIssue
    volume = None
    if ji.find("Volume"):
        volume = ji.Volume.string
    issue = None
    if ji.find("Issue"):
        issue = ji.Issue.string

    ### Abstracts
    # "All abstracts are in English"
    abstracts = []
    primary_abstract = medline.find("Abstract")
    if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
        # when the first AbstractText carries an NlmCategory, all parts are
        # joined into a single abstract
        joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
        abst = fatcat_openapi_client.ReleaseAbstract(
            content=joined,
            mimetype="text/plain",
            lang="en",
        )
        if abst.content:
            abstracts.append(abst)
    elif primary_abstract:
        for abstract in primary_abstract.find_all("AbstractText"):
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=abstract.get_text().strip(),
                mimetype="text/plain",
                lang="en",
            )
            if abst.content:
                abstracts.append(abst)
            if abstract.find('math'):
                abst = fatcat_openapi_client.ReleaseAbstract(
                    # strip the <AbstractText> tags
                    content=str(abstract)[14:-15],
                    mimetype="application/mathml+xml",
                    lang="en",
                )
                if abst.content:
                    abstracts.append(abst)
    other_abstracts = medline.find_all("OtherAbstract")
    for other in other_abstracts:
        lang = "en"
        if other.get('Language'):
            lang = LANG_MAP_MARC.get(other['Language'])
        abst = fatcat_openapi_client.ReleaseAbstract(
            content=other.AbstractText.get_text().strip(),
            mimetype="text/plain",
            lang=lang,
        )
        if abst.content:
            abstracts.append(abst)
    if not abstracts:
        abstracts = None

    ### Contribs
    contribs = []
    if medline.AuthorList:
        for author in medline.AuthorList.find_all("Author"):
            creator_id = None
            given_name = None
            surname = None
            raw_name = None
            if author.ForeName:
                given_name = author.ForeName.string
            if author.LastName:
                surname = author.LastName.string
            if given_name and surname:
                raw_name = "{} {}".format(given_name, surname)
            elif surname:
                raw_name = surname
            if not raw_name and author.CollectiveName and author.CollectiveName.string:
                raw_name = author.CollectiveName.string
            contrib_extra = dict()
            orcid = author.find("Identifier", Source="ORCID")
            if orcid:
                # needs re-formatting from, eg, "0000000179841889"
                orcid = orcid.string
                if orcid.startswith("http://orcid.org/"):
                    orcid = orcid.replace("http://orcid.org/", "")
                elif orcid.startswith("https://orcid.org/"):
                    orcid = orcid.replace("https://orcid.org/", "")
                elif not '-' in orcid:
                    orcid = "{}-{}-{}-{}".format(
                        orcid[0:4],
                        orcid[4:8],
                        orcid[8:12],
                        orcid[12:16],
                    )
                creator_id = self.lookup_orcid(orcid)
                contrib_extra['orcid'] = orcid
            affiliations = author.find_all("Affiliation")
            raw_affiliation = None
            if affiliations:
                raw_affiliation = affiliations[0].string
                if len(affiliations) > 1:
                    contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
            if author.find("EqualContrib"):
                # TODO: schema for this?
                contrib_extra['equal'] = True
            contribs.append(fatcat_openapi_client.ReleaseContrib(
                raw_name=raw_name,
                given_name=given_name,
                surname=surname,
                role="author",
                raw_affiliation=raw_affiliation,
                creator_id=creator_id,
                extra=contrib_extra,
            ))

        # an incomplete author list is marked with a trailing placeholder
        if medline.AuthorList['CompleteYN'] == 'N':
            contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))

    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i
    if not contribs:
        contribs = None

    ### References
    refs = []
    if pubmed.ReferenceList:
        for ref in pubmed.ReferenceList.find_all('Reference'):
            ref_extra = dict()
            ref_doi = ref.find("ArticleId", IdType="doi")
            if ref_doi:
                ref_doi = clean_doi(ref_doi.string)
            ref_pmid = ref.find("ArticleId", IdType="pubmed")
            if ref_pmid:
                ref_pmid = clean_pmid(ref_pmid.string)
            ref_release_id = None
            if ref_doi:
                ref_extra['doi'] = ref_doi
                if self.lookup_refs:
                    ref_release_id = self.lookup_doi(ref_doi)
            if ref_pmid:
                ref_extra['pmid'] = ref_pmid
                if self.lookup_refs:
                    # note: overrides any result of the DOI lookup above
                    ref_release_id = self.lookup_pmid(ref_pmid)
            ref_raw = ref.Citation
            if ref_raw:
                ref_extra['unstructured'] = ref_raw.string
            if not ref_extra:
                ref_extra = None
            refs.append(fatcat_openapi_client.ReleaseRef(
                target_release_id=ref_release_id,
                extra=ref_extra,
            ))
    if not refs:
        refs = None

    # extra:
    #   translation_of
    #   aliases
    #   container_name
    #   group-title
    #   pubmed: retraction refs
    if extra_pubmed:
        extra['pubmed'] = extra_pubmed
    if not extra:
        extra = None

    title = clean(title)
    if not title:
        return None

    release = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        title=title,
        original_title=clean(original_title),
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        withdrawn_status=withdrawn_status,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            pmid=pmid,
            pmcid=pmcid,
            #isbn13     # never in Article
        ),
        volume=volume,
        issue=issue,
        pages=pages,
        #publisher  # not included?
        language=language,
        #license_slug   # not in MEDLINE
        abstracts=abstracts,
        contribs=contribs,
        refs=refs,
        container_id=container_id,
        extra=extra,
    )
    return release
def parse_record(self, article):
    """
    Convert one JSTOR article element into a ReleaseEntity.

    `article` is navigated with tag-style access (`.front`, `.find()`,
    `.find_all()`, `.string`, item access for attributes); presumably a
    parsed `<article>` element -- confirm against the caller.

    Returns None when no title can be determined. May create a container via
    `self.create_container` (recording it in `self._issnl_id_map`).

    Raises AssertionError when the ISSN has an unexpected length, or when no
    numeric JSTOR id can be found or derived from a `10.2307/` DOI.
    """
    journal_meta = article.front.find("journal-meta")
    article_meta = article.front.find("article-meta")

    extra = dict()
    extra_jstor = dict()

    # may be None for article types missing from the map
    release_type = JSTOR_TYPE_MAP.get(article['article-type'])
    title = article_meta.find("article-title")
    if title and title.string:
        title = title.string.strip()
    elif title and not title.string:
        title = None

    # reviews without their own title are named after the reviewed work;
    # guard against an unmapped release type and a missing <product>
    if (not title and release_type and release_type.startswith('review')
            and article_meta.product and article_meta.product.source):
        title = "Review: {}".format(article_meta.product.source.string)

    if not title:
        return None

    if title.endswith('.'):
        title = title[:-1]

    # bracketed markers in the title override the mapped release type
    if "[Abstract]" in title:
        # TODO: strip the "[Abstract]" bit?
        release_type = "abstract"
    elif "[Editorial" in title:
        release_type = "editorial"
    elif "[Letter" in title:
        release_type = "letter"
    elif "[Poem" in title or "[Photograph" in title:
        release_type = None

    if title.startswith("[") and title.endswith("]"):
        # strip brackets if that is all that is there (eg, translation or non-english)
        title = title[1:-1]

    # JSTOR journal-id
    journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
    if journal_ids:
        extra_jstor['journal_ids'] = journal_ids

    journal_title = journal_meta.find("journal-title").string
    publisher = journal_meta.find("publisher-name").string

    # issnl stays None when the record has no ISSN at all
    issnl = None
    issn = journal_meta.find("issn")
    if issn:
        issn = issn.string
        if len(issn) == 8:
            # insert the missing hyphen
            issn = "{}-{}".format(issn[0:4], issn[4:8])
        else:
            assert len(issn) == 9
        issnl = self.issn2issnl(issn)

    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)

    # create container if it doesn't exist
    if (container_id is None and self.create_containers and (issnl is not None)
            and journal_title):
        ce = fatcat_openapi_client.ContainerEntity(
            issnl=issnl,
            publisher=publisher,
            container_type=self.map_container_type(release_type),
            name=clean(journal_title, force_xml=True))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        self._issnl_id_map[issnl] = container_id

    doi = article_meta.find("article-id", {"pub-id-type": "doi"})
    if doi:
        doi = doi.string.lower().strip()

    jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
    if jstor_id:
        jstor_id = jstor_id.string.strip()
    if not jstor_id and doi:
        # the JSTOR id is the DOI suffix under the 10.2307 prefix
        assert doi.startswith('10.2307/')
        jstor_id = doi.replace('10.2307/', '')
    assert jstor_id and int(jstor_id)

    contribs = []
    cgroup = article_meta.find("contrib-group")
    if cgroup:
        for c in cgroup.find_all("contrib"):
            given = c.find("given-names")
            if given:
                given = clean(given.string)
            surname = c.find("surname")
            if surname:
                surname = clean(surname.string)
            raw_name = c.find("string-name")
            if raw_name:
                raw_name = clean(raw_name.string)

            if not raw_name:
                if given and surname:
                    raw_name = "{} {}".format(given, surname)
                elif surname:
                    raw_name = surname

            role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
            if not role and c.get('contrib-type'):
                sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(
                    c['contrib-type']))
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    role=role,
                    raw_name=raw_name,
                    given_name=given,
                    surname=surname,
                ))

    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i

    release_year = None
    release_date = None
    pub_date = article_meta.find('pub-date')
    if pub_date and pub_date.year:
        release_year = int(pub_date.year.string)
        if pub_date.month and pub_date.day:
            release_date = datetime.date(release_year,
                                         int(pub_date.month.string),
                                         int(pub_date.day.string))
            if release_date.day == 1 and release_date.month == 1:
                # suspect jan 1st dates get set by JSTOR when actual
                # date not known (citation needed), so drop them
                release_date = None

    volume = None
    if article_meta.volume:
        volume = article_meta.volume.string or None

    issue = None
    if article_meta.issue:
        issue = article_meta.issue.string or None

    pages = None
    if article_meta.find("page-range"):
        pages = article_meta.find("page-range").string
    elif article_meta.fpage:
        pages = article_meta.fpage.string

    # language comes from a <custom-meta> entry named "lang", when present
    language = None
    cm = article_meta.find("custom-meta")
    meta_name = cm.find("meta-name") if cm is not None else None
    if meta_name is not None and meta_name.string == "lang":
        language = cm.find("meta-value").string.split()[0]
        language = LANG_MAP_MARC.get(language)
        if not language:
            warnings.warn("MISSING MARC LANG: {}".format(
                cm.find("meta-value").string))

    # JSTOR issue-id
    if article_meta.find('issue-id'):
        issue_id = clean(article_meta.find('issue-id').string)
        if issue_id:
            extra_jstor['issue_id'] = issue_id

    # everything in JSTOR is published
    release_stage = "published"

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   pubmed: retraction refs
    if extra_jstor:
        extra['jstor'] = extra_jstor
    if not extra:
        extra = None

    release = fatcat_openapi_client.ReleaseEntity(
        #work_id
        title=title,
        #original_title
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            jstor=jstor_id,
        ),
        volume=volume,
        issue=issue,
        pages=pages,
        publisher=publisher,
        language=language,
        #license_slug

        # content, mimetype, lang
        #abstracts=abstracts,
        contribs=contribs,

        # key, year, container_name, title, locator
        # extra: volume, authors, issue, publisher, identifiers
        #refs=refs,

        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        container_id=container_id,
        extra=extra,
    )
    return release
def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
    """
    Parses a list of creators into a list of ReleaseContrib objects.

    Set set_index to False, if the index contrib field should be left blank.
    Otherwise the index is the creator's position in `creators`, so skipped
    entries leave gaps. The doi parameter is only used for debugging.

    Malformed identifier data (a None `nameIdentifiers` value, entries that
    are not dicts, a None `nameIdentifier`) is skipped instead of raising,
    and names that are empty after stripping are dropped.
    """
    # Contributors. Many nameIdentifierSchemes, we do not use (yet):
    # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
    # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
    # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
    contribs = []

    # Names, that should be ignored right away.
    name_blacklist = set(('Occdownload Gbif.Org', ))

    for i, c in enumerate(creators):
        if not set_index:
            i = None
        nameType = c.get('nameType', '') or ''
        if nameType in ('', 'Personal'):
            creator_id = None
            # the key may be present with a None value
            for nid in c.get('nameIdentifiers', []) or []:
                if not isinstance(nid, dict):
                    print(
                        "unexpected nameIdentifiers, expected list of dicts, got: {}"
                        .format(nid),
                        file=sys.stderr,
                    )
                    continue
                name_scheme = nid.get('nameIdentifierScheme', '') or ''
                if not name_scheme.lower() == "orcid":
                    continue
                orcid = nid.get('nameIdentifier') or ''
                orcid = orcid.replace('https://orcid.org/', '')
                if not orcid:
                    continue
                creator_id = self.lookup_orcid(orcid)
                # TODO(martin): If creator_id is None, should we create creators?

            # If there are multiple affiliation strings, use the first one.
            affiliations = c.get('affiliation', []) or []
            raw_affiliation = None
            if affiliations:
                raw_affiliation = clean(affiliations[0])

            name = c.get('name')
            given_name = c.get('givenName')
            surname = c.get('familyName')

            if name:
                name = clean(name)
            if not any((name, given_name, surname)):
                continue
            if not name:
                name = "{} {}".format(given_name or '', surname or '').strip()
            if name in name_blacklist:
                continue
            if name.lower() in UNKNOWN_MARKERS_LOWER:
                continue
            # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
            if name:
                name = index_form_to_display_name(name)

            if given_name:
                given_name = clean(given_name)
            if surname:
                surname = clean(surname)

            # Make sure the name did not reduce to nothing (e.g. a
            # whitespace-only name).
            if name:
                name = name.strip()
            if not name:
                continue

            if raw_affiliation == '':
                continue

            extra = None

            # "DataManager", "DataCurator", "ContactPerson", "Distributor",
            # "RegistrationAgency", "Sponsor", "Researcher",
            # "RelatedPerson", "ProjectLeader", "Editor", "Other",
            # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
            # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
            contributorType = c.get('contributorType', '') or ''

            if contributorType:
                extra = {'type': contributorType}

            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=i,
                    raw_name=name,
                    given_name=given_name,
                    surname=surname,
                    role=role,
                    raw_affiliation=raw_affiliation,
                    extra=extra,
                ))
        elif nameType == 'Organizational':
            name = c.get('name', '') or ''
            if name in UNKNOWN_MARKERS:
                continue
            if len(name) < 3:
                continue
            # organizations carry their name in extra rather than raw_name
            extra = {'organization': name}
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
        else:
            print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)

    return contribs
def parse_jalc_persons(raw_persons):
    """
    Turn JALC Dublin Core person elements into ReleaseContrib authors.

    For the most part, JALC DC names are in either japanese or english. The
    two common patterns are a list alternating between the two (in which case
    the names are translations), or all in one language or the other.

    Because dublin core is a projection tossing away a bunch of context, the
    other cases are hard to disambiguate. There are also some cases with Korean
    and other languages mixed in. This crude method doesn't handle everything
    right; it tries to just get the two common patterns correct. Sorry humans!

    When languages are mixed, an english/japanese neighbour pair is merged
    into the english contrib, with the japanese form stored under
    `extra['original_name']`.

    Edge cases for this function:
    - 10.11316/jpsgaiyo.56.1.4.0_757_3 <= all english, some japanese, works
    - 10.14988/pa.2017.0000013531 <= complex, not japanese/english, mixed
    - 10.15036/arerugi.62.1407_1 <= one japanese, two english; fails
    - 10.14988/pa.2017.0000007327 <= ambiguous; translator in jpn/eng
    """

    def _cleaned_text(element, tag):
        # cleaned text of the first `tag` child, or None if there is none
        node = element.find(tag) or None
        return clean(node.string) if node else None

    # first parse out into language-agnostic dics
    persons = []
    for raw in raw_persons:
        name = _cleaned_text(raw, 'name')
        surname = _cleaned_text(raw, 'familyName')
        given_name = _cleaned_text(raw, 'givenName')
        lang = 'ja' if is_cjk(name) else 'en'
        if lang == 'en' and surname and given_name:
            # english names order is flipped
            name = "{} {}".format(given_name, surname)
        rc = fatcat_openapi_client.ReleaseContrib(raw_name=name,
                                                  surname=surname,
                                                  given_name=given_name,
                                                  role="author")
        # add an extra hint field; won't end up in serialized object
        rc._lang = lang
        persons.append(rc)

    if not persons:
        return []

    # all english names, or all japanese names: nothing to merge
    if len({p._lang for p in persons}) == 1:
        return persons

    # for debugging
    #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
    #    print("INTERESTING: {}".format(persons[0]))

    start_lang = persons[0]._lang
    contribs = []
    for person in persons:
        if person._lang == start_lang:
            contribs.append(person)
            continue

        previous = contribs[-1]
        pairing = (person._lang, previous._lang)
        if pairing == ('en', 'ja'):
            eng, jpn = person, previous
        elif pairing == ('ja', 'en'):
            eng, jpn = previous, person
        else:
            # give up and just add as another author
            contribs.append(person)
            continue

        eng.extra = {
            'original_name': {
                'lang': jpn._lang,
                'raw_name': jpn.raw_name,
                'given_name': jpn.given_name,
                'surname': jpn.surname,
            },
        }
        # the merged english contrib takes the slot of the previous entry
        contribs[-1] = eng
    return contribs
def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:
    """
    Convert one arXiv record into a list of releases, one per version.

    Returns None for an empty record or one without an `arXivRaw` element.
    All versions share title, contribs, abstracts and extra metadata; every
    version except the last is flagged `extra["superceded"]`. The DOI, when
    present, is attached to the last version only.
    """
    if not record:
        return None
    metadata = record.arXivRaw
    if not metadata:
        return None

    extra: Dict[str, Any] = dict()
    extra_arxiv: Dict[str, Any] = dict()

    # don't know!
    release_type = "article"

    base_id = metadata.id.string

    doi = None
    if metadata.doi and metadata.doi.string:
        doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
        # reject anything that is not "10.<...>/<non-empty suffix>"
        if doi and (not doi.startswith("10.") or "/" not in doi
                    or not doi.split("/")[1]):
            sys.stderr.write("BOGUS DOI: {}\n".format(doi))
            doi = None

    title = latex_to_text(metadata.title.get_text().replace("\n", " "))
    authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " "))
    contribs = [
        fatcat_openapi_client.ReleaseContrib(index=position,
                                             raw_name=author,
                                             role="author")
        for position, author in enumerate(authors)
    ]

    # comment phrases hinting at a non-english fulltext; first match wins
    language_hints = (
        ("in french", "fr"),
        ("in spanish", "es"),
        ("in portuguese", "pt"),
        ("in hindi", "hi"),
        ("in japanese", "ja"),
        ("in german", "de"),
        ("simplified chinese", "zh"),
        ("in russian", "ru"),
        # more languages?
    )
    lang: Optional[str] = "en"  # the vast majority in english
    if metadata.comments and metadata.comments.get_text():
        comments = metadata.comments.get_text().replace("\n", " ").strip()
        extra_arxiv["comments"] = comments
        lowered_comments = comments.lower()
        for phrase, code in language_hints:
            if phrase in lowered_comments:
                lang = code
                break

    number = None
    journal_ref_tag = metadata.find("journal-ref")
    if journal_ref_tag and journal_ref_tag.get_text():
        journal_ref = journal_ref_tag.get_text().replace("\n", " ").strip()
        extra_arxiv["journal_ref"] = journal_ref
        lowered_ref = journal_ref.lower()
        if "conf." in lowered_ref or "proc." in lowered_ref:
            release_type = "paper-conference"

    report_tag = metadata.find("report-no")
    if report_tag and report_tag.string:
        number = report_tag.string.strip()
        # at least some people plop extra metadata in here. hrmf!
        if "ISSN " in number or "ISBN " in number or len(number.split()) > 2:
            extra_arxiv["report-no"] = number
            number = None
        else:
            release_type = "report"

    acm_tag = metadata.find("acm-class")
    if acm_tag and acm_tag.string:
        extra_arxiv["acm_class"] = acm_tag.string.strip()

    if metadata.categories and metadata.categories.get_text():
        extra_arxiv["categories"] = metadata.categories.get_text().split()

    license_slug = None
    if metadata.license and metadata.license.get_text():
        license_slug = lookup_license_slug(metadata.license.get_text())

    abstracts = None
    if metadata.abstract:
        # TODO: test for this multi-abstract code path
        abstracts = []
        abst = metadata.abstract.get_text().strip()
        orig = None
        if "-----" in abst:
            # a dashed divider separates the english text from an original
            pieces = abst.split("-----")
            abst = pieces[0].strip()
            orig = pieces[1].strip()
        if "$" in abst or "{" in abst:
            # LaTeX markup: also store a plain-text rendering
            mime = "application/x-latex"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    content=latex_to_text(abst),
                    mimetype="text/plain",
                    lang="en"))
        else:
            mime = "text/plain"
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                  mimetype=mime,
                                                  lang="en"))
        if orig:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                      mimetype=mime))
            # indicates that fulltext probably isn't english either
            if lang == "en":
                lang = None

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   arxiv: comments, categories, etc
    extra_arxiv["base_id"] = base_id
    extra["superceded"] = True
    extra["arxiv"] = extra_arxiv

    versions = []
    for version in metadata.find_all("version"):
        version_label = version["version"]
        submitted = datetime.datetime.strptime(
            version.date.string.strip(), "%a, %d %b %Y %H:%M:%S %Z").date()
        # TODO: source_type?
        versions.append(
            ReleaseEntity(
                work_id=None,
                title=title,
                # original_title
                version=version_label,
                release_type=release_type,
                release_stage="submitted",
                release_date=submitted.isoformat(),
                release_year=submitted.year,
                ext_ids=fatcat_openapi_client.ReleaseExtIds(
                    arxiv=base_id + version_label, ),
                number=number,
                language=lang,
                license_slug=license_slug,
                abstracts=abstracts,
                contribs=contribs,
                # shallow copy so the flag can be removed per version
                extra=extra.copy(),
            ))

    # TODO: assert that versions are actually in order?
    assert versions
    latest = versions[-1]
    latest.extra.pop("superceded")

    # only apply DOI to most recent version (HACK)
    if doi:
        latest.ext_ids.doi = doi
        if len(versions) > 1:
            latest.release_stage = "accepted"
    return versions
def parse_record(self, record):
    """
    Convert one arXiv record into a list of releases, one per version.

    Returns None for an empty record or one without an `arXivRaw` element.
    All versions share title, contribs, abstracts and extra metadata; every
    version except the last is flagged `extra['superceded']`. The DOI, when
    present, is attached to the last version only.
    """
    if not record:
        return None
    metadata = record.arXivRaw
    if not metadata:
        return None

    extra = dict()
    extra_arxiv = dict()

    # don't know!
    release_type = "article"

    base_id = metadata.id.string

    doi = None
    if metadata.doi and metadata.doi.string:
        doi = metadata.doi.string.lower().split()[0].strip()
        # reject anything that is not "10.<...>/<non-empty suffix>"
        if (not doi.startswith('10.') or '/' not in doi
                or not doi.split('/')[1]):
            sys.stderr.write("BOGUS DOI: {}\n".format(doi))
            doi = None

    title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
    authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
    contribs = [
        fatcat_openapi_client.ReleaseContrib(index=position,
                                             raw_name=author,
                                             role='author')
        for position, author in enumerate(authors)
    ]

    # comment phrases hinting at a non-english fulltext; first match wins
    language_hints = (
        ('in french', 'fr'),
        ('in spanish', 'es'),
        ('in portuguese', 'pt'),
        ('in hindi', 'hi'),
        ('in japanese', 'ja'),
        ('in german', 'de'),
        ('simplified chinese', 'zh'),
        ('in russian', 'ru'),
        # more languages?
    )
    lang = "en"  # the vast majority in english
    if metadata.comments and metadata.comments.get_text():
        comments = metadata.comments.get_text().replace('\n', ' ').strip()
        extra_arxiv['comments'] = comments
        lowered_comments = comments.lower()
        for phrase, code in language_hints:
            if phrase in lowered_comments:
                lang = code
                break

    number = None
    journal_ref_tag = metadata.find('journal-ref')
    if journal_ref_tag and journal_ref_tag.get_text():
        journal_ref = journal_ref_tag.get_text().replace('\n', ' ').strip()
        extra_arxiv['journal_ref'] = journal_ref
        lowered_ref = journal_ref.lower()
        if "conf." in lowered_ref or "proc." in lowered_ref:
            release_type = "paper-conference"

    report_tag = metadata.find('report-no')
    if report_tag and report_tag.string:
        number = report_tag.string.strip()
        # at least some people plop extra metadata in here. hrmf!
        if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2:
            extra_arxiv['report-no'] = number
            number = None
        else:
            release_type = "report"

    acm_tag = metadata.find('acm-class')
    if acm_tag and acm_tag.string:
        extra_arxiv['acm_class'] = acm_tag.string.strip()

    if metadata.categories and metadata.categories.get_text():
        extra_arxiv['categories'] = metadata.categories.get_text().split()

    license_slug = None
    if metadata.license and metadata.license.get_text():
        license_slug = lookup_license_slug(metadata.license.get_text())

    abstracts = None
    if metadata.abstract:
        # TODO: test for this multi-abstract code path
        abstracts = []
        abst = metadata.abstract.get_text().strip()
        orig = None
        if '-----' in abst:
            # a dashed divider separates the english text from an original
            pieces = abst.split('-----')
            abst = pieces[0].strip()
            orig = pieces[1].strip()
        if '$' in abst or '{' in abst:
            # LaTeX markup: also store a plain-text rendering
            mime = "application/x-latex"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    content=latex_to_text(abst),
                    mimetype="text/plain",
                    lang="en"))
        else:
            mime = "text/plain"
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                  mimetype=mime,
                                                  lang="en"))
        if orig:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                      mimetype=mime))
            # indicates that fulltext probably isn't english either
            if lang == 'en':
                lang = None

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   arxiv: comments, categories, etc
    extra_arxiv['base_id'] = base_id
    extra['superceded'] = True
    extra['arxiv'] = extra_arxiv

    versions = []
    for version in metadata.find_all('version'):
        version_label = version['version']
        submitted = datetime.datetime.strptime(
            version.date.string.strip(), "%a, %d %b %Y %H:%M:%S %Z").date()
        # TODO: source_type?
        versions.append(
            fatcat_openapi_client.ReleaseEntity(
                work_id=None,
                title=title,
                #original_title
                version=version_label,
                release_type=release_type,
                release_stage='submitted',
                release_date=submitted.isoformat(),
                release_year=submitted.year,
                ext_ids=fatcat_openapi_client.ReleaseExtIds(
                    arxiv=base_id + version_label, ),
                number=number,
                language=lang,
                license_slug=license_slug,
                abstracts=abstracts,
                contribs=contribs,
                # shallow copy so the flag can be removed per version
                extra=extra.copy(),
            ))

    # TODO: assert that versions are actually in order?
    assert versions
    latest = versions[-1]
    latest.extra.pop('superceded')

    # only apply DOI to most recent version (HACK)
    if doi:
        latest.ext_ids.doi = doi
        if len(versions) > 1:
            latest.release_stage = "accepted"
    return versions
def parse_datacite_creators(
    self,
    creators: List[Dict[str, Any]],
    role: str = "author",
    set_index: bool = True,
    doi: Optional[str] = None,
) -> List[ReleaseContrib]:
    """
    Parses a list of creators into a list of ReleaseContrib objects.

    Entries whose nameType is empty or "Personal" become regular contribs
    (name, optional ORCID-based creator_id, first affiliation, role).
    Entries with nameType "Organizational" become contribs that only carry
    the organization name in `extra`. Any other nameType is reported on
    stderr and skipped.

    Args:
        creators: raw creator/contributor dicts of a DataCite record.
        role: role set on every personal contrib.
        set_index: set to False, if the index contrib field should be left
            blank (None).
        doi: only used for debugging (printed for unknown name types).

    Returns:
        The contribs that survived filtering, in input order. Skipped are
        entries without any usable name, blocklisted or "unknown" marker
        names, entries whose cleaned affiliation is an empty string, and
        personal contribs already contained in the result. The running
        index only advances when a contrib is actually appended.
    """
    # Contributors. Many nameIdentifierSchemes, we do not use (yet):
    # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
    # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
    # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
    contribs: List[ReleaseContrib] = []

    # Names, that should be ignored right away.
    name_blocklist = {"Occdownload Gbif.Org"}

    # Either a running counter, or None for every contrib if no index is
    # wanted; decided once, since set_index does not change while looping.
    i: Optional[int] = 0 if set_index else None

    for c in creators:
        nameType = c.get("nameType", "") or ""

        if nameType in ("", "Personal"):
            creator_id = None
            for nid in c.get("nameIdentifiers", []) or []:
                if not isinstance(nid, dict):
                    # see: fatcat-workers/issues/44035/
                    print(
                        "unexpected nameIdentifiers, expected list of dicts, got: {}"
                        .format(nid),
                        file=sys.stderr,
                    )
                    continue
                name_scheme = nid.get("nameIdentifierScheme", "") or ""
                if name_scheme.lower() != "orcid":
                    continue
                # The identifier may be a bare ORCID iD or a URL form of
                # it, using either scheme; reduce it to the bare id before
                # the lookup (presumably lookup_orcid expects the bare id).
                orcid = (nid.get("nameIdentifier") or "").strip()
                orcid = orcid.replace("https://orcid.org/", "")
                orcid = orcid.replace("http://orcid.org/", "")
                if not orcid:
                    continue
                creator_id = self.lookup_orcid(orcid)
                # TODO(martin): If creator_id is None, should we create creators?

            # If there are multiple affiliation strings, use the first one.
            affiliations = c.get("affiliation", []) or []
            raw_affiliation = clean_str(affiliations[0]) if affiliations else None

            name = c.get("name")
            given_name = c.get("givenName")
            surname = c.get("familyName")

            if name:
                name = clean_str(name)
            if not any((name, given_name, surname)):
                continue
            if not name:
                # No usable full name; assemble one from the name parts.
                name = "{} {}".format(given_name or "", surname or "").strip()
            if name in name_blocklist:
                continue
            if name.lower() in UNKNOWN_MARKERS_LOWER:
                continue
            # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
            if name:
                name = index_form_to_display_name(name)

            if given_name:
                given_name = clean_str(given_name)
            surname = clean_str(surname)

            # Perform a final assertion that name does not reduce to zero
            # (e.g. whitespace only name).
            name = name.strip() if name else name
            if not name:
                continue

            if raw_affiliation == "":
                continue

            # "DataManager", "DataCurator", "ContactPerson", "Distributor",
            # "RegistrationAgency", "Sponsor", "Researcher",
            # "RelatedPerson", "ProjectLeader", "Editor", "Other",
            # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
            # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
            contributorType = c.get("contributorType", "") or ""
            extra = {"type": contributorType} if contributorType else None

            rc = fatcat_openapi_client.ReleaseContrib(
                creator_id=creator_id,
                index=i,
                raw_name=name,
                given_name=given_name,
                surname=surname,
                role=role,
                raw_affiliation=raw_affiliation,
                extra=extra,
            )
            # Filter out duplicates early. The index only moves on for
            # contribs that are kept, so indices stay contiguous.
            if not contributor_list_contains_contributor(contribs, rc):
                contribs.append(rc)
                if i is not None:
                    i += 1

        elif nameType == "Organizational":
            name = c.get("name", "") or ""
            if name in UNKNOWN_MARKERS:
                continue
            # Very short organization names are dropped.
            if len(name) < 3:
                continue
            extra = {"organization": name}
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
            if i is not None:
                i += 1

        else:
            print("[{}] unknown name type: {}".format(doi, nameType),
                  file=sys.stderr)

    return contribs