def parse_record(self, obj):
    """
    Map a datacite JSON document to a ReleaseEntity.

    Args:
        obj: a decoded datacite record; expected to be a dict carrying an
            "attributes" dict.

    Returns:
        A fatcat_openapi_client.ReleaseEntity, or a falsy value when the
        record is skipped: None for malformed input or a missing/non-ascii
        DOI, False for a missing or empty title.

    Side effects: may create a container entity via self.create_container
    when the record names an ISSN that resolves to an ISSN-L without a known
    container; diagnostics are printed (mostly to stderr).
    """
    if not obj or not isinstance(obj, dict):
        return None
    if 'attributes' not in obj:
        return None

    attributes = obj['attributes']
    if not isinstance(attributes, dict):
        # E.g. "attributes": null -- nothing to map.
        return None

    # The "doi" key may be present with an explicit null (or non-string)
    # value; treat that like a missing DOI instead of failing on .lower().
    raw_doi = attributes.get('doi')
    doi = None
    if raw_doi and isinstance(raw_doi, str):
        doi = clean_doi(raw_doi.lower())
    if not doi:
        print('skipping record without a DOI', file=sys.stderr)
        return None

    if not isascii(doi):
        print('[{}] skipping non-ascii doi for now'.format(doi))
        return None

    def first_container_title(value):
        # The container title is sometimes a list; use the first entry and
        # only complain when there is actually more than one.
        if isinstance(value, list):
            if len(value) > 1:
                print('[{}] too many container titles: {}'.format(
                    doi, len(value)))
            return value[0] if value else None
        return value

    creators = attributes.get('creators', []) or []
    contributors = attributes.get('contributors', []) or []  # Much fewer than creators.

    contribs = self.parse_datacite_creators(
        creators, doi=doi) + self.parse_datacite_creators(
            contributors, role=None, set_index=False, doi=doi)

    # Title, may come with "attributes.titles[].titleType", like
    # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
    titles = attributes.get('titles', []) or []
    title, original_language_title, subtitle = parse_datacite_titles(titles)

    if title is None:
        print('[{}] skipping record w/o title: {}'.format(doi, obj),
              file=sys.stderr)
        return False

    title = clean(title)
    if not title:
        print('[{}] skipping record w/o title: {}'.format(doi, obj),
              file=sys.stderr)
        return False

    subtitle = clean(subtitle) if subtitle else None

    # Dates. A few internal dates (registered, created, updated) and
    # published (0..2554). We try to work with typed date list, in
    # "attributes.dates[].dateType", values: "Accepted", "Available"
    # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
    # "Updated", "Valid".
    release_date, release_month, release_year = parse_datacite_dates(
        attributes.get('dates', []) or [])

    # Some records do not use the "dates" field (e.g. micropub), but:
    # "attributes.published" or "attributes.publicationYear"
    if not any((release_date, release_month, release_year)):
        release_date, release_month, release_year = parse_single_date(
            attributes.get('publicationYear'))
        if not any((release_date, release_month, release_year)):
            release_date, release_month, release_year = parse_single_date(
                attributes.get('published'))

    if not any((release_date, release_month, release_year)):
        print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)

    # Start with clear stages, e.g. published. TODO(martin): we could
    # probably infer a bit more from the relations, e.g.
    # "IsPreviousVersionOf" or "IsNewVersionOf".
    release_stage = 'published'

    # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
    # we might want something else than 'published'. See also:
    # https://support.datacite.org/docs/doi-states.

    # Publisher. A few NA values. A few bogus values.
    publisher = attributes.get('publisher')

    if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
        publisher = None
        release_stage = None
    if publisher is not None and len(publisher) > 80:
        # Arbitrary magic value max length. TODO(martin): better heuristic,
        # but factored out; first we have to log misses. Example:
        # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
        # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
        # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
        # werden"
        publisher = None

    if publisher:
        publisher = clean(publisher)

    # Container. For the moment, only ISSN as container.
    container_id = None
    container_name = None

    container = attributes.get('container', {}) or {}
    if container.get('type') in CONTAINER_TYPE_MAP.keys():
        container_type = CONTAINER_TYPE_MAP.get(container['type'])
        if container.get('identifier') and container.get(
                'identifierType') == 'ISSN':
            issn = container.get('identifier')
            if len(issn) == 8:
                issn = issn[:4] + "-" + issn[4:]
            issnl = self.issn2issnl(issn)
            # NOTE(review): the nesting of this if/else was reconstructed
            # from a whitespace-collapsed source -- confirm against history.
            if issnl is not None:
                container_id = self.lookup_issnl(issnl)

                if container_id is None and container.get('title'):
                    container_name = first_container_title(
                        container.get('title'))
                    assert isinstance(container_name, str)
                    ce = fatcat_openapi_client.ContainerEntity(
                        issnl=issnl,
                        container_type=container_type,
                        name=container_name,
                    )
                    ce_edit = self.create_container(ce)
                    container_id = ce_edit.ident
                    # Remember the new container for later lookups.
                    self._issnl_id_map[issnl] = container_id
            else:
                # TODO(martin): factor this out into a testable function.
                # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
                container_name = first_container_title(
                    container.get('title'))

    # Exception: https://www.micropublication.org/, see: !MR24.
    if container_id is None and container_name is None:
        if publisher and publisher.lower().startswith('micropublication'):
            container_name = publisher

    # Volume and issue.
    volume = container.get('volume')
    issue = container.get('issue')

    if volume:
        volume = clean(volume)

    if issue:
        issue = clean(issue)

    # Pages.
    pages = None

    first_page = container.get('firstPage')
    last_page = container.get('lastPage')

    if first_page and last_page:
        try:
            # Only checks that both values are numeric; their order is not
            # enforced.
            int(first_page), int(last_page)
            pages = '{}-{}'.format(first_page, last_page)
        except ValueError:
            # TODO(martin): This is more debug than info.
            pass

    if not pages and first_page:
        pages = first_page

    # License. The last entry that maps to a known slug wins; all entries
    # are kept for the "extra" field. "rightsList" may be null.
    license_slug = None
    license_extra = []

    for lic in attributes.get('rightsList', []) or []:
        slug = lookup_license_slug(lic.get('rightsUri'))
        if slug:
            license_slug = slug
        license_extra.append(lic)

    # Release type. Try to determine the release type from a variety of
    # types supplied in datacite. The "attributes.types.resourceType" is
    # uncontrolled (170000+ unique values, from "null", "Dataset" to
    # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
    # flows in 2009") citeproc may be the closest, but not always supplied.
    # Order lookup roughly by completeness of mapping.
    types = attributes.get('types', {}) or {}
    release_type, value = None, None
    for type_key in ('citeproc', 'ris', 'schemaOrg', 'bibtex',
                     'resourceTypeGeneral'):
        value = types.get(type_key)
        release_type = DATACITE_TYPE_MAP.get(type_key, {}).get(value)
        if release_type is not None:
            break

    if release_type is None:
        print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)

    # release_type exception: Global Biodiversity Information Facility
    # publishes highly interesting datasets, but titles are mostly the same
    # ("GBIF Occurrence Download" or "Occurrence Download"); set
    # release_type to "stub" (CSL/FC).
    if publisher == 'The Global Biodiversity Information Facility':
        release_type = 'stub'

    # release_type exception: lots of "Experimental Crystal Structure Determination"
    if publisher == 'Cambridge Crystallographic Data Centre':
        release_type = 'entry'

    # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
    if title.lower().startswith('additional file'):
        release_type = 'stub'

    # Language values are varied ("ger", "es", "English", "ENG", "en-us",
    # "other", ...). Try to crush it with langcodes: "It may sound to you
    # like langcodes solves a pretty boring problem. At one level, that's
    # right. Sometimes you have a boring problem, and it's great when a
    # library solves it for you." -- TODO(martin): We need more of these.
    language = None

    value = attributes.get('language', '') or ''
    try:
        language = pycountry.languages.lookup(value).alpha_2
    except (LookupError, AttributeError):
        # TODO(martin): Print this on debug level, only.
        pass

    # Abstracts appear in "attributes.descriptions[].descriptionType", some
    # of the observed values: "Methods", "TechnicalInfo",
    # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
    # "Other" fields might contain references or related articles (with
    # DOI). TODO(martin): maybe try to parse out some of those refs.
    abstracts = []
    descs = attributes.get('descriptions', []) or []
    for desc in descs:
        if not desc.get('descriptionType') == 'Abstract':
            continue

        # Description maybe a string, int or list.
        text = desc.get('description', '')
        if not text:
            continue
        if isinstance(text, int):
            text = '{}'.format(text)
        if isinstance(text, list):
            try:
                text = "\n".join(text)
            except TypeError:
                continue  # Bail out, if it is not a list of strings.

        # Limit length.
        if len(text) < 10:
            continue
        if len(text) > MAX_ABSTRACT_LENGTH:
            text = text[:MAX_ABSTRACT_LENGTH] + " [...]"

        # Detect language. This is fuzzy and may be removed, if too unreliable.
        lang = None
        try:
            lang = langdetect.detect(text)
        except (langdetect.lang_detect_exception.LangDetectException,
                TypeError) as err:
            print('[{}] language detection failed with {} on {}'.format(
                doi, err, text),
                  file=sys.stderr)
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(
                mimetype="text/plain",
                content=clean(text),
                lang=lang,
            ))

    # References and relations. Datacite include many relation types in
    # "attributes.relatedIdentifiers[].relationType", e.g.
    # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
    # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
    # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
    # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
    # "IsDerivedFrom", "IsSourceOf".
    #
    # For the moment, we only care about References.
    refs, ref_index = [], 0

    relIds = attributes.get('relatedIdentifiers', []) or []
    for rel in relIds:
        if not rel.get('relationType', '') in ('References', 'Cites'):
            continue
        ref_extra = dict()
        if rel.get('relatedIdentifierType', '') == 'DOI':
            ref_extra['doi'] = rel.get('relatedIdentifier')
        if not ref_extra:
            ref_extra = None
        refs.append(
            fatcat_openapi_client.ReleaseRef(
                index=ref_index,
                extra=ref_extra,
            ))
        ref_index += 1

    # More specific release_type via the 'Reviews' relationship. "Reviews"
    # is a value of relationType (relatedIdentifierType holds identifier
    # schemes such as "DOI"), so that is the field to test.
    if any(rel.get('relationType', '') == 'Reviews' for rel in relIds):
        release_type = 'review'

    # Extra information.
    extra_datacite = dict()

    if license_extra:
        extra_datacite['license'] = license_extra
    if attributes.get('subjects'):
        extra_datacite['subjects'] = attributes['subjects']

    # Include version information.
    metadata_version = attributes.get('metadataVersion') or ''

    if metadata_version:
        extra_datacite['metadataVersion'] = metadata_version

    # Include resource types.
    resource_type = types.get('resourceType', '') or ''
    resource_type_general = types.get('resourceTypeGeneral', '') or ''

    if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
        extra_datacite['resourceType'] = resource_type
    if resource_type_general and resource_type_general.lower(
    ) not in UNKNOWN_MARKERS_LOWER:
        extra_datacite['resourceTypeGeneral'] = resource_type_general

    # Include certain relations from relatedIdentifiers. Keeping the
    # original structure of data here, which is a list of dicts, with
    # relation type, identifier and identifier type (mostly).
    kept_relation_types = ('IsPartOf', 'Reviews', 'Continues',
                           'IsVariantFormOf', 'IsSupplementTo', 'HasVersion',
                           'IsMetadataFor', 'IsNewVersionOf', 'IsIdenticalTo',
                           'IsVersionOf', 'IsDerivedFrom', 'IsSourceOf')
    relations = [
        rel for rel in relIds if rel.get('relationType') in kept_relation_types
    ]

    if relations:
        extra_datacite['relations'] = relations

    extra = dict()

    # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
    # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
    # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st
    # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
    # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
    # "10161", "10010691", "10780", # "Presentación"
    # Empty values are normalized to None.
    version = attributes.get('version') or None

    # top-level extra keys
    if not container_id and container_name:
        extra['container_name'] = container_name

    # Always include datacite key, even if value is empty (dict).
    extra['datacite'] = extra_datacite

    # Preparation for a schema update.
    if release_month:
        extra['release_month'] = release_month

    extids = self.lookup_ext_ids(doi=doi)

    # Assemble release.
    release = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type=release_type,
        release_stage=release_stage,
        title=title,
        subtitle=subtitle,
        original_title=original_language_title,
        release_year=release_year,
        release_date=release_date,
        publisher=publisher,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            core=extids['core_id'],
            arxiv=extids['arxiv_id'],
            jstor=extids['jstor_id'],
        ),
        contribs=contribs,
        volume=volume,
        issue=issue,
        pages=pages,
        language=language,
        abstracts=abstracts,
        refs=refs,
        extra=extra,
        license_slug=license_slug,
        version=version,
    )
    return release
def parse_record(self, article):
    """
    Map a JSTOR article to a ReleaseEntity.

    Args:
        article: parsed article XML, navigated via attribute access,
            .find()/.find_all() and .string (presumably a BeautifulSoup
            tag -- confirm against the caller).

    Returns:
        A fatcat_openapi_client.ReleaseEntity, or None if no title could be
        determined.

    Raises:
        AssertionError: if an ISSN has an unexpected length, or no numeric
            JSTOR identifier can be derived from the record.

    Side effects: may create a container entity via self.create_container
    when self.create_containers is set and the ISSN-L has no container yet.
    """
    journal_meta = article.front.find("journal-meta")
    article_meta = article.front.find("article-meta")

    extra = dict()
    extra_jstor = dict()

    # May be None for article types without a mapping.
    release_type = JSTOR_TYPE_MAP.get(article['article-type'])

    title_tag = article_meta.find("article-title")
    if title_tag and title_tag.string:
        title = title_tag.string.strip()
    else:
        title = None

    # Untitled reviews are named after the reviewed product. Both the
    # release type and the <product> element may be missing, so guard them
    # instead of failing on None.
    if not title and release_type and release_type.startswith('review'):
        product = article_meta.product
        if product and product.source:
            title = "Review: {}".format(product.source.string)

    if not title:
        return None

    if title.endswith('.'):
        title = title[:-1]

    if "[Abstract]" in title:
        # TODO: strip the "[Abstract]" bit?
        release_type = "abstract"
    elif "[Editorial" in title:
        release_type = "editorial"
    elif "[Letter" in title:
        release_type = "letter"
    elif "[Poem" in title or "[Photograph" in title:
        release_type = None

    if title.startswith("[") and title.endswith("]"):
        # strip brackets if that is all that is there (eg, translation or non-english)
        title = title[1:-1]

    # JSTOR journal-id
    journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
    if journal_ids:
        extra_jstor['journal_ids'] = journal_ids

    # Either element may be absent; a missing one yields None rather than
    # an AttributeError.
    journal_title_tag = journal_meta.find("journal-title")
    journal_title = journal_title_tag.string if journal_title_tag else None
    publisher_tag = journal_meta.find("publisher-name")
    publisher = publisher_tag.string if publisher_tag else None

    issn = journal_meta.find("issn")
    if issn:
        issn = issn.string
        if len(issn) == 8:
            # Normalize "12345678" to "1234-5678".
            issn = "{}-{}".format(issn[0:4], issn[4:8])
        else:
            assert len(issn) == 9

    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)

    # create container if it doesn't exist
    if (container_id is None and self.create_containers
            and (issnl is not None) and journal_title):
        ce = fatcat_openapi_client.ContainerEntity(
            issnl=issnl,
            publisher=publisher,
            container_type=self.map_container_type(release_type),
            name=clean(journal_title, force_xml=True))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        # Remember the new container for later lookups.
        self._issnl_id_map[issnl] = container_id

    doi = article_meta.find("article-id", {"pub-id-type": "doi"})
    if doi:
        doi = doi.string.lower().strip()

    jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
    if jstor_id:
        jstor_id = jstor_id.string.strip()
    if not jstor_id and doi:
        # Fall back to the DOI suffix as JSTOR identifier.
        assert doi.startswith('10.2307/')
        jstor_id = doi.replace('10.2307/', '')
    assert jstor_id and int(jstor_id)

    contribs = []
    cgroup = article_meta.find("contrib-group")
    if cgroup:
        for c in cgroup.find_all("contrib"):
            given = c.find("given-names")
            if given:
                given = clean(given.string)
            surname = c.find("surname")
            if surname:
                surname = clean(surname.string)
            raw_name = c.find("string-name")
            if raw_name:
                raw_name = clean(raw_name.string)

            # Without a usable <string-name>, build the raw name from parts.
            if not raw_name:
                if given and surname:
                    raw_name = "{} {}".format(given, surname)
                elif surname:
                    raw_name = surname

            role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
            if not role and c.get('contrib-type'):
                sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(
                    c['contrib-type']))
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    role=role,
                    raw_name=raw_name,
                    given_name=given,
                    surname=surname,
                ))

    # "et al." placeholders keep their position in the list but get no index.
    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i

    release_year = None
    release_date = None
    pub_date = article_meta.find('pub-date')
    if pub_date and pub_date.year:
        release_year = int(pub_date.year.string)
        if pub_date.month and pub_date.day:
            release_date = datetime.date(release_year,
                                         int(pub_date.month.string),
                                         int(pub_date.day.string))
            if release_date.day == 1 and release_date.month == 1:
                # suspect jan 1st dates get set by JSTOR when actual
                # date not known (citation needed), so drop them
                release_date = None

    volume = None
    if article_meta.volume:
        volume = article_meta.volume.string or None

    issue = None
    if article_meta.issue:
        issue = article_meta.issue.string or None

    pages = None
    if article_meta.find("page-range"):
        pages = article_meta.find("page-range").string
    elif article_meta.fpage:
        pages = article_meta.fpage.string

    # Language is read from a <custom-meta> entry named "lang"; records
    # without <custom-meta> (or without a <meta-name>) simply get none.
    language = None
    cm = article_meta.find("custom-meta")
    meta_name = cm.find("meta-name") if cm else None
    if meta_name is not None and meta_name.string == "lang":
        language = cm.find("meta-value").string.split()[0]
        language = LANG_MAP_MARC.get(language)
        if not language:
            warnings.warn("MISSING MARC LANG: {}".format(
                cm.find("meta-value").string))

    # JSTOR issue-id
    if article_meta.find('issue-id'):
        issue_id = clean(article_meta.find('issue-id').string)
        if issue_id:
            extra_jstor['issue_id'] = issue_id

    # everything in JSTOR is published
    release_stage = "published"

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   pubmed: retraction refs
    if extra_jstor:
        extra['jstor'] = extra_jstor
    if not extra:
        extra = None

    release = fatcat_openapi_client.ReleaseEntity(
        # work_id
        title=title,
        # original_title
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            jstor=jstor_id,
        ),
        volume=volume,
        issue=issue,
        pages=pages,
        publisher=publisher,
        language=language,
        # license_slug

        # content, mimetype, lang
        # abstracts=abstracts,
        contribs=contribs,

        # key, year, container_name, title, locator
        # extra: volume, authors, issue, publisher, identifiers
        # refs=refs,

        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        container_id=container_id,
        extra=extra,
    )
    return release
def parse_record(self, obj):
    """
    Map a datacite JSON document to a ReleaseEntity.

    Args:
        obj: a decoded datacite record; expected to be a dict carrying an
            "attributes" dict.

    Returns:
        A fatcat_openapi_client.ReleaseEntity (after self.biblio_hacks), or
        a falsy value when the record is skipped: None for malformed input
        or a missing/non-ascii DOI, False for a missing, empty or spammy
        title.

    Side effects: may create a container entity via self.create_container
    when the record names an ISSN that resolves to an ISSN-L without a known
    container; diagnostics are printed (mostly to stderr).
    """
    if not obj or not isinstance(obj, dict):
        return None
    if 'attributes' not in obj:
        return None

    attributes = obj['attributes']
    if not isinstance(attributes, dict):
        # E.g. "attributes": null -- nothing to map.
        return None

    # The "doi" key may be present with an explicit null (or non-string)
    # value; treat that like a missing DOI instead of failing on .lower().
    raw_doi = attributes.get('doi')
    doi = None
    if raw_doi and isinstance(raw_doi, str):
        doi = clean_doi(raw_doi.lower())
    if not doi:
        print('skipping record without a DOI', file=sys.stderr)
        return None

    if not str.isascii(doi):
        print('[{}] skipping non-ascii doi for now'.format(doi))
        return None

    def first_container_title(value):
        # The container title is sometimes a list; use the first entry and
        # only complain when there is actually more than one.
        if isinstance(value, list):
            if len(value) > 1:
                print('[{}] too many container titles: {}'.format(
                    doi, len(value)))
            return value[0] if value else None
        return value

    creators = attributes.get('creators', []) or []
    contributors = attributes.get('contributors', []) or []  # Much fewer than creators.

    contribs = self.parse_datacite_creators(creators, doi=doi)

    # Beside creators, we have contributors in datacite. Sample:
    # ContactPerson, DataCollector, DataCurator, DataManager, Distributor,
    # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader,
    # ProjectMember, RelatedPerson, ResearchGroup, Researcher,
    # RightsHolder, Sponsor, Supervisor
    #
    # Datacite schema:
    # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32
    # -- could be used as a form of controlled vocab?
    #
    # Currently (07/2020) in release_contrib:
    #
    # select count(*), role from release_contrib group by role;
    #    count   |    role
    # -----------+------------
    #  500269665 | author
    #    4386563 | editor
    #      17871 | translator
    #   10870584 |
    # (4 rows)
    #
    # Related: https://guide.fatcat.wiki/entity_release.html -- role
    # (string, of a set): the type of contribution, from a controlled
    # vocabulary. TODO: vocabulary needs review.
    contribs_extra_contributors = self.parse_datacite_creators(
        contributors, set_index=False, doi=doi)

    # Unfortunately, creators and contributors might overlap, refs GH59.
    for cc in contribs_extra_contributors:
        if contributor_list_contains_contributor(contribs, cc):
            continue
        contribs.append(cc)

    # Title, may come with "attributes.titles[].titleType", like
    # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
    titles = attributes.get('titles', []) or []
    title, original_language_title, subtitle = parse_datacite_titles(titles)

    if title is None:
        print('[{}] skipping record w/o title: {}'.format(doi, obj),
              file=sys.stderr)
        return False

    title = clean(title)
    if not title:
        print('[{}] skipping record w/o title: {}'.format(doi, obj),
              file=sys.stderr)
        return False

    # check for blocklisted "spam", e.g. "FULL MOVIE"; a rule matches once
    # at least rule["min"] of its distinct tokens occur in the title.
    title_lower = title.lower()
    for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
        seen = {
            token
            for token in rule.get("tokens", []) if token in title_lower
        }
        if len(seen) >= rule.get("min"):
            print("[{}] skipping spammy title: {}".format(doi, obj),
                  file=sys.stderr)
            return False

    subtitle = clean(subtitle) if subtitle else None

    # Dates. A few internal dates (registered, created, updated) and
    # published (0..2554). We try to work with typed date list, in
    # "attributes.dates[].dateType", values: "Accepted", "Available"
    # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
    # "Updated", "Valid".
    release_date, release_month, release_year = parse_datacite_dates(
        attributes.get('dates', []) or [])

    # block bogus far-future (and pre-1000) years/dates
    if release_year is not None and (release_year > (self.this_year + 5)
                                     or release_year < 1000):
        release_date = None
        release_month = None
        release_year = None

    # Some records do not use the "dates" field (e.g. micropub), but:
    # "attributes.published" or "attributes.publicationYear"
    if not any((release_date, release_month, release_year)):
        release_date, release_month, release_year = parse_single_date(
            attributes.get('publicationYear'))
        if not any((release_date, release_month, release_year)):
            release_date, release_month, release_year = parse_single_date(
                attributes.get('published'))

    if not any((release_date, release_month, release_year)):
        print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)

    # Start with clear stages, e.g. published. TODO(martin): we could
    # probably infer a bit more from the relations, e.g.
    # "IsPreviousVersionOf" or "IsNewVersionOf".
    release_stage = 'published'

    # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
    # we might want something else than 'published'. See also:
    # https://support.datacite.org/docs/doi-states.

    # Publisher. A few NA values. A few bogus values.
    publisher = attributes.get('publisher')

    if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
        publisher = None
        release_stage = None
    if publisher is not None and len(publisher) > 80:
        # Arbitrary magic value max length. TODO(martin): better heuristic,
        # but factored out; first we have to log misses. Example:
        # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
        # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
        # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
        # werden"
        publisher = None

    if publisher:
        publisher = clean(publisher)

    # Container. For the moment, only ISSN as container.
    container_id = None
    container_name = None

    container = attributes.get('container', {}) or {}
    if container.get('type') in CONTAINER_TYPE_MAP.keys():
        container_type = CONTAINER_TYPE_MAP.get(container['type'])
        if container.get('identifier') and container.get(
                'identifierType') == 'ISSN':
            issn = container.get('identifier')
            if len(issn) == 8:
                issn = issn[:4] + "-" + issn[4:]
            issnl = self.issn2issnl(issn)
            # NOTE(review): the nesting of this if/else was reconstructed
            # from a whitespace-collapsed source -- confirm against history.
            if issnl is not None:
                container_id = self.lookup_issnl(issnl)

                if container_id is None and container.get('title'):
                    container_name = first_container_title(
                        container.get('title'))
                    assert isinstance(container_name, str)
                    ce = fatcat_openapi_client.ContainerEntity(
                        issnl=issnl,
                        container_type=container_type,
                        name=container_name,
                    )
                    ce_edit = self.create_container(ce)
                    container_id = ce_edit.ident
                    # Remember the new container for later lookups.
                    self._issnl_id_map[issnl] = container_id
            else:
                # TODO(martin): factor this out into a testable function.
                # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
                container_name = first_container_title(
                    container.get('title'))

    # Exception: https://www.micropublication.org/, see: !MR24.
    if container_id is None and container_name is None:
        if publisher and publisher.lower().startswith('micropublication'):
            container_name = publisher

    # Volume and issue.
    volume = container.get('volume')
    issue = container.get('issue')

    if volume:
        volume = clean(volume)

    if issue:
        issue = clean(issue)

    # Pages.
    pages = None

    first_page = container.get('firstPage')
    last_page = container.get('lastPage')

    if first_page and last_page:
        try:
            # Only checks that both values are numeric; their order is not
            # enforced.
            int(first_page), int(last_page)
            pages = '{}-{}'.format(first_page, last_page)
        except ValueError:
            # TODO(martin): This is more debug than info.
            pass

    if not pages and first_page:
        pages = first_page

    # License. The last entry that maps to a known slug wins; all entries
    # are kept for the "extra" field. "rightsList" may be null.
    license_slug = None
    license_extra = []

    for lic in attributes.get('rightsList', []) or []:
        slug = lookup_license_slug(lic.get('rightsUri'))
        if slug:
            license_slug = slug
        license_extra.append(lic)

    release_type = self.datacite_release_type(doi, attributes)

    # Language values are varied ("ger", "es", "English", "ENG", "en-us",
    # "other", ...). Try to crush it with langcodes: "It may sound to you
    # like langcodes solves a pretty boring problem. At one level, that's
    # right. Sometimes you have a boring problem, and it's great when a
    # library solves it for you." -- TODO(martin): We need more of these.
    language = None

    value = attributes.get('language', '') or ''
    try:
        language = pycountry.languages.lookup(value).alpha_2
    except (LookupError, AttributeError):
        # TODO(martin): Print this on debug level, only.
        pass

    # Abstracts appear in "attributes.descriptions[].descriptionType", some
    # of the observed values: "Methods", "TechnicalInfo",
    # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
    # "Other" fields might contain references or related articles (with
    # DOI). TODO(martin): maybe try to parse out some of those refs.
    abstracts = []
    descs = attributes.get('descriptions', []) or []
    for desc in descs:
        if not desc.get('descriptionType') == 'Abstract':
            continue

        # Description maybe a string, int or list.
        text = desc.get('description', '')
        if not text:
            continue
        if isinstance(text, int):
            text = '{}'.format(text)
        if isinstance(text, list):
            try:
                text = "\n".join(text)
            except TypeError:
                continue  # Bail out, if it is not a list of strings.

        # Limit length.
        if len(text) < 10:
            continue
        if len(text) > MAX_ABSTRACT_LENGTH:
            text = text[:MAX_ABSTRACT_LENGTH] + " [...]"

        # Detect language. This is fuzzy and may be removed, if too unreliable.
        lang = None
        try:
            lang = langdetect.detect(text)
        except (langdetect.lang_detect_exception.LangDetectException,
                TypeError) as err:
            print('[{}] language detection failed with {} on {}'.format(
                doi, err, text),
                  file=sys.stderr)
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(
                mimetype="text/plain",
                content=clean(text),
                lang=lang,
            ))

    # References and relations. Datacite include many relation types in
    # "attributes.relatedIdentifiers[].relationType", e.g.
    # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
    # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
    # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
    # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
    # "IsDerivedFrom", "IsSourceOf".
    #
    # For the moment, we only care about References.
    refs, ref_index = [], 0

    relIds = attributes.get('relatedIdentifiers', []) or []
    for rel in relIds:
        if not rel.get('relationType', '') in ('References', 'Cites'):
            continue
        ref_extra = dict()
        if rel.get('relatedIdentifierType', '') == 'DOI':
            ref_extra['doi'] = rel.get('relatedIdentifier')
        if not ref_extra:
            ref_extra = None
        refs.append(
            fatcat_openapi_client.ReleaseRef(
                index=ref_index,
                extra=ref_extra,
            ))
        ref_index += 1

    # More specific release_type via the 'Reviews' relationship. "Reviews"
    # is a value of relationType (relatedIdentifierType holds identifier
    # schemes such as "DOI"), so that is the field to test.
    if any(rel.get('relationType', '') == 'Reviews' for rel in relIds):
        release_type = 'review'

    # Extra information.
    extra_datacite = dict()

    if license_extra:
        extra_datacite['license'] = license_extra
    if attributes.get('subjects'):
        extra_datacite['subjects'] = attributes['subjects']

    # Include version information.
    metadata_version = attributes.get('metadataVersion') or ''

    if metadata_version:
        extra_datacite['metadataVersion'] = metadata_version

    # Include resource types.
    types = attributes.get('types', {}) or {}
    resource_type = types.get('resourceType', '') or ''
    resource_type_general = types.get('resourceTypeGeneral', '') or ''

    if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
        extra_datacite['resourceType'] = resource_type
    if resource_type_general and resource_type_general.lower(
    ) not in UNKNOWN_MARKERS_LOWER:
        extra_datacite['resourceTypeGeneral'] = resource_type_general

    # Include certain relations from relatedIdentifiers. Keeping the
    # original structure of data here, which is a list of dicts, with
    # relation type, identifier and identifier type (mostly).
    kept_relation_types = ('IsPartOf', 'Reviews', 'Continues',
                           'IsVariantFormOf', 'IsSupplementTo', 'HasVersion',
                           'IsMetadataFor', 'IsNewVersionOf', 'IsIdenticalTo',
                           'IsVersionOf', 'IsDerivedFrom', 'IsSourceOf')
    relations = [
        rel for rel in relIds if rel.get('relationType') in kept_relation_types
    ]

    if relations:
        extra_datacite['relations'] = relations

    extra = dict()

    # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
    # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
    # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st
    # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
    # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
    # "10161", "10010691", "10780", # "Presentación"
    version = attributes.get('version') or None

    # top-level extra keys
    if not container_id and container_name:
        extra['container_name'] = container_name

    # Always include datacite key, even if value is empty (dict).
    extra['datacite'] = extra_datacite

    # Preparation for a schema update.
    if release_month:
        extra['release_month'] = release_month

    extids = self.lookup_ext_ids(doi=doi)

    # Assemble release.
    release = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type=release_type,
        release_stage=release_stage,
        title=title,
        subtitle=subtitle,
        original_title=original_language_title,
        release_year=release_year,
        release_date=release_date,
        publisher=publisher,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            core=extids['core_id'],
            arxiv=extids['arxiv_id'],
            jstor=extids['jstor_id'],
        ),
        contribs=contribs,
        volume=volume,
        issue=issue,
        pages=pages,
        language=language,
        abstracts=abstracts,
        refs=refs,
        extra=extra,
        license_slug=license_slug,
        version=version,
    )
    release = self.biblio_hacks(release)
    return release
def parse_record(self, record):
    """
    Map one JALC metadata record to a ReleaseEntity.

    record is a beautiful soup object
    returns a ReleaseEntity, or None

    None is returned when the record has no title, no usable DOI, or the
    title is empty after cleaning.

    In JALC metadata, both English and Japanese records are given for most
    fields.

    Side effect: when `self.create_containers` is set and no container is
    known for the ISSN-L, a container entity is created through
    `self.create_container()` and remembered in `self._issnl_id_map`.
    """
    extra = dict()
    extra_jalc = dict()

    titles = record.find_all("title")
    if not titles:
        return None
    title = titles[0].string.strip()
    original_title = None
    if title.endswith('.'):
        title = title[:-1]
    if len(titles) > 1:
        original_title = titles[1].string.strip()
        if original_title.endswith('.'):
            original_title = original_title[:-1]

    doi = None
    if record.doi:
        doi = record.doi.string.lower().strip()
        # strip a leading resolver URL, if any, to get the bare DOI
        for prefix in ('http://dx.doi.org/', 'https://dx.doi.org/',
                       'http://doi.org/', 'https://doi.org/'):
            if doi.startswith(prefix):
                doi = doi[len(prefix):]
                break
        if not (doi.startswith('10.') and '/' in doi):
            sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
            doi = None
    if not doi:
        return None

    people = record.find_all("Person")
    contribs = parse_jalc_persons(people)

    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i

    release_year = None
    release_date = None
    date_tag = record.date or None
    if date_tag:
        # The element's text is the date itself; an element without a single
        # string child yields None, which is treated as "no date".
        date_str = (date_tag.string or '').strip()
        if len(date_str) == 10:
            # NOTE(review): assumes DATE_FMT describes a 10-character date
            # such as YYYY-MM-DD -- confirm against its definition.
            try:
                parsed = datetime.datetime.strptime(date_str, DATE_FMT).date()
            except ValueError:
                sys.stderr.write(
                    "unparsable JALC date: {}\n".format(date_str))
            else:
                release_year = parsed.year
                release_date = parsed.isoformat()
        elif len(date_str) == 4 and date_str.isdigit():
            release_year = int(date_str)

    pages = None
    if record.startingPage:
        pages = record.startingPage.string
        if record.endingPage:
            pages = "{}-{}".format(pages, record.endingPage.string)
    volume = None
    if record.volume:
        volume = record.volume.string
    issue = None
    if record.number:
        # note: number/issue transform
        issue = record.number.string

    # container
    issn = None
    issn_list = record.find_all("issn")
    if issn_list:
        # if we wanted the other ISSNs, would also need to uniq the list.
        # But we only need one to lookup ISSN-L/container
        issn = issn_list[0].string
    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)

    publisher = None
    container_name = None
    container_extra = dict()

    if record.publicationName:
        pubs = [
            p.string.strip() for p in record.find_all("publicationName")
            if p.string
        ]
        pubs = [clean(p) for p in pubs if p]
        # Elements may be present but empty; skip rather than assert, the
        # same way the publisher branch below does.
        if pubs:
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # eng/jpn ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            container_name = clean(pubs[0])
            if len(pubs) > 1:
                container_extra['original_name'] = clean(pubs[1])

    if record.publisher:
        pubs = [
            p.string.strip() for p in record.find_all("publisher")
            if p.string
        ]
        pubs = [p for p in pubs if p]
        if len(pubs) > 1 and pubs[0] == pubs[1]:
            pubs = [pubs[0]]
        if len(pubs) > 1 and is_cjk(pubs[0]):
            # ordering is not reliable
            pubs = [pubs[1], pubs[0]]
        if pubs:
            publisher = clean(pubs[0])
            if len(pubs) > 1:
                container_extra['publisher_aliases'] = pubs[1:]

    if (container_id is None and self.create_containers
            and (issnl is not None) and container_name):
        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        container_extra['country'] = 'jp'
        container_extra['languages'] = ['ja']
        ce = fatcat_openapi_client.ContainerEntity(
            name=container_name,
            container_type='journal',
            publisher=publisher,
            issnl=issnl,
            extra=(container_extra or None))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        # short-cut future imports in same batch
        self._issnl_id_map[issnl] = container_id

    # the vast majority of works are in japanese
    # TODO: any indication when *not* in japanese?
    lang = "ja"

    # reasonable default for this collection
    release_type = "article-journal"

    # external identifiers
    extids = self.lookup_ext_ids(doi=doi)

    # extra:
    #   translation_of
    #   aliases
    #   container_name
    #   group-title
    # always put at least an empty dict here to indicate the DOI registrar
    # (informally)
    extra['jalc'] = extra_jalc

    title = clean(title)
    if not title:
        return None

    re = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        title=title,
        original_title=clean(original_title),
        release_type=release_type,
        release_stage='published',
        release_date=release_date,
        release_year=release_year,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            core=extids['core_id'],
            arxiv=extids['arxiv_id'],
            jstor=extids['jstor_id'],
        ),
        volume=volume,
        issue=issue,
        pages=pages,
        publisher=publisher,
        language=lang,
        #license_slug
        container_id=container_id,
        contribs=contribs,
        extra=extra,
    )
    return re
def parse_record(self, record):
    """
    Map one arXiv record to a list of ReleaseEntity objects, one per
    `version` element found in the record's `arXivRaw` metadata.

    record is a beautiful soup object.

    Returns None when the record is empty or carries no `arXivRaw`
    element; otherwise a non-empty list of releases in document order.
    Every release except the last is marked `superceded` in its extra
    dict; the DOI (if any) is attached to the last release only.
    """
    if not record:
        return None
    metadata = record.arXivRaw
    if not metadata:
        return None
    extra = dict()
    extra_arxiv = dict()

    # don't know!
    release_type = "article"
    base_id = metadata.id.string
    doi = None
    if metadata.doi and metadata.doi.string:
        # The field sometimes holds several whitespace-separated values; use
        # the first. A whitespace-only field yields no tokens at all.
        doi_tokens = metadata.doi.string.lower().split()
        if doi_tokens:
            doi = doi_tokens[0].strip()
            if not (doi.startswith('10.') and '/' in doi
                    and doi.split('/')[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
    title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
    authors = parse_arxiv_authors(metadata.authors.get_text().replace(
        '\n', ' '))
    contribs = [
        fatcat_openapi_client.ReleaseContrib(index=i,
                                             raw_name=a,
                                             role='author')
        for i, a in enumerate(authors)
    ]

    # the vast majority in english
    lang = "en"
    if metadata.comments and metadata.comments.get_text():
        comments = metadata.comments.get_text().replace('\n', ' ').strip()
        extra_arxiv['comments'] = comments
        comments_lower = comments.lower()
        # Ordered (phrase, language code) hints; the first phrase found in
        # the comments wins.
        language_hints = (
            ('in french', 'fr'),
            ('in spanish', 'es'),
            ('in portuguese', 'pt'),
            ('in hindi', 'hi'),
            ('in japanese', 'ja'),
            ('in german', 'de'),
            ('simplified chinese', 'zh'),
            ('in russian', 'ru'),
        )
        for phrase, code in language_hints:
            if phrase in comments_lower:
                lang = code
                break
        # more languages?

    number = None
    journal_ref_tag = metadata.find('journal-ref')
    if journal_ref_tag and journal_ref_tag.get_text():
        journal_ref = journal_ref_tag.get_text().replace('\n', ' ').strip()
        extra_arxiv['journal_ref'] = journal_ref
        journal_ref_lower = journal_ref.lower()
        if "conf." in journal_ref_lower or "proc." in journal_ref_lower:
            release_type = "paper-conference"
    report_no_tag = metadata.find('report-no')
    if report_no_tag and report_no_tag.string:
        number = report_no_tag.string.strip()
        # at least some people plop extra metadata in here. hrmf!
        if ('ISSN ' in number or 'ISBN ' in number
                or len(number.split()) > 2):
            extra_arxiv['report-no'] = number
            number = None
        else:
            release_type = "report"
    acm_class_tag = metadata.find('acm-class')
    if acm_class_tag and acm_class_tag.string:
        extra_arxiv['acm_class'] = acm_class_tag.string.strip()
    if metadata.categories and metadata.categories.get_text():
        extra_arxiv['categories'] = metadata.categories.get_text().split()
    license_slug = None
    if metadata.license and metadata.license.get_text():
        license_slug = lookup_license_slug(metadata.license.get_text())

    abstracts = None
    if metadata.abstract:
        # TODO: test for this multi-abstract code path
        abstracts = []
        abst = metadata.abstract.get_text().strip()
        orig = None
        # a run of dashes separates the primary abstract from a second one
        if '-----' in abst:
            both = abst.split('-----')
            abst = both[0].strip()
            orig = both[1].strip()
        if '$' in abst or '{' in abst:
            # looks like LaTeX: keep the raw form and add a plain-text
            # rendering in front of it
            mime = "application/x-latex"
            abst_plain = latex_to_text(abst)
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    content=abst_plain, mimetype="text/plain", lang="en"))
        else:
            mime = "text/plain"
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                  mimetype=mime,
                                                  lang="en"))
        if orig:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                      mimetype=mime))
            # indicates that fulltext probably isn't english either
            if lang == 'en':
                lang = None

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   arxiv: comments, categories, etc
    extra_arxiv['base_id'] = base_id
    extra['superceded'] = True
    extra['arxiv'] = extra_arxiv

    versions = []
    for version in metadata.find_all('version'):
        arxiv_id = base_id + version['version']
        submitted = datetime.datetime.strptime(
            version.date.string.strip(),
            "%a, %d %b %Y %H:%M:%S %Z").date()
        # TODO: source_type?
        versions.append(
            fatcat_openapi_client.ReleaseEntity(
                work_id=None,
                title=title,
                #original_title
                version=version['version'],
                release_type=release_type,
                release_stage='submitted',
                release_date=submitted.isoformat(),
                release_year=submitted.year,
                ext_ids=fatcat_openapi_client.ReleaseExtIds(
                    arxiv=arxiv_id, ),
                number=number,
                language=lang,
                license_slug=license_slug,
                abstracts=abstracts,
                contribs=contribs,
                # shallow copy: each release gets its own top-level dict,
                # the nested 'arxiv' dict is shared
                extra=extra.copy(),
            ))
    # TODO: assert that versions are actually in order?
    assert versions
    versions[-1].extra.pop('superceded')

    # only apply DOI to most recent version (HACK)
    if doi:
        versions[-1].ext_ids.doi = doi
        if len(versions) > 1:
            versions[-1].release_stage = "accepted"
    return versions
def parse_record(self, a):
    """
    Map one PubMed/MEDLINE article record to a ReleaseEntity.

    `a` is a beautiful soup element exposing `MedlineCitation` and
    `PubmedData` children.

    Returns a ReleaseEntity, or None when no usable title remains after
    cleaning.

    Side effect: when `self.create_containers` is set and no container is
    known for the ISSN-L, a container entity is created through
    `self.create_container()` and remembered in `self._issnl_id_map`.
    """
    medline = a.MedlineCitation
    # PubmedData isn't required by DTD, but seems to always be present
    pubmed = a.PubmedData
    extra = dict()
    extra_pubmed = dict()

    identifiers = pubmed.ArticleIdList
    pmid = medline.PMID.string.strip()
    doi = identifiers.find("ArticleId", IdType="doi")
    if doi and doi.string:
        doi = clean_doi(doi.string)
    else:
        doi = None

    pmcid = identifiers.find("ArticleId", IdType="pmc")
    if pmcid:
        pmcid = clean_pmcid(pmcid.string.strip().upper())

    release_type = None
    pub_types = []
    pub_type_list = medline.Article.PublicationTypeList
    for pub_type in pub_type_list.find_all("PublicationType"):
        pub_types.append(pub_type.string)
        # the first publication type with a known mapping decides the
        # release type; later types are not recorded
        if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
            release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
            break
    if pub_types:
        extra_pubmed['pub_types'] = pub_types
    if pub_type_list.find(string="Retraction of Publication"):
        release_type = "retraction"
        retraction_of = medline.find("CommentsCorrections",
                                     RefType="RetractionOf")
        if retraction_of:
            if retraction_of.RefSource:
                extra_pubmed[
                    'retraction_of_raw'] = retraction_of.RefSource.string
            if retraction_of.PMID:
                extra_pubmed[
                    'retraction_of_pmid'] = retraction_of.PMID.string

    # everything in medline is published
    release_stage = "published"
    if pub_type_list.find(string="Corrected and Republished Article"):
        release_stage = "updated"
    if pub_type_list.find(string="Retraction of Publication"):
        release_stage = "retraction"

    withdrawn_status = None
    if pub_type_list.find(string="Retracted Publication"):
        withdrawn_status = "retracted"
    elif medline.find("CommentsCorrections",
                      RefType="ExpressionOfConcernIn"):
        withdrawn_status = "concern"

    pages = medline.find('MedlinePgn')
    if pages:
        pages = pages.string

    title = medline.Article.ArticleTitle.get_text()  # always present
    if title:
        title = title.replace('\n', ' ')
        if title.endswith('.'):
            title = title[:-1]
        # this hides some "special" titles, but the vast majority are
        # translations; translations don't always include the original_title
        if title.startswith('[') and title.endswith(']'):
            title = title[1:-1]
    else:
        # will filter out later
        title = None

    # NOTE(review): BeautifulSoup's keyword is `recursive`; `recurse=False`
    # is not a named parameter of find() and is passed through as an
    # attribute filter -- confirm the intent before changing it.
    original_title = medline.Article.find("VernacularTitle", recurse=False)
    if original_title:
        # An empty element yields "", which must become None *before* any
        # string method is called on it.
        original_title = original_title.get_text() or None
        if original_title:
            original_title = original_title.replace('\n', ' ')
            if original_title.endswith('.'):
                original_title = original_title[:-1]

    if original_title and not title:
        # if we only have an "original" title, but not translated/english
        # title, sub in the original title so the entity can be created
        title = original_title
        original_title = None

    # TODO: happening in alpha order, not handling multi-language well.
    language = medline.Article.Language
    if language:
        language = language.get_text()
        if language in ("und", "un"):
            # "undetermined"
            language = None
        else:
            language = LANG_MAP_MARC.get(language)
            if not language and not (medline.Article.Language.get_text()
                                     in LANG_MAP_MARC):
                warnings.warn("MISSING MARC LANG: {}".format(
                    medline.Article.Language.string))

    ### Journal/Issue Metadata
    # MedlineJournalInfo is always present
    issnl = None
    container_id = None
    container_name = None
    container_extra = dict()
    mji = medline.MedlineJournalInfo
    if mji.find("Country"):
        country_name = mji.Country.string.strip()
        country_code = COUNTRY_NAME_MAP.get(country_name)
        if country_code:
            container_extra['country'] = country_code
        elif country_name:
            container_extra['country_name'] = country_name
    if mji.find("ISSNLinking"):
        issnl = mji.ISSNLinking.string

    journal = medline.Article.Journal
    issnp = journal.find("ISSN", IssnType="Print")
    if issnp:
        container_extra['issnp'] = issnp.string
    if not issnl:
        # look up by the ISSN text, not by the element that holds it
        issnl = self.issn2issnl(issnp.string if issnp else None)

    if issnl:
        container_id = self.lookup_issnl(issnl)

    pub_date = medline.Article.find('ArticleDate')
    if not pub_date:
        pub_date = journal.PubDate
    if not pub_date:
        pub_date = journal.JournalIssue.PubDate
    release_date = None
    release_year = None
    if pub_date.Year:
        release_year = int(pub_date.Year.string)
        if pub_date.find("Day") and pub_date.find("Month"):
            try:
                release_date = datetime.date(
                    release_year, MONTH_ABBR_MAP[pub_date.Month.string],
                    int(pub_date.Day.string))
                release_date = release_date.isoformat()
            except ValueError as ve:
                print("bad date, skipping: {}".format(ve), file=sys.stderr)
                release_date = None
    elif pub_date.MedlineDate:
        # free-form date string; only a leading 4-digit year is used
        medline_date = pub_date.MedlineDate.string.strip()
        if len(medline_date) >= 4 and medline_date[:4].isdigit():
            release_year = int(medline_date[:4])
            if release_year < 1300 or release_year > 2040:
                print(
                    "bad medline year, skipping: {}".format(release_year),
                    file=sys.stderr)
                release_year = None
        else:
            print("unparsable medline date, skipping: {}".format(
                medline_date),
                  file=sys.stderr)

    if journal.find("Title"):
        container_name = journal.Title.get_text()

    if (container_id is None and self.create_containers
            and (issnl is not None) and container_name):
        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        ce = fatcat_openapi_client.ContainerEntity(
            name=container_name,
            container_type='journal',
            #NOTE: publisher not included
            issnl=issnl,
            extra=(container_extra or None))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        self._issnl_id_map[issnl] = container_id

    ji = journal.JournalIssue
    volume = None
    if ji.find("Volume"):
        volume = ji.Volume.string
    issue = None
    if ji.find("Issue"):
        issue = ji.Issue.string

    ### Abstracts
    # "All abstracts are in English"
    abstracts = []
    primary_abstract = medline.find("Abstract")
    if primary_abstract and primary_abstract.AbstractText.get(
            'NlmCategory'):
        # first section carries an NlmCategory: join all sections into a
        # single plain-text abstract
        joined = "\n".join([
            m.get_text() for m in primary_abstract.find_all("AbstractText")
        ])
        abst = fatcat_openapi_client.ReleaseAbstract(
            content=joined,
            mimetype="text/plain",
            lang="en",
        )
        if abst.content:
            abstracts.append(abst)
    elif primary_abstract:
        for abstract in primary_abstract.find_all("AbstractText"):
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=abstract.get_text().strip(),
                mimetype="text/plain",
                lang="en",
            )
            if abst.content:
                abstracts.append(abst)
            if abstract.find('math'):
                abst = fatcat_openapi_client.ReleaseAbstract(
                    # strip the enclosing AbstractText open/close tags
                    content=str(abstract)[14:-15],
                    mimetype="application/mathml+xml",
                    lang="en",
                )
                if abst.content:
                    abstracts.append(abst)
    other_abstracts = medline.find_all("OtherAbstract")
    for other in other_abstracts:
        lang = "en"
        if other.get('Language'):
            lang = LANG_MAP_MARC.get(other['Language'])
        abst = fatcat_openapi_client.ReleaseAbstract(
            content=other.AbstractText.get_text().strip(),
            mimetype="text/plain",
            lang=lang,
        )
        if abst.content:
            abstracts.append(abst)
    if not abstracts:
        abstracts = None

    ### Contribs
    contribs = []
    if medline.AuthorList:
        for author in medline.AuthorList.find_all("Author"):
            creator_id = None
            given_name = None
            surname = None
            raw_name = None
            if author.ForeName:
                given_name = author.ForeName.get_text().replace('\n', ' ')
            if author.LastName:
                surname = author.LastName.get_text().replace('\n', ' ')
            if given_name and surname:
                raw_name = "{} {}".format(given_name, surname)
            elif surname:
                raw_name = surname
            if (not raw_name and author.CollectiveName
                    and author.CollectiveName.get_text()):
                raw_name = author.CollectiveName.get_text().replace(
                    '\n', ' ')
            contrib_extra = dict()
            orcid = author.find("Identifier", Source="ORCID")
            if orcid:
                # needs re-formatting from, eg, "0000000179841889"
                orcid = orcid.string
                if orcid.startswith("http://orcid.org/"):
                    orcid = orcid.replace("http://orcid.org/", "")
                elif orcid.startswith("https://orcid.org/"):
                    orcid = orcid.replace("https://orcid.org/", "")
                elif '-' not in orcid:
                    orcid = "{}-{}-{}-{}".format(
                        orcid[0:4],
                        orcid[4:8],
                        orcid[8:12],
                        orcid[12:16],
                    )
                creator_id = self.lookup_orcid(orcid)
                contrib_extra['orcid'] = orcid
            affiliations = author.find_all("Affiliation")
            raw_affiliation = None
            if affiliations:
                raw_affiliation = affiliations[0].get_text().replace(
                    '\n', ' ')
                if len(affiliations) > 1:
                    contrib_extra['more_affiliations'] = [
                        ra.get_text().replace('\n', ' ')
                        for ra in affiliations[1:]
                    ]
            if author.find("EqualContrib"):
                # TODO: schema for this?
                contrib_extra['equal'] = True
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    raw_name=raw_name,
                    given_name=given_name,
                    surname=surname,
                    role="author",
                    raw_affiliation=raw_affiliation,
                    creator_id=creator_id,
                    extra=contrib_extra,
                ))

        if medline.AuthorList['CompleteYN'] == 'N':
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(raw_name="et al."))

    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i

    if not contribs:
        contribs = None

    ### References
    refs = []
    if pubmed.ReferenceList:
        # note that Reference always exists within a ReferenceList, but
        # that there may be multiple ReferenceList (eg, sometimes one per
        # Reference)
        for ref in pubmed.find_all('Reference'):
            ref_extra = dict()
            ref_doi = ref.find("ArticleId", IdType="doi")
            if ref_doi:
                ref_doi = clean_doi(ref_doi.string)
            ref_pmid = ref.find("ArticleId", IdType="pubmed")
            if ref_pmid:
                ref_pmid = clean_pmid(ref_pmid.string)
            ref_release_id = None
            if ref_doi:
                ref_extra['doi'] = ref_doi
                if self.lookup_refs:
                    ref_release_id = self.lookup_doi(ref_doi)
            if ref_pmid:
                ref_extra['pmid'] = ref_pmid
                # when both identifiers are present the PMID lookup result
                # replaces the DOI one
                if self.lookup_refs:
                    ref_release_id = self.lookup_pmid(ref_pmid)
            ref_raw = ref.Citation
            if ref_raw:
                ref_extra['unstructured'] = ref_raw.get_text()
            if not ref_extra:
                ref_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    target_release_id=ref_release_id,
                    extra=ref_extra,
                ))
    if not refs:
        refs = None

    # extra:
    #   translation_of
    #   aliases
    #   container_name
    #   group-title
    #   pubmed: retraction refs
    if extra_pubmed:
        extra['pubmed'] = extra_pubmed
    if not extra:
        extra = None

    title = clean(title)
    if not title:
        return None

    re = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        title=title,
        original_title=clean(original_title),
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        withdrawn_status=withdrawn_status,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            pmid=pmid,
            pmcid=pmcid,
            #isbn13     # never in Article
        ),
        volume=volume,
        issue=issue,
        pages=pages,
        #publisher  # not included?
        language=language,
        #license_slug   # not in MEDLINE
        abstracts=abstracts,
        contribs=contribs,
        refs=refs,
        container_id=container_id,
        extra=extra,
    )
    return re
def test_access_redirect_fallback(client: Any, mocker: Any) -> None:
    """
    Access redirects under /work/.../access/ fall back to a catalog API
    lookup: known wayback and archive.org locations redirect (302), unknown
    ones render a 404 page that still offers helpful links.

    Elasticsearch and the fatcat API client are both patched, each primed
    with four canned responses (one per request made below).
    """
    work_path = "/work/2x5qvct2dnhrbctqa2q2uyut6a"

    with open("tests/files/elastic_fulltext_get.json") as fixture:
        elastic_resp = json.load(fixture)

    es_raw = mocker.patch(
        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
    )
    es_raw.side_effect = [(200, {}, json.dumps(elastic_resp)) for _ in range(4)]

    work = fatcat_openapi_client.WorkEntity(
        state="active",
        ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
    )
    fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
    fatcat_get_work_raw.side_effect = [work] * 4

    work_releases = [
        fatcat_openapi_client.ReleaseEntity(
            ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
        ),
    ]
    fatcat_get_work_releases_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_work_releases"
    )
    fatcat_get_work_releases_raw.side_effect = [work_releases] * 4

    # one file with a plain web URL, a wayback capture and an archive.org item
    file_urls = [
        fatcat_openapi_client.FileUrl(
            rel="web",
            url="https://blarg.example.com",
        ),
        fatcat_openapi_client.FileUrl(
            rel="webarchive",
            url="https://web.archive.org/web/12345/https://example.com",
        ),
        fatcat_openapi_client.FileUrl(
            rel="archive",
            url="https://archive.org/download/some/thing.pdf",
        ),
    ]
    release_with_files = fatcat_openapi_client.ReleaseEntity(
        state="active",
        ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
        ext_ids=fatcat_openapi_client.ReleaseExtIds(),
        files=[
            fatcat_openapi_client.FileEntity(
                ident="ffffffffffffffffffffffffff",
                urls=file_urls,
            ),
        ],
    )
    fatcat_get_release_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_release"
    )
    fatcat_get_release_raw.side_effect = [release_with_files] * 4

    # redirects should work after API lookup, for both wayback and archive.org
    resp = client.get(
        work_path + "/access/wayback/https://example.com",
        allow_redirects=False,
    )
    assert resp.status_code == 302
    assert (
        resp.headers["Location"]
        == "https://web.archive.org/web/12345id_/https://example.com"
    )

    resp = client.get(
        work_path + "/access/ia_file/some/thing.pdf",
        allow_redirects=False,
    )
    assert resp.status_code == 302
    assert resp.headers["Location"] == "https://archive.org/download/some/thing.pdf"

    # wrong URLs should still not work, but display a page with helpful links
    resp = client.get(
        work_path
        + "/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
        allow_redirects=False,
    )
    assert resp.status_code == 404
    assert b"Access Location Not Found" in resp.content
    assert b"web.archive.org/web/*/https://www.federalreserve.gov" in resp.content

    resp = client.get(
        work_path + "/access/ia_file/some/thing.else.pdf",
        allow_redirects=False,
    )
    assert resp.status_code == 404
    assert b"Access Location Not Found" in resp.content
    assert b"archive.org/download/some/thing.else.pdf" in resp.content
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
    """
    Map one DOAJ article record (a dict parsed from JSON, with the article
    metadata under the "bibjson" key) to a ReleaseEntity.

    Returns None, after bumping the matching `self.counts` entry, when the
    record is empty / not a dict / has no "bibjson" ("skip-empty"), or when
    it has no title after cleaning ("skip-title").

    bibjson {
        abstract (string, optional),
        author (Array[bibjson.author], optional),
        identifier (Array[bibjson.identifier]),
        journal (bibjson.journal, optional),
        keywords (Array[string], optional),
        link (Array[bibjson.link], optional),
        month (string, optional),
        subject (Array[bibjson.subject], optional),
        title (string),
        year (string, optional)
    }
    bibjson.journal {
        country (string, optional),
        end_page (string, optional),
        language (Array[string], optional),
        license (Array[bibjson.journal.license], optional),
        number (string, optional),
        publisher (string, optional),
        start_page (string, optional),
        title (string, optional),
        volume (string, optional)
    }
    """
    if not obj or not isinstance(obj, dict) or "bibjson" not in obj:
        self.counts["skip-empty"] += 1
        return None

    bibjson = obj["bibjson"]

    title = clean_str(bibjson.get("title"), force_xml=True)
    if not title:
        self.counts["skip-title"] += 1
        # None (not False) so the result matches the declared return type
        return None

    # 'journal' is optional in the schema above; treat a missing or null
    # value as an empty object rather than failing with KeyError.
    journal = bibjson.get("journal") or {}

    container_name = clean_str(journal.get("title"))
    container_id = None
    # NOTE: 'issns' not documented in API schema
    for issn in journal.get("issns") or []:
        issnl = self.issn2issnl(issn)
        if issnl:
            container_id = self.lookup_issnl(issnl)
        if container_id:
            # don't store container_name when we have an exact match
            container_name = None
            break

    volume = clean_str(journal.get("volume"))
    # NOTE: this schema seems to use "number" as "issue number"
    issue = clean_str(journal.get("number"))
    publisher = clean_str(journal.get("publisher"))

    release_year: Optional[int]
    try:
        release_year = int(bibjson.get("year"))
    except (TypeError, ValueError):
        release_year = None
    release_month = parse_month(clean_str(bibjson.get("month")))

    # block bogus far-future years/dates
    if release_year is not None and (release_year > (self.this_year + 5)
                                     or release_year < 1000):
        release_month = None
        release_year = None

    license_slug = self.doaj_license_slug(journal.get("license"))
    country = parse_country_name(journal.get("country"))
    language = None
    # first language name that can be parsed wins
    for raw in journal.get("language") or []:
        language = parse_lang_name(raw)
        if language:
            break

    # pages
    # NOTE: error in API docs? seems like start_page not under 'journal' object
    start_page = clean_str(journal.get("start_page")) or clean_str(
        bibjson.get("start_page"))
    end_page = clean_str(journal.get("end_page")) or clean_str(
        bibjson.get("end_page"))
    pages: Optional[str] = None
    if start_page and end_page:
        pages = f"{start_page}-{end_page}"
    elif start_page:
        pages = start_page

    doaj_article_id = obj["id"].lower()
    ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
    abstracts = self.doaj_abstracts(bibjson) or []
    contribs = self.doaj_contribs(bibjson.get("author") or []) or []

    # DOAJ-specific extra
    doaj_extra: Dict[str, Any] = dict()
    if bibjson.get("subject"):
        doaj_extra["subject"] = bibjson.get("subject")
    if bibjson.get("keywords"):
        doaj_extra["keywords"] = [
            k for k in [clean_str(s) for s in bibjson.get("keywords")] if k
        ]

    # generic extra
    extra: Dict[str, Any] = dict()
    if country:
        extra["country"] = country
    if not container_id and container_name:
        extra["container_name"] = container_name
    if release_year and release_month:
        # TODO: schema migration
        extra["release_month"] = release_month

    if doaj_extra:
        extra["doaj"] = doaj_extra

    re = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type="article-journal",
        release_stage="published",
        title=title,
        release_year=release_year,
        # release_date,
        publisher=publisher,
        ext_ids=ext_ids,
        contribs=contribs or None,
        volume=volume,
        issue=issue,
        pages=pages,
        language=language,
        abstracts=abstracts or None,
        extra=extra or None,
        license_slug=license_slug,
    )
    re = self.biblio_hacks(re)
    # TODO: filter out some of these by publishers which are known to
    # register DOIs. eg, PLOS, maybe others
    return re
def parse_record(self, obj):
    """
    Map one DOAJ article record (a dict parsed from JSON, with the article
    metadata under the 'bibjson' key) to a ReleaseEntity.

    Returns a falsy value, after bumping the matching `self.counts` entry,
    when the record is skipped: None when it is empty / not a dict / has no
    'bibjson' ('skip-empty'), False when it has no title after cleaning
    ('skip-title').

    bibjson {
        abstract (string, optional),
        author (Array[bibjson.author], optional),
        identifier (Array[bibjson.identifier]),
        journal (bibjson.journal, optional),
        keywords (Array[string], optional),
        link (Array[bibjson.link], optional),
        month (string, optional),
        subject (Array[bibjson.subject], optional),
        title (string),
        year (string, optional)
    }
    bibjson.journal {
        country (string, optional),
        end_page (string, optional),
        language (Array[string], optional),
        license (Array[bibjson.journal.license], optional),
        number (string, optional),
        publisher (string, optional),
        start_page (string, optional),
        title (string, optional),
        volume (string, optional)
    }
    """
    if not obj or not isinstance(obj, dict) or 'bibjson' not in obj:
        self.counts['skip-empty'] += 1
        return None

    bibjson = obj['bibjson']

    title = clean_str(bibjson.get('title'), force_xml=True)
    if not title:
        self.counts['skip-title'] += 1
        return False

    # 'journal' is optional in the schema above; treat a missing or null
    # value as an empty object rather than failing with KeyError.
    journal = bibjson.get('journal') or {}

    container_name = clean_str(journal.get('title'))
    container_id = None
    # NOTE: 'issns' not documented in API schema
    for issn in journal.get('issns') or []:
        issnl = self.issn2issnl(issn)
        if issnl:
            # reuse the ISSN-L resolved above instead of resolving it twice
            container_id = self.lookup_issnl(issnl)
        if container_id:
            # don't store container_name when we have an exact match
            container_name = None
            break

    volume = clean_str(journal.get('volume'))
    # NOTE: this schema seems to use "number" as "issue number"
    issue = clean_str(journal.get('number'))
    publisher = clean_str(journal.get('publisher'))

    try:
        release_year = int(bibjson.get('year'))
    except (TypeError, ValueError):
        release_year = None
    release_month = parse_month(clean_str(bibjson.get('month')))

    # block bogus far-future years/dates
    if release_year is not None and (release_year > (self.this_year + 5)
                                     or release_year < 1000):
        release_month = None
        release_year = None

    license_slug = self.doaj_license_slug(journal.get('license'))
    country = parse_country_name(journal.get('country'))
    language = None
    # first language name that can be parsed wins
    for raw in journal.get('language') or []:
        language = parse_lang_name(raw)
        if language:
            break

    # pages
    # NOTE: error in API docs? seems like start_page not under 'journal' object
    start_page = clean_str(journal.get('start_page')) or clean_str(
        bibjson.get('start_page'))
    end_page = clean_str(journal.get('end_page')) or clean_str(
        bibjson.get('end_page'))
    pages = None
    if start_page and end_page:
        pages = f"{start_page}-{end_page}"
    elif start_page:
        pages = start_page

    doaj_article_id = obj['id'].lower()
    ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
    abstracts = self.doaj_abstracts(bibjson)
    contribs = self.doaj_contribs(bibjson.get('author') or [])

    # DOAJ-specific extra
    doaj_extra = dict()
    if bibjson.get('subject'):
        doaj_extra['subject'] = bibjson.get('subject')
    if bibjson.get('keywords'):
        doaj_extra['keywords'] = [
            k for k in [clean_str(s) for s in bibjson.get('keywords')] if k
        ]

    # generic extra
    extra = dict()
    if country:
        extra['country'] = country
    if not container_id and container_name:
        extra['container_name'] = container_name
    if release_year and release_month:
        # TODO: schema migration
        extra['release_month'] = release_month

    if doaj_extra:
        extra['doaj'] = doaj_extra
    if not extra:
        extra = None

    re = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type='article-journal',
        release_stage='published',
        title=title,
        release_year=release_year,
        #release_date,
        publisher=publisher,
        ext_ids=ext_ids,
        contribs=contribs,
        volume=volume,
        issue=issue,
        pages=pages,
        language=language,
        abstracts=abstracts,
        extra=extra,
        license_slug=license_slug,
    )
    re = self.biblio_hacks(re)
    return re
def parse_record(self, obj):
    """
    Map one Crossref work record to a ReleaseEntity.

    obj is a python dict (parsed from json).

    Returns a ReleaseEntity, or None when the record is skipped. Every skip
    increments a 'skip-*' key in self.counts: out-of-scope 'type', blank
    title, or an unreasonably large number of abstracts/contribs/refs.
    """

    # Ways to be out of scope (provisionally)
    # journal-issue and journal-volume map to None, but allowed for now
    if obj.get('type') in (None, 'journal', 'proceedings', 'standard-series',
                           'report-series', 'book-series', 'book-set',
                           'book-track', 'proceedings-series'):
        self.counts['skip-release-type'] += 1
        return None

    # Do require the 'title' keys to exist, as release entities do
    if not obj.get('title'):
        self.counts['skip-blank-title'] += 1
        return None

    release_type = self.map_release_type(obj['type'])

    # contribs
    def do_contribs(obj_list, ctype):
        """Build ReleaseContrib objects of role `ctype` from a contributor list."""
        assert ctype in ("author", "editor", "translator")
        contribs = []
        for i, am in enumerate(obj_list):
            creator_id = None
            if 'ORCID' in am:
                # only the last path segment of the ORCID value is looked up
                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
            # Sorry humans :(
            if am.get('given') and am.get('family'):
                raw_name = "{} {}".format(am['given'], am['family'])
            elif am.get('family'):
                raw_name = am['family']
            else:
                # TODO: can end up empty
                raw_name = am.get('name') or am.get('given')
            extra = dict()
            # only authors get a positional index
            index = i if ctype == "author" else None
            raw_affiliation = None
            affiliations = am.get('affiliation')
            if affiliations:
                raw_affiliation = affiliations[0]['name']
                if len(affiliations) > 1:
                    # note: affiliation => more_affiliations
                    extra['more_affiliations'] = [
                        clean(a['name']) for a in affiliations[1:]
                    ]
            if am.get('sequence') and am.get('sequence') != "additional":
                extra['seq'] = clean(am.get('sequence'))
            if not extra:
                extra = None
            raw_name = clean(raw_name)
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=index,
                    raw_name=raw_name,
                    given_name=clean(am.get('given')),
                    surname=clean(am.get('family')),
                    raw_affiliation=clean(raw_affiliation),
                    role=ctype,
                    extra=extra))
        return contribs

    contribs = do_contribs(obj.get('author', []), "author")
    contribs.extend(do_contribs(obj.get('editor', []), "editor"))
    contribs.extend(do_contribs(obj.get('translator', []), "translator"))

    # container
    # 'ISSN' may be missing or an empty list; both mean "no ISSN"
    issn = (obj.get('ISSN') or [None])[0]
    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)
    publisher = clean(obj.get('publisher'))

    container_name = obj.get('container-title')
    if container_name:
        container_name = clean(container_name[0], force_xml=True)
    if not container_name:
        container_name = None
    if (container_id is None and self.create_containers
            and (issnl is not None) and container_name):
        ce = fatcat_openapi_client.ContainerEntity(
            issnl=issnl,
            publisher=publisher,
            container_type=self.map_container_type(release_type),
            name=container_name)
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        # remember the new container so later lookups of this ISSN-L hit it
        self._issnl_id_map[issnl] = container_id

    # license slug
    license_slug = None
    license_extra = []
    for lic in obj.get('license', []):
        if lic['content-version'] not in ('vor', 'unspecified'):
            continue
        slug = lookup_license_slug(lic['URL'])
        if slug:
            license_slug = slug
        if 'start' in lic:
            # flatten the nested date object to its 'date-time' value
            lic['start'] = lic['start']['date-time']
        license_extra.append(lic)

    # references
    refs = []
    # Upper bound for plausible reference years: a few years past today,
    # rather than a hard-coded year that silently goes stale.
    max_ref_year = datetime.date.today().year + 5
    for i, rm in enumerate(obj.get('reference', [])):
        try:
            year = int(rm.get('year'))
            # NOTE: are there crossref works with year < 100?
            if year > max_ref_year or year < 100:
                year = None
        except (TypeError, ValueError):
            year = None
        ref_extra = dict()
        key = rm.get('key')
        if key:
            # strip the work's own (upper-cased) DOI prefix from the ref key
            doi_upper = obj['DOI'].upper()
            if key.startswith(doi_upper):
                key = key.replace(doi_upper + "-", '')
                key = key.replace(doi_upper, '')
        ref_container_name = rm.get('volume-title')
        if not ref_container_name:
            ref_container_name = rm.get('journal-title')
        elif rm.get('journal-title'):
            ref_extra['journal-title'] = rm['journal-title']
        if rm.get('DOI'):
            ref_extra['doi'] = rm.get('DOI').lower()
        author = clean(rm.get('author'))
        if author:
            ref_extra['authors'] = [author]
        for k in ('editor', 'edition', 'authority', 'version', 'genre',
                  'url', 'event', 'issue', 'volume', 'date',
                  'accessed_date', 'issued', 'page', 'medium',
                  'collection_title', 'chapter_number', 'unstructured',
                  'series-title', 'volume-title'):
            cleaned = clean(rm.get(k))
            if cleaned:
                ref_extra[k] = cleaned
        if not ref_extra:
            ref_extra = None
        refs.append(
            fatcat_openapi_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=clean(ref_container_name),
                title=clean(rm.get('article-title')),
                locator=clean(rm.get('first-page')),
                # TODO: just dump JSON somewhere here?
                extra=ref_extra))

    # abstracts
    abstracts = []
    abstract = clean(obj.get('abstract'))
    if abstract and len(abstract) > 10:
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(
                mimetype="application/xml+jats", content=abstract))

    # extra fields
    extra = dict()
    extra_crossref = dict()
    # top-level extra keys
    if not container_id and container_name:
        extra['container_name'] = container_name
    # One-element tuple: a bare ('group-title') is just a string, and
    # iterating it would walk its characters instead of the key.
    for key in ('group-title',):
        val = obj.get(key)
        if not val:
            continue
        if isinstance(val, list):
            val = val[0]
        if isinstance(val, str):
            val = clean(val)
            if val:
                extra[key] = val
        else:
            extra[key] = val
    # crossref-nested extra keys
    for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
        val = obj.get(key)
        if val:
            if isinstance(val, str):
                extra_crossref[key] = clean(val)
            else:
                extra_crossref[key] = val
    if license_extra:
        extra_crossref['license'] = license_extra

    # any titles after the first are kept as aliases
    if len(obj['title']) > 1:
        aliases = [clean(t) for t in obj['title'][1:]]
        aliases = [t for t in aliases if t]
        if aliases:
            extra['aliases'] = aliases

    # ISBN
    isbn13 = None
    for raw in obj.get('ISBN', []):
        # TODO: convert if not ISBN-13 format
        if len(raw) == 17:
            isbn13 = raw
            break

    # release status
    if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                       'dissertation', 'book-chapter'):
        release_stage = "published"
    else:
        # unknown
        release_stage = None

    # external identifiers
    extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

    # filter out unreasonably huge releases
    if len(abstracts) > 100:
        self.counts['skip-huge-abstracts'] += 1
        return None
    if len(contribs) > 2000:
        self.counts['skip-huge-contribs'] += 1
        return None
    if len(refs) > 5000:
        self.counts['skip-huge-refs'] += 1
        return None

    # release date parsing is amazingly complex
    raw_date = obj['issued']['date-parts'][0]
    if not raw_date or not raw_date[0]:
        # got some NoneType, even though at least year is supposed to be set
        release_year = None
        release_date = None
    elif len(raw_date) == 3:
        release_year = raw_date[0]
        release_date = datetime.date(year=raw_date[0],
                                     month=raw_date[1],
                                     day=raw_date[2])
    else:
        # sometimes only the year is included, not the full date
        release_year = raw_date[0]
        release_date = None

    original_title = None
    if obj.get('original-title'):
        original_title = clean(obj.get('original-title')[0], force_xml=True)
    title = None
    if obj.get('title'):
        title = clean(obj.get('title')[0], force_xml=True)
        if not title or len(title) <= 1:
            # title can't be just a single character
            self.counts['skip-blank-title'] += 1
            return None
    subtitle = None
    if obj.get('subtitle'):
        subtitle = clean(obj.get('subtitle')[0], force_xml=True)
        if not subtitle or len(subtitle) <= 1:
            # subtitle can't be just a single character
            subtitle = None

    if extra_crossref:
        extra['crossref'] = extra_crossref
    if not extra:
        extra = None

    # named 'release' (not 're') to avoid shadowing the stdlib module name
    release = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        title=title,
        subtitle=subtitle,
        original_title=original_title,
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        publisher=publisher,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=obj['DOI'].lower(),
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            isbn13=isbn13,
            core=extids['core_id'],
            arxiv=extids['arxiv_id'],
            jstor=extids['jstor_id'],
        ),
        volume=clean(obj.get('volume')),
        issue=clean(obj.get('issue')),
        pages=clean(obj.get('page')),
        language=clean(obj.get('language')),
        license_slug=license_slug,
        extra=extra,
        abstracts=abstracts,
        contribs=contribs,
        refs=refs,
    )
    return release
def parse_record(self, xml_elem):
    """
    Map one dblp XML record element to a ReleaseEntity.

    Returns False for every skipped record (a 'skip-*' key in self.counts
    is incremented), and also after printing the record as JSON when
    self.dump_json_mode is set. Otherwise returns the ReleaseEntity after
    it has been passed through self.biblio_hacks().

    Fields of interest in the dblp element:

    - title => may contain <i>, <sub>, <sup>, <tt>
    - journal (abbrev?)
    - volume, pages, number (number -> issue)
    - publisher
    - year
        => for conferences, year of conference not of publication
    - month
    - crossref (from inproceedings to specific proceedings volume)
    - booktitle
        => for inproceedings, this is the name of conference or workshop. acronym.
    - isbn
    """

    dblp_key = xml_elem.get('key')
    if not dblp_key:
        self.counts['skip-empty-key'] += 1
        return False
    key_parts = dblp_key.split('/')
    dblp_key_type = key_parts[0]

    # dblp_prefix may be used for container lookup
    dblp_prefix = None
    if dblp_key_type in ('journals', 'conf'):
        dblp_prefix = '/'.join(key_parts[:2])
    elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
        dblp_prefix = '/'.join(key_parts[:-1])

    publtype = xml_elem.get('publtype') or None

    dblp_type = xml_elem.name
    if dblp_type not in self.ELEMENT_TYPES:
        # actually skip what the counter says is skipped, as every other
        # 'skip-*' counter in this method does
        self.counts[f'skip-dblp-type:{dblp_type}'] += 1
        return False

    if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
        self.counts['skip-key-type'] += 1
        return False

    if dblp_key.startswith('journals/corr/'):
        self.counts['skip-arxiv-corr'] += 1
        return False

    title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
    if not title:
        self.counts['skip-title'] += 1
        return False
    # drop a single trailing period
    if title.endswith('.'):
        title = title[:-1]

    release_type = None
    release_stage = 'published'
    withdrawn_status = None

    # primary release_type detection: type of XML element, then prefix of key for granularity
    if dblp_type == 'article':
        release_type = 'article'
        if dblp_key_type == 'journals' and publtype != 'informal':
            release_type = 'article-journal'
        elif dblp_key_type == 'tr':
            release_type = 'report'
        elif title.startswith("Review:"):
            release_type = 'review'
    elif dblp_type == 'inproceedings':
        release_type = 'paper-conference'
    elif dblp_type == 'book':
        release_type = 'book'
    elif dblp_type == 'incollection':
        # XXX: part vs. chapter?
        release_type = 'chapter'
    elif dblp_type == 'data':
        release_type = 'dataset'
    elif dblp_type in ('mastersthesis', 'phdthesis'):
        release_type = 'thesis'

    # overrides/extensions of the above
    if publtype == 'informal':
        # for conferences, seems to indicate peer-review status
        # for journals, seems to indicate things like book reviews; split out above
        pass
    elif publtype == 'encyclopedia':
        release_type = 'entry-encyclopedia'
    elif publtype == 'edited':
        # XXX: article?
        release_type = 'editorial'
    elif publtype == 'data':
        release_type = 'dataset'
    elif publtype == 'software':
        release_type = 'software'
    elif publtype == 'withdrawn':
        # was misspelled 'widthdrawn', which never matched a real publtype
        withdrawn_status = 'withdrawn'
    elif publtype == 'survey':
        # XXX: flag as a review/survey article?
        pass

    #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)

    container_name = None
    booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
    series = clean_str(xml_elem.series and xml_elem.series.text)

    if xml_elem.journal:
        container_name = clean_str(xml_elem.journal.text)

    container_id = None
    if dblp_prefix:
        container_id = self.lookup_dblp_prefix(dblp_prefix)
        # note: we will skip later if couldn't find prefix

    publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
    volume = clean_str(xml_elem.volume and xml_elem.volume.text)
    issue = clean_str(xml_elem.number and xml_elem.number.text)
    pages = clean_str(xml_elem.pages and xml_elem.pages.text)
    release_year = clean_str(xml_elem.year and xml_elem.year.text)
    if release_year and release_year.isdigit():
        release_year = int(release_year)
    else:
        release_year = None
    release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
    isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
    part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)

    # block bogus far-future years/dates
    if release_year is not None and (release_year > (self.this_year + 5)
                                     or release_year < 1000):
        release_month = None
        release_year = None

    contribs = self.dblp_contribs(xml_elem or [])
    ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
    if isbn:
        ext_ids.isbn13 = isbn
    if ext_ids.doi:
        self.counts['has-doi'] += 1

    # dblp-specific extra
    dblp_extra = dict(type=dblp_type)
    note = clean_str(xml_elem.note and xml_elem.note.text)
    # notes mentioning base-search.net are not stored
    if note and 'base-search.net' not in note:
        dblp_extra['note'] = note
    if part_of_key:
        dblp_extra['part_of_key'] = part_of_key

    # generic extra
    extra = dict()
    if not container_id and container_name:
        extra['container_name'] = container_name

    if series and (dblp_key_type == 'series' or dblp_type == 'book'):
        extra['series-title'] = series
    elif series:
        dblp_extra['series'] = series

    if booktitle and dblp_key_type == 'series':
        extra['container-title'] = booktitle
    elif booktitle and dblp_key_type == 'conf':
        extra['event'] = booktitle
    elif booktitle:
        dblp_extra['booktitle'] = booktitle

    if release_year and release_month:
        # TODO: release_month schema migration
        extra['release_month'] = release_month

    if dblp_extra:
        extra['dblp'] = dblp_extra
    if not extra:
        extra = None

    # named 'release' (not 're') to avoid shadowing the stdlib module name
    release = fatcat_openapi_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        release_type=release_type,
        release_stage=release_stage,
        withdrawn_status=withdrawn_status,
        title=title,
        release_year=release_year,
        #release_date,
        publisher=publisher,
        ext_ids=ext_ids,
        contribs=contribs,
        volume=volume,
        issue=issue,
        pages=pages,
        extra=extra,
    )
    release = self.biblio_hacks(release)

    if self.dump_json_mode:
        # dump mode: print the entity plus dblp URL/prefix hints, import nothing
        re_dict = entity_to_dict(release, api_client=self.api.api_client)
        re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
        re_dict['_dblp_prefix'] = dblp_prefix
        print(json.dumps(re_dict, sort_keys=True))
        return False

    if not release.container_id:
        self.counts["skip-dblp-container-missing"] += 1
        return False
    return release