Example #1
    def doaj_ext_ids(
            self, identifiers: List[dict],
            doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
        """
        bibjson.identifier {
            id (string),
            type (string)
        }
        """

        assert doaj_article_id.isalnum() and len(doaj_article_id) == 32

        doi: Optional[str] = None
        pmid: Optional[str] = None
        pmcid: Optional[str] = None
        for id_obj in identifiers:
            if not id_obj.get("id"):
                continue
            if id_obj["type"].lower() == "doi":
                doi = clean_doi(id_obj["id"])
            elif id_obj["type"].lower() == "pmid":
                pmid = clean_pmid(id_obj["id"])
            elif id_obj["type"].lower() == "pmcid":
                pmcid = clean_pmcid(id_obj["id"])

        return fatcat_openapi_client.ReleaseExtIds(
            doaj=doaj_article_id,
            doi=doi,
            pmid=pmid,
            pmcid=pmcid,
        )
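
A minimal usage sketch (hypothetical: importer stands for any instance of the importer class that defines this method; the ids are made up):

identifiers = [
    {"type": "DOI", "id": "10.1234/example"},
    {"type": "pmid", "id": "12345678"},
]
# DOAJ article ids are 32-char alphanumeric strings (asserted above)
ext_ids = importer.doaj_ext_ids(identifiers, "a" * 32)
print(ext_ids.doi, ext_ids.pmid, ext_ids.doaj)
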
Example #2
    def dblp_ext_ids(self, xml_elem: Any,
                     dblp_key: str) -> fatcat_openapi_client.ReleaseExtIds:
        """
        Takes a full XML object and returns external identifiers.

        Currently these can be arxiv identifiers, DOIs, or Wikidata QIDs

        - ee (electronic edition; often DOI?)
            => in some cases a "local" URL
            => publisher URL; often DOI
            => type attr
        - url
            => dblp internal link to table-of-contents
        """

        doi: Optional[str] = None
        wikidata_qid: Optional[str] = None
        arxiv_id: Optional[str] = None
        hdl: Optional[str] = None
        for ee in xml_elem.find_all("ee"):
            url = ee.text
            # convert DOI-like domains, which mostly have DOIs anyways
            if "://doi.acm.org/" in url:
                url = url.replace("://doi.acm.org/", "://doi.org/")
            elif "://doi.ieeecomputersociety.org/" in url:
                url = url.replace("://doi.ieeecomputersociety.org/",
                                  "://doi.org/")

            if "doi.org/10." in url and not doi:
                doi = clean_doi(url)
            elif "wikidata.org/entity/Q" in url and not wikidata_qid:
                wikidata_qid = clean_wikidata_qid(url)
            elif "://arxiv.org/abs/" in url and not arxiv_id:
                arxiv_id = url.replace("http://", "").replace(
                    "https://", "").replace("arxiv.org/abs/", "")
                arxiv_id = clean_arxiv_id(arxiv_id)
            elif "://hdl.handle.net" in url and not hdl:
                hdl = clean_hdl(url)

        return fatcat_openapi_client.ReleaseExtIds(
            dblp=dblp_key,
            doi=doi,
            wikidata_qid=wikidata_qid,
            arxiv=arxiv_id,
            hdl=hdl,
        )
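
A hypothetical driver, assuming the dblp XML is parsed with BeautifulSoup (which the find_all() calls suggest); the key and DOI below are made up:

from bs4 import BeautifulSoup

xml_elem = BeautifulSoup(
    "<article key='journals/example/Doe20'>"
    "<ee>https://doi.org/10.1234/example</ee></article>", "xml").article
ext_ids = importer.dblp_ext_ids(xml_elem, "journals/example/Doe20")
print(ext_ids.doi, ext_ids.dblp)
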
Example #3
    def parse_grobid_json(self, obj):

        if not obj.get('title'):
            return None

        extra_grobid = dict()

        abstract = obj.get('abstract')
        if abstract and 10 < len(abstract) < MAX_ABSTRACT_BYTES:
            abobj = fatcat_openapi_client.ReleaseAbstract(
                mimetype="text/plain", content=clean(abstract))
            abstracts = [abobj]
        else:
            abstracts = None

        contribs = []
        for i, a in enumerate(obj.get('authors', [])):
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    index=i,
                    raw_name=clean(a['name']),
                    given_name=clean(a.get('given_name')),
                    surname=clean(a.get('surname')),
                    role="author",
                    extra=None))

        refs = []
        for raw in obj.get('citations', []):
            cite_extra = dict()
            year = None
            if raw.get('date'):
                try:
                    year = int(raw['date'].strip()[:4])
                except (AttributeError, ValueError):
                    pass
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = clean(raw[key])
            if raw.get('authors'):
                cite_extra['authors'] = [
                    clean(a['name']) for a in raw['authors']
                ]

            if not cite_extra:
                cite_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(key=clean(raw.get('id')),
                                                 year=year,
                                                 title=clean(raw['title']),
                                                 extra=cite_extra))

        release_date = None
        release_year = None
        if obj.get('date'):
            # only returns year, ever?
            release_year = int(obj['date'][:4])

        extra = dict()
        if obj.get('doi'):
            extra['doi'] = obj['doi']
        if obj['journal'] and obj['journal'].get('name'):
            extra['container_name'] = clean(obj['journal']['name'])

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        if extra_grobid:
            extra['grobid'] = extra_grobid
        if self.longtail_oa:
            extra['longtail_oa'] = True
        if not extra:
            extra = None

        title = clean(obj['title'], force_xml=True)
        if not title or len(title) < 2:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            title=title,
            release_type="article-journal",
            release_date=release_date,
            release_year=release_year,
            contribs=contribs,
            refs=refs,
            publisher=clean(obj['journal'].get('publisher')),
            volume=clean(obj['journal'].get('volume')),
            issue=clean(obj['journal'].get('issue')),
            abstracts=abstracts,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            extra=extra)
        return re
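
An illustrative GROBID-style input (hypothetical values). Note that the method reads obj['journal'] directly, so that key must be present, even if empty:

grobid_doc = {
    "title": "An Example Paper",
    "authors": [{"name": "Jane Doe", "given_name": "Jane", "surname": "Doe"}],
    "citations": [],
    "journal": {"name": "Example Journal", "publisher": "Example Press"},
}
release = importer.parse_grobid_json(grobid_doc)
print(release.title, release.extra)
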
Example #4
    def parse_record(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a ReleaseEntity
        """

        # Ways to be out of scope (provisionally)
        # journal-issue and journal-volume map to None, but allowed for now
        if obj.get('type') in (None, 'journal', 'proceedings',
                               'standard-series', 'report-series',
                               'book-series', 'book-set', 'book-track',
                               'proceedings-series'):
            return None

        # Require the 'title' key to exist, as release entities do
        if ('title' not in obj) or (not obj['title']):
            return None

        release_type = self.map_release_type(obj['type'])

        # contribs
        def do_contribs(obj_list, ctype):
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am.keys():
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: can end up empty
                    raw_name = am.get('name') or am.get('given')
                extra = dict()
                if ctype == "author":
                    index = i
                else:
                    index = None
                raw_affiliation = None
                if am.get('affiliation'):
                    if len(am.get('affiliation')) > 0:
                        raw_affiliation = am.get('affiliation')[0]['name']
                    if len(am.get('affiliation')) > 1:
                        # note: affiliation => more_affiliations
                        extra['more_affiliations'] = [
                            clean(a['name']) for a in am.get('affiliation')[1:]
                        ]
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['seq'] = clean(am.get('sequence'))
                if not extra:
                    extra = None
                assert ctype in ("author", "editor", "translator")
                raw_name = clean(raw_name)
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(
                        creator_id=creator_id,
                        index=index,
                        raw_name=raw_name,
                        given_name=clean(am.get('given')),
                        surname=clean(am.get('family')),
                        raw_affiliation=clean(raw_affiliation),
                        role=ctype,
                        extra=extra))
            return contribs

        contribs = do_contribs(obj.get('author', []), "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        issn = obj.get('ISSN', [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = clean(obj.get('publisher'))

        container_name = obj.get('container-title')
        if container_name:
            container_name = clean(container_name[0], force_xml=True)
        if not container_name:
            container_name = None
        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            ce = fatcat_openapi_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=container_name)
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        # license slug
        license_slug = None
        license_extra = []
        for lic in obj.get('license', []):
            if lic['content-version'] not in ('vor', 'unspecified'):
                continue
            slug = lookup_license_slug(lic['URL'])
            if slug:
                license_slug = slug
            if 'start' in lic:
                lic['start'] = lic['start']['date-time']
            license_extra.append(lic)

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # TODO: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                year = None
            ref_extra = dict()
            key = rm.get('key')
            if key and key.startswith(obj['DOI'].upper()):
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            ref_container_name = rm.get('volume-title')
            if not ref_container_name:
                ref_container_name = rm.get('journal-title')
            elif rm.get('journal-title'):
                ref_extra['journal-title'] = rm['journal-title']
            if rm.get('DOI'):
                ref_extra['doi'] = rm.get('DOI').lower()
            author = clean(rm.get('author'))
            if author:
                ref_extra['authors'] = [author]
            for k in ('editor', 'edition', 'authority', 'version', 'genre',
                      'url', 'event', 'issue', 'volume', 'date',
                      'accessed_date', 'issued', 'page', 'medium',
                      'collection_title', 'chapter_number', 'unstructured',
                      'series-title', 'volume-title'):
                if clean(rm.get(k)):
                    ref_extra[k] = clean(rm[k])
            if not ref_extra:
                ref_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=i,
                    # doing lookups would be a second import pass
                    target_release_id=None,
                    key=key,
                    year=year,
                    container_name=clean(ref_container_name),
                    title=clean(rm.get('article-title')),
                    locator=clean(rm.get('first-page')),
                    # TODO: just dump JSON somewhere here?
                    extra=ref_extra))

        # abstracts
        abstracts = []
        abstract = clean(obj.get('abstract'))
        if abstract and len(abstract) > 10:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    mimetype="application/xml+jats", content=abstract))

        # extra fields
        extra = dict()
        extra_crossref = dict()
        # top-level extra keys
        if not container_id:
            if obj.get('container-title'):
                extra['container_name'] = container_name
        for key in ('group-title',):  # single-element tuple, not a bare string
            val = obj.get(key)
            if val:
                if isinstance(val, list):
                    val = val[0]
                if isinstance(val, str):
                    val = clean(val)
                    if val:
                        extra[key] = val
                else:
                    extra[key] = val
        # crossref-nested extra keys
        for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
            val = obj.get(key)
            if val:
                if isinstance(val, str):
                    extra_crossref[key] = clean(val)
                else:
                    extra_crossref[key] = val
        if license_extra:
            extra_crossref['license'] = license_extra

        if len(obj['title']) > 1:
            aliases = [clean(t) for t in obj['title'][1:]]
            aliases = [t for t in aliases if t]
            if aliases:
                extra['aliases'] = aliases

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                           'dissertation', 'book-chapter'):
            release_stage = "published"
        else:
            # unknown
            release_stage = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # filter out unreasonably huge releases
        if len(abstracts) > 100:
            return None
        if len(refs) > 2000:
            return None

        # release date parsing is amazingly complex
        raw_date = obj['issued']['date-parts'][0]
        if not raw_date or not raw_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_year = None
            release_date = None
        elif len(raw_date) == 3:
            release_year = raw_date[0]
            release_date = datetime.date(year=raw_date[0],
                                         month=raw_date[1],
                                         day=raw_date[2])
        else:
            # sometimes only the year is included, not the full date
            release_year = raw_date[0]
            release_date = None

        original_title = None
        if obj.get('original-title'):
            original_title = clean(obj.get('original-title')[0],
                                   force_xml=True)

        title = None
        if obj.get('title'):
            title = clean(obj.get('title')[0], force_xml=True)
            if not title or len(title) <= 1:
                # title can't be just a single character
                return None

        subtitle = None
        if obj.get('subtitle'):
            subtitle = clean(obj.get('subtitle')[0], force_xml=True)
            if not subtitle or len(subtitle) <= 1:
                # subtitle can't be just a single character
                return None

        if extra_crossref:
            extra['crossref'] = extra_crossref
        if not extra:
            extra = None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            title=title,
            subtitle=subtitle,
            original_title=original_title,
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            publisher=publisher,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=obj['DOI'].lower(),
                pmid=extids['pmid'],
                pmcid=extids['pmcid'],
                wikidata_qid=extids['wikidata_qid'],
                isbn13=isbn13,
                core=extids['core_id'],
                arxiv=extids['arxiv_id'],
                jstor=extids['jstor_id'],
            ),
            volume=clean(obj.get('volume')),
            issue=clean(obj.get('issue')),
            pages=clean(obj.get('page')),
            language=clean(obj.get('language')),
            license_slug=license_slug,
            extra=extra,
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
        )
        return re
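
A minimal Crossref-style record that clears the guards (hypothetical; real records carry many more fields, and lookup_ext_ids/issn2issnl are importer-internal lookups assumed to tolerate sparse input):

record = {
    "type": "journal-article",
    "title": ["An Example Article"],
    "DOI": "10.1234/example",
    "issued": {"date-parts": [[2020, 1, 15]]},
}
release = importer.parse_record(record)
print(release.release_date, release.ext_ids.doi)
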
Example #5
    def parse_record(self, a):

        medline = a.MedlineCitation
        # PubmedData isn't required by DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        identifiers = pubmed.ArticleIdList
        pmid = medline.PMID.string.strip()
        doi = identifiers.find("ArticleId", IdType="doi")
        if doi and doi.string:
            doi = clean_doi(doi.string)
        else:
            doi = None

        pmcid = identifiers.find("ArticleId", IdType="pmc")
        if pmcid:
            pmcid = clean_pmcid(pmcid.string.strip().upper())

        release_type = None
        pub_types = []
        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
            pub_types.append(pub_type.string)
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                break
        if pub_types:
            extra_pubmed['pub_types'] = pub_types
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                if retraction_of.RefSource:
                    extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
                if retraction_of.PMID:
                    extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

        # everything in medline is published
        release_stage = "published"
        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
            release_stage = "updated"
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_stage = "retraction"

        withdrawn_status = None
        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
            withdrawn_status = "retracted"
        elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
            withdrawn_status = "concern"

        pages = medline.find('MedlinePgn')
        if pages:
            pages = pages.string

        title = medline.Article.ArticleTitle.string # always present
        if title:
            if title.endswith('.'):
                title = title[:-1]
            # this hides some "special" titles, but the vast majority are
            # translations; translations don't always include the original_title
            if title.startswith('[') and title.endswith(']'):
                title = title[1:-1]
        else:
            # will filter out later
            title = None

        original_title = medline.Article.find("VernacularTitle", recursive=False)
        if original_title:
            original_title = original_title.string or None
            if original_title and original_title.endswith('.'):
                original_title = original_title[:-1]

        # TODO: happening in alpha order, not handling multi-language well.
        language = medline.Article.Language
        if language:
            language = language.string
            if language in ("und", "un"):
                # "undetermined"
                language = None
            else:
                language = LANG_MAP_MARC.get(language)
                if not language and medline.Article.Language.string not in LANG_MAP_MARC:
                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        issnl = None
        container_id = None
        container_name = None
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            country_name = mji.Country.string.strip()
            country_code = COUNTRY_NAME_MAP.get(country_name)
            if country_code:
                container_extra['country'] = country_code
            elif country_name:
                container_extra['country_name'] = country_name
        if mji.find("ISSNLinking"):
            issnl = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            container_extra['issnp'] = issnp.string
        if not issnl and issnp:
            issnl = self.issn2issnl(issnp.string)

        if issnl:
            container_id = self.lookup_issnl(issnl)

        pub_date = medline.Article.find('ArticleDate')
        if not pub_date:
            pub_date = journal.PubDate
        if not pub_date:
            pub_date = journal.JournalIssue.PubDate
        release_date = None
        release_year = None
        if pub_date.Year:
            release_year = int(pub_date.Year.string)
            if pub_date.find("Day") and pub_date.find("Month"):
                try:
                    release_date = datetime.date(
                        release_year,
                        MONTH_ABBR_MAP[pub_date.Month.string],
                        int(pub_date.Day.string))
                    release_date = release_date.isoformat()
                except ValueError as ve:
                    print("bad date, skipping: {}".format(ve), file=sys.stderr)
                    release_date = None
        elif pub_date.MedlineDate:
            medline_date = pub_date.MedlineDate.string.strip()
            if len(medline_date) >= 4 and medline_date[:4].isdigit():
                release_year = int(medline_date[:4])
                if release_year < 1300 or release_year > 2040:
                    print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
                    release_year = None
            else:
                print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)

        if journal.find("Title"):
            container_name = journal.Title.string

        if (container_id is None and self.create_containers and (issnl is not None)
                and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type='journal',
                #NOTE: publisher not included
                issnl=issnl,
                extra=(container_extra or None))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string

        ### Abstracts
        # "All abstracts are in English"
        abstracts = []
        primary_abstract = medline.find("Abstract")
        if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
            joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=joined,
                mimetype="text/plain",
                lang="en",
            )
            if abst.content:
                abstracts.append(abst)
        elif primary_abstract:
            for abstract in primary_abstract.find_all("AbstractText"):
                abst = fatcat_openapi_client.ReleaseAbstract(
                    content=abstract.get_text().strip(),
                    mimetype="text/plain",
                    lang="en",
                )
                if abst.content:
                    abstracts.append(abst)
                if abstract.find('math'):
                    abst = fatcat_openapi_client.ReleaseAbstract(
                        # strip the <AbstractText> tags
                        content=str(abstract)[14:-15],
                        mimetype="application/mathml+xml",
                        lang="en",
                    )
                    if abst.content:
                        abstracts.append(abst)
        other_abstracts = medline.find_all("OtherAbstract")
        for other in other_abstracts:
            lang = "en"
            if other.get('Language'):
                lang = LANG_MAP_MARC.get(other['Language'])
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=other.AbstractText.get_text().strip(),
                mimetype="text/plain",
                lang=lang,
            )
            if abst.content:
                abstracts.append(abst)
        if not abstracts:
            abstracts = None

        ### Contribs
        contribs = []
        if medline.AuthorList:
            for author in medline.AuthorList.find_all("Author"):
                creator_id = None
                given_name = None
                surname = None
                raw_name = None
                if author.ForeName:
                    given_name = author.ForeName.string
                if author.LastName:
                    surname = author.LastName.string
                if given_name and surname:
                    raw_name = "{} {}".format(given_name, surname)
                elif surname:
                    raw_name = surname
                if not raw_name and author.CollectiveName and author.CollectiveName.string:
                    raw_name = author.CollectiveName.string
                contrib_extra = dict()
                orcid = author.find("Identifier", Source="ORCID")
                if orcid:
                    # needs re-formatting from, eg, "0000000179841889"
                    orcid = orcid.string
                    if orcid.startswith("http://orcid.org/"):
                        orcid = orcid.replace("http://orcid.org/", "")
                    elif orcid.startswith("https://orcid.org/"):
                        orcid = orcid.replace("https://orcid.org/", "")
                    elif '-' not in orcid:
                        orcid = "{}-{}-{}-{}".format(
                            orcid[0:4],
                            orcid[4:8],
                            orcid[8:12],
                            orcid[12:16],
                        )
                    creator_id = self.lookup_orcid(orcid)
                    contrib_extra['orcid'] = orcid
                affiliations = author.find_all("Affiliation")
                raw_affiliation = None
                if affiliations:
                    raw_affiliation = affiliations[0].string
                    if len(affiliations) > 1:
                        contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
                if author.find("EqualContrib"):
                    # TODO: schema for this?
                    contrib_extra['equal'] = True
                contribs.append(fatcat_openapi_client.ReleaseContrib(
                    raw_name=raw_name,
                    given_name=given_name,
                    surname=surname,
                    role="author",
                    raw_affiliation=raw_affiliation,
                    creator_id=creator_id,
                    extra=contrib_extra,
                ))

            if medline.AuthorList['CompleteYN'] == 'N':
                contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i
        if not contribs:
            contribs = None

        ### References
        refs = []
        if pubmed.ReferenceList:
            for ref in pubmed.ReferenceList.find_all('Reference'):
                ref_extra = dict()
                ref_doi = ref.find("ArticleId", IdType="doi")
                if ref_doi:
                    ref_doi = clean_doi(ref_doi.string)
                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                if ref_pmid:
                    ref_pmid = clean_pmid(ref_pmid.string)
                ref_release_id = None
                if ref_doi:
                    ref_extra['doi'] = ref_doi
                    if self.lookup_refs:
                        ref_release_id = self.lookup_doi(ref_doi)
                if ref_pmid:
                    ref_extra['pmid'] = ref_pmid
                    if self.lookup_refs:
                        ref_release_id = self.lookup_pmid(ref_pmid)
                ref_raw = ref.Citation
                if ref_raw:
                    ref_extra['unstructured'] = ref_raw.string
                if not ref_extra:
                    ref_extra = None
                refs.append(fatcat_openapi_client.ReleaseRef(
                    target_release_id=ref_release_id,
                    extra=ref_extra,
                ))
        if not refs:
            refs = None

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_pubmed:
            extra['pubmed'] = extra_pubmed
        if not extra:
            extra = None

        title = clean(title)
        if not title:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean(original_title),
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            withdrawn_status=withdrawn_status,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=pmid,
                pmcid=pmcid,
                #isbn13     # never in Article
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            #publisher  # not included?
            language=language,
            #license_slug   # not in MEDLINE
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            extra=extra,
        )
        return re
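
A hypothetical driver loop, assuming the MEDLINE/PubMed XML is parsed with BeautifulSoup (the attribute-style tag access suggests bs4 with an XML parser):

from bs4 import BeautifulSoup

with open("pubmed_batch.xml") as f:
    soup = BeautifulSoup(f, "xml")
for article in soup.find_all("PubmedArticle"):
    release = importer.parse_record(article)
    if release:
        print(release.ext_ids.pmid, release.title)
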
Example #6
    def parse_record(self, article):

        journal_meta = article.front.find("journal-meta")
        article_meta = article.front.find("article-meta")

        extra = dict()
        extra_jstor = dict()

        release_type = JSTOR_TYPE_MAP.get(article['article-type'])
        title = article_meta.find("article-title")
        if title and title.string:
            title = title.string.strip()
        elif title and not title.string:
            title = None

        if (not title and release_type and release_type.startswith('review')
                and article_meta.product and article_meta.product.source):
            title = "Review: {}".format(article_meta.product.source.string)

        if not title:
            return None

        if title.endswith('.'):
            title = title[:-1]

        if "[Abstract]" in title:
            # TODO: strip the "[Abstract]" bit?
            release_type = "abstract"
        elif "[Editorial" in title:
            release_type = "editorial"
        elif "[Letter" in title:
            release_type = "letter"
        elif "[Poem" in title or "[Photograph" in title:
            release_type = None

        if title.startswith("[") and title.endswith("]"):
            # strip brackets if that is all that is there (eg, translation or non-english)
            title = title[1:-1]

        # JSTOR journal-id
        journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
        if journal_ids:
            extra_jstor['journal_ids'] = journal_ids

        journal_title = journal_meta.find("journal-title").string
        publisher = journal_meta.find("publisher-name").string
        issn = journal_meta.find("issn")
        if issn:
            issn = issn.string
            if len(issn) == 8:
                issn = "{}-{}".format(issn[0:4], issn[4:8])
            else:
                assert len(issn) == 9

        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)

        # create container if it doesn't exist
        if (container_id is None and self.create_containers
                and (issnl is not None) and journal_title):
            ce = fatcat_openapi_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=clean(journal_title, force_xml=True))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        doi = article_meta.find("article-id", {"pub-id-type": "doi"})
        if doi:
            doi = doi.string.lower().strip()

        jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
        if jstor_id:
            jstor_id = jstor_id.string.strip()
        if not jstor_id and doi:
            assert doi.startswith('10.2307/')
            jstor_id = doi.replace('10.2307/', '')
        assert jstor_id and int(jstor_id)

        contribs = []
        cgroup = article_meta.find("contrib-group")
        if cgroup:
            for c in cgroup.find_all("contrib"):
                given = c.find("given-names")
                if given:
                    given = clean(given.string)
                surname = c.find("surname")
                if surname:
                    surname = clean(surname.string)
                raw_name = c.find("string-name")
                if raw_name:
                    raw_name = clean(raw_name.string)

                if not raw_name:
                    if given and surname:
                        raw_name = "{} {}".format(given, surname)
                    elif surname:
                        raw_name = surname

                role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
                if not role and c.get('contrib-type'):
                    sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(
                        c['contrib-type']))
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(
                        role=role,
                        raw_name=raw_name,
                        given_name=given,
                        surname=surname,
                    ))

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i

        release_year = None
        release_date = None
        pub_date = article_meta.find('pub-date')
        if pub_date and pub_date.year:
            release_year = int(pub_date.year.string)
            if pub_date.month and pub_date.day:
                release_date = datetime.date(release_year,
                                             int(pub_date.month.string),
                                             int(pub_date.day.string))
                if release_date.day == 1 and release_date.month == 1:
                    # suspect jan 1st dates get set by JSTOR when actual
                    # date not known (citation needed), so drop them
                    release_date = None

        volume = None
        if article_meta.volume:
            volume = article_meta.volume.string or None

        issue = None
        if article_meta.issue:
            issue = article_meta.issue.string or None

        pages = None
        if article_meta.find("page-range"):
            pages = article_meta.find("page-range").string
        elif article_meta.fpage:
            pages = article_meta.fpage.string

        language = None
        cm = article_meta.find("custom-meta")
        if (cm and cm.find("meta-name")
                and cm.find("meta-name").string == "lang"):
            language = cm.find("meta-value").string.split()[0]
            language = LANG_MAP_MARC.get(language)
            if not language:
                warnings.warn("MISSING MARC LANG: {}".format(
                    cm.find("meta-value").string))

        # JSTOR issue-id
        if article_meta.find('issue-id'):
            issue_id = clean(article_meta.find('issue-id').string)
            if issue_id:
                extra_jstor['issue_id'] = issue_id

        # everything in JSTOR is published
        release_stage = "published"

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_jstor:
            extra['jstor'] = extra_jstor
        if not extra:
            extra = None

        re = fatcat_openapi_client.ReleaseEntity(
            #work_id
            title=title,
            #original_title
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                jstor=jstor_id,
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=language,
            #license_slug

            # content, mimetype, lang
            #abstracts=abstracts,
            contribs=contribs,

            # key, year, container_name, title, locator
            # extra: volume, authors, issue, publisher, identifiers
            #refs=refs,

            #   name, type, publisher, issnl
            #   extra: issnp, issne, original_name, languages, country
            container_id=container_id,
            extra=extra,
        )
        return re
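
Hypothetical usage with a JATS-style JSTOR XML file, again assuming BeautifulSoup:

from bs4 import BeautifulSoup

with open("jstor_article.xml") as f:
    soup = BeautifulSoup(f, "xml")
release = importer.parse_record(soup.article)
if release:
    print(release.ext_ids.jstor, release.title)
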
Example #7
    def parse_record(self, obj):
        """
        Mapping datacite JSON to ReleaseEntity.
        """
        if not obj or not isinstance(obj, dict):
            return None
        if 'attributes' not in obj:
            return None

        attributes = obj['attributes']
        doi = clean_doi(attributes.get('doi', '').lower())

        if not doi:
            print('skipping record without a DOI', file=sys.stderr)
            return None

        if not isascii(doi):
            print('[{}] skipping non-ascii doi for now'.format(doi),
                  file=sys.stderr)
            return None

        creators = attributes.get('creators', []) or []
        contributors = attributes.get('contributors', []) or []  # much fewer than creators

        contribs = self.parse_datacite_creators(
            creators, doi=doi) + self.parse_datacite_creators(
                contributors, role=None, set_index=False, doi=doi)

        # Title, may come with "attributes.titles[].titleType", like
        # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
        titles = attributes.get('titles', []) or []
        title, original_language_title, subtitle = parse_datacite_titles(
            titles)

        if title is None:
            print('[{}] skipping record w/o title: {}'.format(doi, obj),
                  file=sys.stderr)
            return None

        title = clean(title)
        if not title:
            print('[{}] skipping record w/o title: {}'.format(doi, obj),
                  file=sys.stderr)
            return None

        if not subtitle:
            subtitle = None
        else:
            subtitle = clean(subtitle)

        # Dates. A few internal dates (registered, created, updated) and
        # published (0..2554). We try to work with typed date list, in
        # "attributes.dates[].dateType", values: "Accepted", "Available"
        # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
        # "Updated", "Valid".
        release_date, release_month, release_year = parse_datacite_dates(
            attributes.get('dates', []))

        # Some records do not use the "dates" field (e.g. micropub), but:
        # "attributes.published" or "attributes.publicationYear"
        if not any((release_date, release_month, release_year)):
            release_date, release_month, release_year = parse_single_date(
                attributes.get('publicationYear'))
            if not any((release_date, release_month, release_year)):
                release_date, release_month, release_year = parse_single_date(
                    attributes.get('published'))

        if not any((release_date, release_month, release_year)):
            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)

        # Start with clear stages, e.g. published. TODO(martin): we could
        # probably infer a bit more from the relations, e.g.
        # "IsPreviousVersionOf" or "IsNewVersionOf".
        release_stage = 'published'

        # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
        # we might want something else than 'published'. See also:
        # https://support.datacite.org/docs/doi-states.

        # Publisher. A few NA values. A few bogus values.
        publisher = attributes.get('publisher')

        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
            publisher = None
            release_stage = None
        if publisher is not None and len(publisher) > 80:
            # Arbitrary magic value max length. TODO(martin): better heuristic,
            # but factored out; first we have to log misses. Example:
            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
            # werden"
            publisher = None

        if publisher:
            publisher = clean(publisher)

        # Container. For the moment, only ISSN as container.
        container_id = None
        container_name = None

        container = attributes.get('container', {}) or {}
        if container.get('type') in CONTAINER_TYPE_MAP:
            container_type = CONTAINER_TYPE_MAP[container['type']]
            if (container.get('identifier')
                    and container.get('identifierType') == 'ISSN'):
                issn = container.get('identifier')
                if len(issn) == 8:
                    issn = issn[:4] + "-" + issn[4:]
                issnl = self.issn2issnl(issn)
                if issnl is not None:
                    container_id = self.lookup_issnl(issnl)

                    if container_id is None and container.get('title'):
                        container_name = container.get('title')
                        if isinstance(container_name, list):
                            if len(container_name) > 1:
                                print('[{}] too many container titles: {}'.
                                      format(doi, len(container_name)))
                            if container_name:
                                container_name = container_name[0]
                        assert isinstance(container_name, str)
                        ce = fatcat_openapi_client.ContainerEntity(
                            issnl=issnl,
                            container_type=container_type,
                            name=container_name,
                        )
                        ce_edit = self.create_container(ce)
                        container_id = ce_edit.ident
                        self._issnl_id_map[issnl] = container_id
                else:
                    # TODO(martin): factor this out into a testable function.
                    # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
                    container_name = container.get('title')
                    if isinstance(container_name, list):
                        if len(container_name) > 1:
                            print('[{}] too many container titles: {}'.format(
                                doi, len(container_name)))
                        if container_name:
                            container_name = container_name[0]

        # Exception: https://www.micropublication.org/, see: !MR24.
        if container_id is None and container_name is None:
            if publisher and publisher.lower().startswith('micropublication'):
                container_name = publisher

        # Volume and issue.
        volume = container.get('volume')
        issue = container.get('issue')

        if volume:
            volume = clean(volume)

        if issue:
            issue = clean(issue)

        # Pages.
        pages = None

        first_page = container.get('firstPage')
        last_page = container.get('lastPage')

        if first_page and last_page:
            try:
                # check that both pages parse as integers before combining
                _ = int(first_page) < int(last_page)
                pages = '{}-{}'.format(first_page, last_page)
            except ValueError as err:
                # TODO(martin): This is more debug than info.
                # print('[{}] {}'.format(doi, err), file=sys.stderr)
                pass

        if not pages and first_page:
            pages = first_page

        # License.
        license_slug = None
        license_extra = []

        for rights in attributes.get('rightsList', []):
            slug = lookup_license_slug(rights.get('rightsUri'))
            if slug:
                license_slug = slug
            license_extra.append(rights)

        # Release type. Try to determine the release type from a variety of
        # types supplied in datacite. The "attributes.types.resourceType" is
        # uncontrolled (170000+ unique values, from "null", "Dataset" to
        # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
        # flows in 2009") citeproc may be the closest, but not always supplied.
        # Order lookup roughly by completeness of mapping.
        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex',
                         'resourceTypeGeneral'):
            value = attributes.get('types', {}).get(typeType)
            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
            if release_type is not None:
                break

        if release_type is None:
            print("[{}] no mapped type: {}".format(doi, value),
                  file=sys.stderr)

        # release_type exception: Global Biodiversity Information Facility
        # publishes highly interesting datasets, but titles are mostly the same
        # ("GBIF Occurrence Download" or "Occurrence Download"); set
        # release_type to "stub" (CSL/FC).
        if publisher == 'The Global Biodiversity Information Facility':
            release_type = 'stub'

        # release_type exception: lots of "Experimental Crystal Structure Determination"
        if publisher == 'Cambridge Crystallographic Data Centre':
            release_type = 'entry'

        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
        if title.lower().startswith('additional file'):
            release_type = 'stub'

        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
        # "other", ...). Try to crush it with langcodes: "It may sound to you
        # like langcodes solves a pretty boring problem. At one level, that's
        # right. Sometimes you have a boring problem, and it's great when a
        # library solves it for you." -- TODO(martin): We need more of these.
        language = None

        value = attributes.get('language', '') or ''
        try:
            language = pycountry.languages.lookup(value).alpha_2
        except (LookupError, AttributeError) as err:
            pass
            # TODO(martin): Print this on debug level, only.
            # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr)

        # Abstracts appear in "attributes.descriptions[].descriptionType", some
        # of the observed values: "Methods", "TechnicalInfo",
        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
        # "Other" fields might contain references or related articles (with
        # DOI). TODO(martin): maybe try to parse out some of those refs.
        abstracts = []
        descs = attributes.get('descriptions', []) or []
        for desc in descs:
            if not desc.get('descriptionType') == 'Abstract':
                continue

            # Description maybe a string or list.
            text = desc.get('description', '')
            if not text:
                continue
            if isinstance(text, list):
                try:
                    text = "\n".join(text)
                except TypeError as err:
                    continue  # Bail out, if it is not a list of strings.

            # Limit length.
            if len(text) < 10:
                continue
            if len(text) > MAX_ABSTRACT_LENGTH:
                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"

            # Detect language. This is fuzzy and may be removed, if too unreliable.
            lang = None
            try:
                lang = langdetect.detect(text)
            except (langdetect.lang_detect_exception.LangDetectException,
                    TypeError) as err:
                print('[{}] language detection failed with {} on {}'.format(
                    doi, err, text),
                      file=sys.stderr)
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    mimetype="text/plain",
                    content=clean(text),
                    lang=lang,
                ))

        # References and relations. Datacite include many relation types in
        # "attributes.relatedIdentifiers[].relationType", e.g.
        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
        # "IsDerivedFrom", "IsSourceOf".
        #
        # For the moment, we only care about References.
        refs, ref_index = [], 0

        relIds = attributes.get('relatedIdentifiers', []) or []
        for rel in relIds:
            if rel.get('relationType', '') not in ('References', 'Cites'):
                continue
            ref_extra = dict()
            if rel.get('relatedIdentifierType', '') == 'DOI':
                ref_extra['doi'] = rel.get('relatedIdentifier')
            if not ref_extra:
                ref_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=ref_index,
                    extra=ref_extra,
                ))
            ref_index += 1

        # More specific release_type via a 'Reviews' relationship.
        for rel in relIds:
            if rel.get('relationType', '') != 'Reviews':
                continue
            release_type = 'review'

        # Extra information.
        extra_datacite = dict()

        if license_extra:
            extra_datacite['license'] = license_extra
        if attributes.get('subjects'):
            extra_datacite['subjects'] = attributes['subjects']

        # Include version information.
        metadata_version = attributes.get('metadataVersion') or ''

        if metadata_version:
            extra_datacite['metadataVersion'] = metadata_version

        # Include resource types.
        types = attributes.get('types', {}) or {}
        resource_type = types.get('resourceType', '') or ''
        resource_type_general = types.get('resourceTypeGeneral', '') or ''

        if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
            extra_datacite['resourceType'] = resource_type
        if (resource_type_general and
                resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER):
            extra_datacite['resourceTypeGeneral'] = resource_type_general

        # Include certain relations from relatedIdentifiers. Keeping the
        # original structure of the data here, which is a list of dicts, with
        # relation type, identifier and identifier type (mostly).
        relations = []
        for rel in relIds:
            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
                                           'IsVariantFormOf', 'IsSupplementTo',
                                           'HasVersion', 'IsMetadataFor',
                                           'IsNewVersionOf', 'IsIdenticalTo',
                                           'IsVersionOf', 'IsDerivedFrom',
                                           'IsSourceOf'):
                relations.append(rel)

        if relations:
            extra_datacite['relations'] = relations

        extra = dict()

        # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
        # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
        # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st
        # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
        # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
        # "10161", "10010691", "10780", # "Presentación"
        version = attributes.get('version')

        # top-level extra keys
        if not container_id and container_name:
            extra['container_name'] = container_name

        # Always include datacite key, even if value is empty (dict).
        extra['datacite'] = extra_datacite

        # Preparation for a schema update.
        if release_month:
            extra['release_month'] = release_month

        extids = self.lookup_ext_ids(doi=doi)

        # Assemble release.
        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            release_type=release_type,
            release_stage=release_stage,
            title=title,
            subtitle=subtitle,
            original_title=original_language_title,
            release_year=release_year,
            release_date=release_date,
            publisher=publisher,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=extids['pmid'],
                pmcid=extids['pmcid'],
                wikidata_qid=extids['wikidata_qid'],
                core=extids['core_id'],
                arxiv=extids['arxiv_id'],
                jstor=extids['jstor_id'],
            ),
            contribs=contribs,
            volume=volume,
            issue=issue,
            pages=pages,
            language=language,
            abstracts=abstracts,
            refs=refs,
            extra=extra,
            license_slug=license_slug,
            version=version,
        )
        return re
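
A minimal datacite-API-style record (hypothetical) that clears the early guards; parse_datacite_* and lookup_ext_ids are helpers from the surrounding module and importer:

record = {
    "attributes": {
        "doi": "10.1234/example",
        "titles": [{"title": "An Example Dataset"}],
        "dates": [{"date": "2020-01-15", "dateType": "Issued"}],
        "types": {"resourceTypeGeneral": "Dataset"},
        "publisher": "Example Repository",
    }
}
release = importer.parse_record(record)
if release:
    print(release.release_type, release.ext_ids.doi)
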
Example #8
    def parse_record(self, record):
        """
        record is a beautiful soup object
        returns a ReleaseEntity, or None

        In JALC metadata, both English and Japanese records are given for most
        fields.
        """

        extra = dict()
        extra_jalc = dict()

        titles = record.find_all("title")
        if not titles:
            return None
        title = titles[0].string.strip()
        original_title = None
        if title.endswith('.'):
            title = title[:-1]
        if len(titles) > 1:
            original_title = titles[1].string.strip()
            if original_title.endswith('.'):
                original_title = original_title[:-1]

        doi = None
        if record.doi:
            doi = record.doi.string.lower().strip()
            if doi.startswith('http://dx.doi.org/'):
                doi = doi.replace('http://dx.doi.org/', '')
            elif doi.startswith('https://dx.doi.org/'):
                doi = doi.replace('https://dx.doi.org/', '')
            elif doi.startswith('http://doi.org/'):
                doi = doi.replace('http://doi.org/', '')
            elif doi.startswith('https://doi.org/'):
                doi = doi.replace('https://doi.org/', '')
            if not (doi.startswith('10.') and '/' in doi):
                sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
                doi = None
        if not doi:
            return None

        people = record.find_all("Person")
        contribs = parse_jalc_persons(people)

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i

        release_year = None
        release_date = None
        date = record.date or None
        if date:
            date = date.string
            if len(date) == 10:
                # note: 'date' is a plain string here, so parse it directly
                release_date = datetime.datetime.strptime(
                    date, DATE_FMT).date()
                release_year = release_date.year
                release_date = release_date.isoformat()
            elif len(date) == 4 and date.isdigit():
                release_year = int(date)

        pages = None
        if record.startingPage:
            pages = record.startingPage.string
            if record.endingPage:
                pages = "{}-{}".format(pages, record.endingPage.string)
        volume = None
        if record.volume:
            volume = record.volume.string
        issue = None
        if record.number:
            # note: number/issue transform
            issue = record.number.string

        # container
        issn = None
        issn_list = record.find_all("issn")
        if issn_list:
            # if we wanted the other ISSNs, would also need to uniq the list.
            # But we only need one to lookup ISSN-L/container
            issn = issn_list[0].string
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)

        publisher = None
        container_name = None
        container_extra = dict()

        if record.publicationName:
            pubs = [
                p.string.strip() for p in record.find_all("publicationName")
                if p.string
            ]
            pubs = [clean(p) for p in pubs if p]
            assert pubs
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # eng/jpn ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            container_name = clean(pubs[0])
            if len(pubs) > 1:
                container_extra['original_name'] = clean(pubs[1])

        if record.publisher:
            pubs = [
                p.string.strip() for p in record.find_all("publisher")
                if p.string
            ]
            pubs = [p for p in pubs if p]
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            if pubs:
                publisher = clean(pubs[0])
                if len(pubs) > 1:
                    container_extra['publisher_aliases'] = pubs[1:]

        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container_extra['country'] = 'jp'
            container_extra['languages'] = ['ja']
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type='journal',
                publisher=publisher,
                issnl=issnl,
                extra=(container_extra or None))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            # short-cut future imports in same batch
            self._issnl_id_map[issnl] = container_id

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        # external identifiers
        extids = self.lookup_ext_ids(doi=doi)

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        # always put at least an empty dict here to indicate the DOI registrar
        # (informally)
        extra['jalc'] = extra_jalc

        title = clean(title)
        if not title:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean(original_title),
            release_type="article-journal",
            release_stage='published',
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=extids['pmid'],
                pmcid=extids['pmcid'],
                wikidata_qid=extids['wikidata_qid'],
                core=extids['core_id'],
                arxiv=extids['arxiv_id'],
                jstor=extids['jstor_id'],
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            #license_slug
            container_id=container_id,
            contribs=contribs,
            extra=extra,
        )
        return re
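The DOI handling in this snippet strips four resolver prefixes and then sanity-checks the result. A hedged sketch, factored into a standalone helper for illustration (normalize_jalc_doi is a hypothetical name, not a function in the importer):

def normalize_jalc_doi(raw):
    doi = raw.lower().strip()
    for prefix in ('http://dx.doi.org/', 'https://dx.doi.org/',
                   'http://doi.org/', 'https://doi.org/'):
        if doi.startswith(prefix):
            doi = doi[len(prefix):]
            break
    if not (doi.startswith('10.') and '/' in doi):
        return None  # bogus DOI, as flagged above
    return doi

assert normalize_jalc_doi('https://doi.org/10.1234/ABC') == '10.1234/abc'
assert normalize_jalc_doi('not-a-doi') is None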
Beispiel #9
0
    def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:

        if not record:
            return None
        metadata = record.arXivRaw
        if not metadata:
            return None
        extra: Dict[str, Any] = dict()
        extra_arxiv: Dict[str, Any] = dict()

        # don't know!
        release_type = "article"

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
            if doi and not (doi.startswith("10.") and "/" in doi
                            and doi.split("/")[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
        title = latex_to_text(metadata.title.get_text().replace("\n", " "))
        authors = parse_arxiv_authors(metadata.authors.get_text().replace(
            "\n", " "))
        contribs = [
            fatcat_openapi_client.ReleaseContrib(index=i,
                                                 raw_name=a,
                                                 role="author")
            for i, a in enumerate(authors)
        ]

        lang: Optional[str] = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.get_text():
            comments = metadata.comments.get_text().replace("\n", " ").strip()
            extra_arxiv["comments"] = comments
            if "in french" in comments.lower():
                lang = "fr"
            elif "in spanish" in comments.lower():
                lang = "es"
            elif "in portuguese" in comments.lower():
                lang = "pt"
            elif "in hindi" in comments.lower():
                lang = "hi"
            elif "in japanese" in comments.lower():
                lang = "ja"
            elif "in german" in comments.lower():
                lang = "de"
            elif "simplified chinese" in comments.lower():
                lang = "zh"
            elif "in russian" in comments.lower():
                lang = "ru"
            # more languages?

        number = None
        if metadata.find("journal-ref") and metadata.find(
                "journal-ref").get_text():
            journal_ref = metadata.find("journal-ref").get_text().replace(
                "\n", " ").strip()
            extra_arxiv["journal_ref"] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(
            ):
                release_type = "paper-conference"
        if metadata.find("report-no") and metadata.find("report-no").string:
            number = metadata.find("report-no").string.strip()
            # at least some people plop extra metadata in here. hrmf!
            if "ISSN " in number or "ISBN " in number or len(
                    number.split()) > 2:
                extra_arxiv["report-no"] = number
                number = None
            else:
                release_type = "report"
        if metadata.find("acm-class") and metadata.find("acm-class").string:
            extra_arxiv["acm_class"] = metadata.find(
                "acm-class").string.strip()
        if metadata.categories and metadata.categories.get_text():
            extra_arxiv["categories"] = metadata.categories.get_text().split()
        license_slug = None
        if metadata.license and metadata.license.get_text():
            license_slug = lookup_license_slug(metadata.license.get_text())
        abstracts = None
        if metadata.abstract:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.get_text().strip()
            orig = None
            if "-----" in abst:
                both = abst.split("-----")
                abst = both[0].strip()
                orig = both[1].strip()
            if "$" in abst or "{" in abst:
                mime = "application/x-latex"
                abst_plain = latex_to_text(abst)
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(
                        content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
            if orig:
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                          mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == "en":
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv["base_id"] = base_id
        extra["superceded"] = True
        extra["arxiv"] = extra_arxiv

        versions = []
        for version in metadata.find_all("version"):
            arxiv_id = base_id + version["version"]
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # TODO: source_type?
            versions.append(
                ReleaseEntity(
                    work_id=None,
                    title=title,
                    # original_title
                    version=version["version"],
                    release_type=release_type,
                    release_stage="submitted",
                    release_date=release_date.isoformat(),
                    release_year=release_date.year,
                    ext_ids=fatcat_openapi_client.ReleaseExtIds(
                        arxiv=arxiv_id, ),
                    number=number,
                    language=lang,
                    license_slug=license_slug,
                    abstracts=abstracts,
                    contribs=contribs,
                    extra=extra.copy(),
                ))
        # TODO: assert that versions are actually in order?
        assert versions

        versions[-1].extra.pop("superceded")

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            if len(versions) > 1:
                versions[-1].release_stage = "accepted"
        return versions
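Each arXiv <version> element carries an RFC-822-style date and a label like "v2" that is appended to the base identifier; a small illustration of that handling, with made-up values:

import datetime

base_id = "0704.0001"
version_label = "v2"
raw_date = "Mon, 2 Apr 2007 19:18:42 GMT"

arxiv_id = base_id + version_label  # "0704.0001v2"
release_date = datetime.datetime.strptime(
    raw_date, "%a, %d %b %Y %H:%M:%S %Z").date()
assert release_date.isoformat() == "2007-04-02"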
Beispiel #10
0
    def parse_record(self, obj):
        """
        Mapping datacite JSON to ReleaseEntity.
        """
        if not obj or not isinstance(obj, dict):
            return None
        if 'attributes' not in obj:
            return None

        attributes = obj['attributes']
        doi = clean_doi(attributes.get('doi', '').lower())

        if not doi:
            print('skipping record without a DOI', file=sys.stderr)
            return

        if not str.isascii(doi):
            print('[{}] skipping non-ascii doi for now'.format(doi))
            return None

        creators = attributes.get('creators', []) or []
        contributors = attributes.get('contributors', []) or [
        ]  # Much fewer than creators.

        contribs = self.parse_datacite_creators(creators, doi=doi)

        # Beside creators, we have contributors in datacite. Sample:
        # ContactPerson, DataCollector, DataCurator, DataManager, Distributor,
        # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader,
        # ProjectMember, RelatedPerson, ResearchGroup, Researcher,
        # RightsHolder, Sponsor, Supervisor
        #
        # Datacite schema:
        # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32
        # -- could be used as a form of controlled vocab?
        #
        # Currently (07/2020) in release_contrib:
        #
        # select count(*), role from release_contrib group by role;
        #    count   |    role
        # -----------+------------
        #  500269665 | author
        #    4386563 | editor
        #      17871 | translator
        #   10870584 |
        # (4 rows)
        #
        # Related: https://guide.fatcat.wiki/entity_release.html -- role
        # (string, of a set): the type of contribution, from a controlled
        # vocabulary. TODO: vocabulary needs review.
        contribs_extra_contributors = self.parse_datacite_creators(
            contributors, set_index=False, doi=doi)

        # Unfortunately, creators and contributors might overlap, refs GH59.
        for cc in contribs_extra_contributors:
            if contributor_list_contains_contributor(contribs, cc):
                continue
            contribs.append(cc)

        # Title, may come with "attributes.titles[].titleType", like
        # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
        titles = attributes.get('titles', []) or []
        title, original_language_title, subtitle = parse_datacite_titles(
            titles)

        if title is None:
            print('[{}] skipping record w/o title: {}'.format(doi, obj),
                  file=sys.stderr)
            return False

        title = clean(title)
        if not title:
            print('[{}] skipping record w/o title: {}'.format(doi, obj),
                  file=sys.stderr)
            return False

        # check for blocklisted "spam", e.g. "FULL MOVIE"
        for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
            seen = set()
            for token in rule.get("tokens", []):
                if token in title.lower():
                    seen.add(token)
            if len(seen) >= rule.get("min"):
                print("[{}] skipping spammy title: {}".format(doi, obj),
                      file=sys.stderr)
                return False

        if not subtitle:
            subtitle = None
        else:
            subtitle = clean(subtitle)

        # Dates. A few internal dates (registered, created, updated) and
        # published (0..2554). We try to work with typed date list, in
        # "attributes.dates[].dateType", values: "Accepted", "Available"
        # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
        # "Updated", "Valid".
        release_date, release_month, release_year = parse_datacite_dates(
            attributes.get('dates', []))

        # block bogus far-future years/dates
        if release_year is not None and (release_year > (self.this_year + 5)
                                         or release_year < 1000):
            release_date = None
            release_month = None
            release_year = None

        # Some records do not use the "dates" field (e.g. micropub), but:
        # "attributes.published" or "attributes.publicationYear"
        if not any((release_date, release_month, release_year)):
            release_date, release_month, release_year = parse_single_date(
                attributes.get('publicationYear'))
            if not any((release_date, release_month, release_year)):
                release_date, release_month, release_year = parse_single_date(
                    attributes.get('published'))

        if not any((release_date, release_month, release_year)):
            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)

        # Start with clear stages, e.g. published. TODO(martin): we could
        # probably infer a bit more from the relations, e.g.
        # "IsPreviousVersionOf" or "IsNewVersionOf".
        release_stage = 'published'

        # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
        # we might want something else than 'published'. See also:
        # https://support.datacite.org/docs/doi-states.

        # Publisher. A few NA values. A few bogus values.
        publisher = attributes.get('publisher')

        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
            publisher = None
            release_stage = None
        if publisher is not None and len(publisher) > 80:
            # Arbitrary magic value max length. TODO(martin): better heuristic,
            # but factored out; first we have to log misses. Example:
            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
            # werden"
            publisher = None

        if publisher:
            publisher = clean(publisher)

        # Container. For the moment, only ISSN as container.
        container_id = None
        container_name = None

        container = attributes.get('container', {}) or {}
        if container.get('type') in CONTAINER_TYPE_MAP.keys():
            container_type = CONTAINER_TYPE_MAP.get(container['type'])
            if container.get('identifier') and container.get(
                    'identifierType') == 'ISSN':
                issn = container.get('identifier')
                if len(issn) == 8:
                    issn = issn[:4] + "-" + issn[4:]
                issnl = self.issn2issnl(issn)
                if issnl is not None:
                    container_id = self.lookup_issnl(issnl)

                    if container_id is None and container.get('title'):
                        container_name = container.get('title')
                        if isinstance(container_name, list):
                            if len(container_name) > 0:
                                print('[{}] too many container titles: {}'.
                                      format(doi, len(container_name)))
                                container_name = container_name[0]
                        assert isinstance(container_name, str)
                        ce = fatcat_openapi_client.ContainerEntity(
                            issnl=issnl,
                            container_type=container_type,
                            name=container_name,
                        )
                        ce_edit = self.create_container(ce)
                        container_id = ce_edit.ident
                        self._issnl_id_map[issnl] = container_id
                else:
                    # TODO(martin): factor this out into a testable function.
                    # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
                    container_name = container.get('title')
                    if isinstance(container_name, list):
                        if len(container_name) > 0:
                            print('[{}] too many container titles: {}'.format(
                                doi, len(container_name)))
                            container_name = container_name[0]

        # Exception: https://www.micropublication.org/, see: !MR24.
        if container_id is None and container_name is None:
            if publisher and publisher.lower().startswith('micropublication'):
                container_name = publisher

        # Volume and issue.
        volume = container.get('volume')
        issue = container.get('issue')

        if volume:
            volume = clean(volume)

        if issue:
            issue = clean(issue)

        # Pages.
        pages = None

        first_page = container.get('firstPage')
        last_page = container.get('lastPage')

        if first_page and last_page:
            try:
                # result discarded; this only checks that both pages parse
                # as integers before joining them
                _ = int(first_page) < int(last_page)
                pages = '{}-{}'.format(first_page, last_page)
            except ValueError as err:  # noqa: F841
                # TODO(martin): This is more debug than info.
                # print('[{}] {}'.format(doi, err), file=sys.stderr)
                pass

        if not pages and first_page:
            pages = first_page

        # License.
        license_slug = None
        license_extra = []

        for lic in attributes.get('rightsList', []):
            slug = lookup_license_slug(lic.get('rightsUri'))
            if slug:
                license_slug = slug
            license_extra.append(lic)

        release_type = self.datacite_release_type(doi, attributes)

        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
        # "other", ...). Try to crush it with langcodes: "It may sound to you
        # like langcodes solves a pretty boring problem. At one level, that's
        # right. Sometimes you have a boring problem, and it's great when a
        # library solves it for you." -- TODO(martin): We need more of these.
        language = None

        value = attributes.get('language', '') or ''
        try:
            language = pycountry.languages.lookup(value).alpha_2
        except (LookupError, AttributeError) as err:  # noqa: F841
            pass
            # TODO(martin): Print this on debug level, only.
            # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr)

        # Abstracts appear in "attributes.descriptions[].descriptionType", some
        # of the observed values: "Methods", "TechnicalInfo",
        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
        # "Other" fields might contain references or related articles (with
        # DOI). TODO(martin): maybe try to parse out some of those refs.
        abstracts = []
        descs = attributes.get('descriptions', []) or []
        for desc in descs:
            if not desc.get('descriptionType') == 'Abstract':
                continue

            # Description may be a string, int or list.
            text = desc.get('description', '')
            if not text:
                continue
            if isinstance(text, int):
                text = '{}'.format(text)
            if isinstance(text, list):
                try:
                    text = "\n".join(text)
                except TypeError:
                    continue  # Bail out, if it is not a list of strings.

            # Limit length.
            if len(text) < 10:
                continue
            if len(text) > MAX_ABSTRACT_LENGTH:
                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"

            # Detect language. This is fuzzy and may be removed, if too unreliable.
            lang = None
            try:
                lang = langdetect.detect(text)
            except (langdetect.lang_detect_exception.LangDetectException,
                    TypeError) as err:
                print('[{}] language detection failed with {} on {}'.format(
                    doi, err, text),
                      file=sys.stderr)
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    mimetype="text/plain",
                    content=clean(text),
                    lang=lang,
                ))

        # References and relations. Datacite includes many relation types in
        # "attributes.relatedIdentifiers[].relationType", e.g.
        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
        # "IsDerivedFrom", "IsSourceOf".
        #
        # For the moment, we only care about References.
        refs, ref_index = [], 0

        relIds = attributes.get('relatedIdentifiers', []) or []
        for rel in relIds:
            if rel.get('relationType', '') not in ('References', 'Cites'):
                continue
            ref_extra = dict()
            if rel.get('relatedIdentifierType', '') == 'DOI':
                ref_extra['doi'] = rel.get('relatedIdentifier')
            if not ref_extra:
                ref_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=ref_index,
                    extra=ref_extra,
                ))
            ref_index += 1

        # More specific release_type via 'Reviews' relationship. Note:
        # 'Reviews' is a relationType (see the list above), not a
        # relatedIdentifierType, so check the relationType key.
        for rel in relIds:
            if rel.get('relationType', '') != 'Reviews':
                continue
            release_type = 'review'

        # Extra information.
        extra_datacite = dict()

        if license_extra:
            extra_datacite['license'] = license_extra
        if attributes.get('subjects'):
            extra_datacite['subjects'] = attributes['subjects']

        # Include version information.
        metadata_version = attributes.get('metadataVersion') or ''

        if metadata_version:
            extra_datacite['metadataVersion'] = metadata_version

        # Include resource types.
        types = attributes.get('types', {}) or {}
        resource_type = types.get('resourceType', '') or ''
        resource_type_general = types.get('resourceTypeGeneral', '') or ''

        if resource_type and resource_type.lower(
        ) not in UNKNOWN_MARKERS_LOWER:
            extra_datacite['resourceType'] = resource_type
        if resource_type_general and resource_type_general.lower(
        ) not in UNKNOWN_MARKERS_LOWER:
            extra_datacite['resourceTypeGeneral'] = resource_type_general

        # Include certain relations from relatedIdentifiers. Keeping the
        # original structure of data here, which is a list of dicts, with
        # relation type, identifier and identifier type (mostly).
        relations = []
        for rel in relIds:
            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
                                           'IsVariantFormOf', 'IsSupplementTo',
                                           'HasVersion', 'IsMetadataFor',
                                           'IsNewVersionOf', 'IsIdenticalTo',
                                           'IsVersionOf', 'IsDerivedFrom',
                                           'IsSourceOf'):
                relations.append(rel)

        if relations:
            extra_datacite['relations'] = relations

        extra = dict()

        # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
        # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
        # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st
        # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
        # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
        # "10161", "10010691", "10780", # "Presentación"
        version = attributes.get('version') or None

        # top-level extra keys
        if not container_id and container_name:
            extra['container_name'] = container_name

        # Always include datacite key, even if value is empty (dict).
        extra['datacite'] = extra_datacite

        # Preparation for a schema update.
        if release_month:
            extra['release_month'] = release_month

        extids = self.lookup_ext_ids(doi=doi)

        # Assemble release.
        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            release_type=release_type,
            release_stage=release_stage,
            title=title,
            subtitle=subtitle,
            original_title=original_language_title,
            release_year=release_year,
            release_date=release_date,
            publisher=publisher,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=extids['pmid'],
                pmcid=extids['pmcid'],
                wikidata_qid=extids['wikidata_qid'],
                core=extids['core_id'],
                arxiv=extids['arxiv_id'],
                jstor=extids['jstor_id'],
            ),
            contribs=contribs,
            volume=volume,
            issue=issue,
            pages=pages,
            language=language,
            abstracts=abstracts,
            refs=refs,
            extra=extra,
            license_slug=license_slug,
            version=version,
        )
        re = self.biblio_hacks(re)
        return re
Beispiel #11
0
    def parse_record(self, record):

        if not record:
            return None
        metadata = record.arXivRaw
        if not metadata:
            return None
        extra = dict()
        extra_arxiv = dict()

        # don't know!
        release_type = "article"

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = metadata.doi.string.lower().split()[0].strip()
            if not (doi.startswith('10.') and '/' in doi
                    and doi.split('/')[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
        title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
        authors = parse_arxiv_authors(metadata.authors.get_text().replace(
            '\n', ' '))
        contribs = [
            fatcat_openapi_client.ReleaseContrib(index=i,
                                                 raw_name=a,
                                                 role='author')
            for i, a in enumerate(authors)
        ]

        lang = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.get_text():
            comments = metadata.comments.get_text().replace('\n', ' ').strip()
            extra_arxiv['comments'] = comments
            if 'in french' in comments.lower():
                lang = 'fr'
            elif 'in spanish' in comments.lower():
                lang = 'es'
            elif 'in portuguese' in comments.lower():
                lang = 'pt'
            elif 'in hindi' in comments.lower():
                lang = 'hi'
            elif 'in japanese' in comments.lower():
                lang = 'ja'
            elif 'in german' in comments.lower():
                lang = 'de'
            elif 'simplified chinese' in comments.lower():
                lang = 'zh'
            elif 'in russian' in comments.lower():
                lang = 'ru'
            # more languages?

        number = None
        if metadata.find('journal-ref') and metadata.find(
                'journal-ref').get_text():
            journal_ref = metadata.find('journal-ref').get_text().replace(
                '\n', ' ').strip()
            extra_arxiv['journal_ref'] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(
            ):
                release_type = "paper-conference"
        if metadata.find('report-no') and metadata.find('report-no').string:
            number = metadata.find('report-no').string.strip()
            # at least some people plop extra metadata in here. hrmf!
            if 'ISSN ' in number or 'ISBN ' in number or len(
                    number.split()) > 2:
                extra_arxiv['report-no'] = number
                number = None
            else:
                release_type = "report"
        if metadata.find('acm-class') and metadata.find('acm-class').string:
            extra_arxiv['acm_class'] = metadata.find(
                'acm-class').string.strip()
        if metadata.categories and metadata.categories.get_text():
            extra_arxiv['categories'] = metadata.categories.get_text().split()
        license_slug = None
        if metadata.license and metadata.license.get_text():
            license_slug = lookup_license_slug(metadata.license.get_text())
        abstracts = None
        if metadata.abstract:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.get_text().strip()
            orig = None
            if '-----' in abst:
                both = abst.split('-----')
                abst = both[0].strip()
                orig = both[1].strip()
            if '$' in abst or '{' in abst:
                mime = "application/x-latex"
                abst_plain = latex_to_text(abst)
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(
                        content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
            if orig:
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                          mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == 'en':
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv['base_id'] = base_id
        extra['superceded'] = True
        extra['arxiv'] = extra_arxiv

        versions = []
        for version in metadata.find_all('version'):
            arxiv_id = base_id + version['version']
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # TODO: source_type?
            versions.append(
                fatcat_openapi_client.ReleaseEntity(
                    work_id=None,
                    title=title,
                    #original_title
                    version=version['version'],
                    release_type=release_type,
                    release_stage='submitted',
                    release_date=release_date.isoformat(),
                    release_year=release_date.year,
                    ext_ids=fatcat_openapi_client.ReleaseExtIds(
                        arxiv=arxiv_id, ),
                    number=number,
                    language=lang,
                    license_slug=license_slug,
                    abstracts=abstracts,
                    contribs=contribs,
                    extra=extra.copy(),
                ))
        # TODO: assert that versions are actually in order?
        assert versions

        versions[-1].extra.pop('superceded')

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            if len(versions) > 1:
                versions[-1].release_stage = "accepted"
        return versions
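The chain of 'in french'/'in spanish' conditionals above could equally be driven by a lookup table; a sketch, with the phrase-to-code mapping copied from those conditionals (dict insertion order preserves the elif precedence on Python 3.7+):

COMMENT_LANG_HINTS = {
    'in french': 'fr',
    'in spanish': 'es',
    'in portuguese': 'pt',
    'in hindi': 'hi',
    'in japanese': 'ja',
    'in german': 'de',
    'simplified chinese': 'zh',
    'in russian': 'ru',
}

def lang_from_comments(comments, default='en'):
    lowered = comments.lower()
    for phrase, code in COMMENT_LANG_HINTS.items():
        if phrase in lowered:
            return code
    return default

assert lang_from_comments('12 pages, in French') == 'fr'
assert lang_from_comments('17 pages, 3 figures') == 'en'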
Beispiel #12
0
    def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
        """
        obj is a python dict (parsed from json).
        returns a ReleaseEntity
        """

        # Ways to be out of scope (provisionally)
        # journal-issue and journal-volume map to None, but allowed for now
        if obj.get("type") in (
                None,
                "journal",
                "proceedings",
                "standard-series",
                "report-series",
                "book-series",
                "book-set",
                "book-track",
                "proceedings-series",
        ):
            self.counts["skip-release-type"] += 1
            return None

        # Do require the 'title' keys to exist, as release entities do
        if ("title" not in obj) or (not obj["title"]):
            self.counts["skip-blank-title"] += 1
            return None

        release_type = self.map_release_type(obj["type"])

        # contribs
        def do_contribs(obj_list: List[Dict[str, Any]],
                        ctype: str) -> List[ReleaseContrib]:
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if "ORCID" in am.keys():
                    creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
                # Sorry humans :(
                if am.get("given") and am.get("family"):
                    raw_name: Optional[str] = "{} {}".format(
                        am["given"], am["family"])
                elif am.get("family"):
                    raw_name = am["family"]
                else:
                    # TODO: can end up empty
                    raw_name = am.get("name") or am.get("given")
                extra: Dict[str, Any] = dict()
                if ctype == "author":
                    index: Optional[int] = i
                else:
                    index = None
                raw_affiliation = None
                affiliation_list = am.get("affiliation") or []
                # TODO: currently requiring a "name" in all affiliations. Could
                # add ROR support (via identifier) in the near future
                affiliation_list = [a for a in affiliation_list if "name" in a]
                if affiliation_list and len(affiliation_list) > 0:
                    raw_affiliation = affiliation_list[0]["name"]
                    if len(affiliation_list) > 1:
                        # note: affiliation => more_affiliations
                        extra["more_affiliations"] = [
                            clean_str(a["name"]) for a in affiliation_list[1:]
                        ]
                if am.get("sequence") and am.get("sequence") != "additional":
                    extra["seq"] = clean_str(am.get("sequence"))
                assert ctype in ("author", "editor", "translator")
                raw_name = clean_str(raw_name)
                # TODO: what if 'raw_name' is None?
                contribs.append(
                    ReleaseContrib(
                        creator_id=creator_id,
                        index=index,
                        raw_name=raw_name,
                        given_name=clean_str(am.get("given")),
                        surname=clean_str(am.get("family")),
                        raw_affiliation=clean_str(raw_affiliation),
                        role=ctype,
                        extra=extra or None,
                    ))
            return contribs

        contribs = do_contribs(obj.get("author", []), "author")
        contribs.extend(do_contribs(obj.get("editor", []), "editor"))
        contribs.extend(do_contribs(obj.get("translator", []), "translator"))

        # container
        issn = obj.get("ISSN", [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = clean_str(obj.get("publisher"))

        container_name = obj.get("container-title")
        if container_name:
            container_name = clean_str(container_name[0], force_xml=True)
        if not container_name:
            container_name = None
        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            ce = fatcat_openapi_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=container_name,
            )
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        # license slug
        license_slug = None
        license_extra = []
        for lic in obj.get("license", []):
            if lic["content-version"] not in ("vor", "unspecified"):
                continue
            slug = lookup_license_slug(lic["URL"])
            if slug:
                license_slug = slug
            if "start" in lic:
                lic["start"] = lic["start"]["date-time"]
            license_extra.append(lic)

        # references
        refs = []
        for i, rm in enumerate(obj.get("reference", [])):
            try:
                year: Optional[int] = int(rm.get("year"))
                # TODO: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year is not None:
                    if year > 2025 or year < 100:
                        year = None
            except (TypeError, ValueError):
                year = None
            ref_extra: Dict[str, Any] = dict()
            key = rm.get("key")
            if key and key.startswith(obj["DOI"].upper()):
                key = key.replace(obj["DOI"].upper() + "-", "")
                key = key.replace(obj["DOI"].upper(), "")
            ref_container_name = rm.get("volume-title")
            if not ref_container_name:
                ref_container_name = rm.get("journal-title")
            elif rm.get("journal-title"):
                ref_extra["journal-title"] = rm["journal-title"]
            if rm.get("DOI"):
                ref_extra["doi"] = rm.get("DOI").lower()
            author = clean_str(rm.get("author"))
            if author:
                ref_extra["authors"] = [author]
            for k in (
                    "editor",
                    "edition",
                    "authority",
                    "version",
                    "genre",
                    "url",
                    "event",
                    "issue",
                    "volume",
                    "date",
                    "accessed_date",
                    "issued",
                    "page",
                    "medium",
                    "collection_title",
                    "chapter_number",
                    "unstructured",
                    "series-title",
                    "volume-title",
            ):
                if clean_str(rm.get(k)):
                    ref_extra[k] = clean_str(rm[k])
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=i,
                    # doing lookups would be a second import pass
                    target_release_id=None,
                    key=key,
                    year=year,
                    container_name=clean_str(ref_container_name),
                    title=clean_str(rm.get("article-title")),
                    locator=clean_str(rm.get("first-page")),
                    # TODO: just dump JSON somewhere here?
                    extra=ref_extra or None,
                ))

        # abstracts
        abstracts = []
        abstract = clean_str(obj.get("abstract"))
        if abstract and len(abstract) > 10:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    mimetype="application/xml+jats", content=abstract))

        # extra fields
        extra: Dict[str, Any] = dict()
        extra_crossref: Dict[str, Any] = dict()
        # top-level extra keys
        if not container_id:
            if obj.get("container-title"):
                extra["container_name"] = container_name
        for key in "group-title":
            val = obj.get(key)
            if val:
                if type(val) == list:
                    val = val[0]
                if type(val) == str:
                    val = clean_str(val)
                    if val:
                        extra[key] = clean_str(val)
                else:
                    extra[key] = val
        # crossref-nested extra keys
        for key in ("subject", "type", "alternative-id", "archive", "funder"):
            val = obj.get(key)
            if val:
                if type(val) == str:
                    extra_crossref[key] = clean_str(val)
                else:
                    extra_crossref[key] = val
        if license_extra:
            extra_crossref["license"] = license_extra

        if len(obj["title"]) > 1:
            aliases = [clean_str(t) for t in obj["title"][1:]]
            aliases = [t for t in aliases if t]
            if aliases:
                extra["aliases"] = aliases

        # ISBN
        isbn13 = None
        for raw in obj.get("ISBN", []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj["type"] in (
                "journal-article",
                "conference-proceeding",
                "book",
                "dissertation",
                "book-chapter",
        ):
            release_stage: Optional[str] = "published"
        else:
            # unknown
            release_stage = None

        # filter out unreasonably huge releases
        if len(abstracts) > 100:
            self.counts["skip-huge-abstracts"] += 1
            return None
        if len(contribs) > 2000:
            self.counts["skip-huge-contribs"] += 1
            return None
        if len(refs) > 5000:
            self.counts["skip-huge-refs"] += 1
            return None

        # release date parsing is amazingly complex
        raw_date = obj["issued"]["date-parts"][0]
        if not raw_date or not raw_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_year = None
            release_date = None
        elif len(raw_date) == 3:
            release_year = raw_date[0]
            release_date = datetime.date(year=raw_date[0],
                                         month=raw_date[1],
                                         day=raw_date[2])
        else:
            # sometimes only the year is included, not the full date
            release_year = raw_date[0]
            release_date = None

        original_title: Optional[str] = None
        if obj.get("original-title"):
            ot = obj.get("original-title")
            if ot is not None:
                original_title = clean_str(ot[0], force_xml=True)

        title: Optional[str] = None
        if obj.get("title"):
            title = clean_str(obj["title"][0], force_xml=True)
            if not title or len(title) <= 1:
                # title can't be just a single character
                self.counts["skip-blank-title"] += 1
                return None

        doi = clean_doi(obj["DOI"].lower())
        if not doi:
            self.counts["skip-bad-doi"] += 1
            return None

        subtitle = None
        if obj.get("subtitle"):
            subtitle = clean_str(obj["subtitle"][0], force_xml=True)
            if not subtitle or len(subtitle) <= 1:
                # subtitle can't be just a single character
                subtitle = None

        if extra_crossref:
            extra["crossref"] = extra_crossref

        re = ReleaseEntity(
            work_id=None,
            container_id=container_id,
            title=title,
            subtitle=subtitle,
            original_title=original_title,
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            publisher=publisher,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                isbn13=isbn13,
            ),
            volume=clean_str(obj.get("volume")),
            issue=clean_str(obj.get("issue")),
            pages=clean_str(obj.get("page")),
            language=clean_str(obj.get("language")),
            license_slug=license_slug,
            extra=extra or None,
            abstracts=abstracts or None,
            contribs=contribs or None,
            refs=refs or None,
        )
        return re
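Crossref's "issued" field arrives as nested date-parts, e.g. [[2019, 7, 1]] for a full date or [[2019]] for year-only; a compact sketch of the branching above, on made-up inputs (parse_date_parts is illustrative, not importer code):

import datetime

def parse_date_parts(raw_date):
    if not raw_date or not raw_date[0]:
        # NoneType sneaks in even though at least a year is expected
        return None, None
    if len(raw_date) == 3:
        return raw_date[0], datetime.date(*raw_date)
    # sometimes only the year is included, not the full date
    return raw_date[0], None

assert parse_date_parts([2019, 7, 1]) == (2019, datetime.date(2019, 7, 1))
assert parse_date_parts([2019]) == (2019, None)
assert parse_date_parts([None]) == (None, None)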
Beispiel #13
0
    def parse_record(self, record: Any) -> Optional[ReleaseEntity]:
        """
        record is a beautiful soup object
        returns a ReleaseEntity, or None

        In JALC metadata, both English and Japanese records are given for most
        fields.
        """

        extra: Dict[str, Any] = dict()
        extra_jalc: Dict[str, Any] = dict()

        titles = record.find_all("title")
        if not titles:
            return None
        title = titles[0].get_text().replace("\n", " ").strip()
        original_title = None
        if title.endswith("."):
            title = title[:-1]
        if len(titles) > 1:
            original_title = titles[1].get_text().replace("\n", " ").strip()
            if original_title.endswith("."):
                original_title = original_title[:-1]

        doi = None
        if record.doi:
            doi = clean_doi(record.doi.string.strip().lower())
            # TODO: following code is redundant with clean_doi()
            if not doi:
                return None
            if doi.startswith("http://dx.doi.org/"):
                doi = doi.replace("http://dx.doi.org/", "")
            elif doi.startswith("https://dx.doi.org/"):
                doi = doi.replace("https://dx.doi.org/", "")
            elif doi.startswith("http://doi.org/"):
                doi = doi.replace("http://doi.org/", "")
            elif doi.startswith("https://doi.org/"):
                doi = doi.replace("https://doi.org/", "")
            if not (doi.startswith("10.") and "/" in doi):
                sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
                doi = None
        if not doi:
            return None

        people = record.find_all("Person")
        contribs = parse_jalc_persons(people)

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i

        release_year = None
        release_date = None
        date = record.date or None
        if date:
            date = date.string
            if len(date) == 10:
                # note: 'date' is a plain string here, so parse it directly
                release_date_date = datetime.datetime.strptime(
                    date, DATE_FMT).date()
                release_year = release_date_date.year
                release_date = release_date_date.isoformat()
            elif len(date) == 4 and date.isdigit():
                release_year = int(date)

        pages = None
        if record.startingPage and record.startingPage.string.strip():
            pages = record.startingPage.string.strip()
            if record.endingPage and record.endingPage.string.strip():
                pages = "{}-{}".format(pages, record.endingPage.string.strip())
        # double check to prevent "-" as pages
        if pages and pages.strip() == "-":
            pages = None

        volume = None
        if record.volume:
            volume = record.volume.string
        issue = None
        if record.number:
            # note: number/issue transform
            issue = record.number.string

        # container
        issn = None
        issn_list = record.find_all("issn")
        if issn_list:
            # if we wanted the other ISSNs, would also need to uniq the list.
            # But we only need one to lookup ISSN-L/container
            issn = issn_list[0].string
        if issn:
            issnl = self.issn2issnl(issn)
        else:
            issnl = None
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)

        publisher = None
        container_name = None
        container_extra: Dict[str, Any] = dict()

        if record.publicationName:
            pubs = [
                p.get_text().replace("\n", " ").strip()
                for p in record.find_all("publicationName") if p.get_text()
            ]
            pubs = [clean_str(p) for p in pubs if p]
            assert pubs
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # eng/jpn ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            container_name = clean_str(pubs[0])
            if len(pubs) > 1:
                container_extra["original_name"] = clean_str(pubs[1])

        if record.publisher:
            pubs = [
                p.get_text().replace("\n", " ").strip()
                for p in record.find_all("publisher") if p.get_text()
            ]
            pubs = [p for p in pubs if p]
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            if pubs:
                publisher = clean_str(pubs[0])
                if len(pubs) > 1:
                    container_extra["publisher_aliases"] = pubs[1:]

        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container_extra["country"] = "jp"
            container_extra["languages"] = ["ja"]
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type="journal",
                publisher=publisher,
                issnl=issnl,
                extra=(container_extra or None),
            )
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            # short-cut future imports in same batch
            self._issnl_id_map[issnl] = container_id

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        # always put at least an empty dict here to indicate the DOI registrar
        # (informally)
        extra["jalc"] = extra_jalc

        title = clean_str(title)
        if not title:
            return None

        re = ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean_str(original_title),
            release_type=release_type,
            release_stage="published",
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(doi=doi, ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            # license_slug
            container_id=container_id,
            contribs=contribs,
            extra=extra,
        )
        return re
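Worth noting: the pages logic above guards against emitting a bare "-" when both page fields are blank-ish. A tiny sketch with made-up values (build_pages is a hypothetical helper, not importer code):

def build_pages(start, end):
    pages = None
    if start and start.strip():
        pages = start.strip()
        if end and end.strip():
            pages = "{}-{}".format(pages, end.strip())
    # double check to prevent "-" as pages
    if pages and pages.strip() == "-":
        pages = None
    return pages

assert build_pages("123", "130") == "123-130"
assert build_pages("123", None) == "123"
assert build_pages("-", None) is None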
Beispiel #14
0
def test_access_redirect_fallback(client: Any, mocker: Any) -> None:

    with open("tests/files/elastic_fulltext_get.json") as f:
        elastic_resp = json.loads(f.read())

    es_raw = mocker.patch(
        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
    )
    es_raw.side_effect = [
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
    ]
    fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
    fatcat_get_work_raw.side_effect = [
        fatcat_openapi_client.WorkEntity(
            state="active",
            ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
        )
    ] * 4
    fatcat_get_work_releases_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_work_releases"
    )
    fatcat_get_work_releases_raw.side_effect = [
        [
            fatcat_openapi_client.ReleaseEntity(
                ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
                ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            ),
        ]
    ] * 4
    fatcat_get_release_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_release"
    )
    fatcat_get_release_raw.side_effect = [
        fatcat_openapi_client.ReleaseEntity(
            state="active",
            ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            files=[
                fatcat_openapi_client.FileEntity(
                    ident="ffffffffffffffffffffffffff",
                    urls=[
                        fatcat_openapi_client.FileUrl(
                            rel="web",
                            url="https://blarg.example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="webarchive",
                            url="https://web.archive.org/web/12345/https://example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="archive",
                            url="https://archive.org/download/some/thing.pdf",
                        ),
                    ],
                ),
            ],
        )
    ] * 4

    # redirects should work after API lookup, for both wayback and archive.org
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert (
        rv.headers["Location"]
        == "https://web.archive.org/web/12345id_/https://example.com"
    )

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"

    # wrong URLs should still not work, but display a page with helpful links
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"archive.org/download/some/thing.else.pdf" in rv.content