Example #1
 def parse_issn_row(self, row):
     """
     row is a python dict (parsed from CSV).
     returns a ContainerEntity, or None if required fields are missing
     """
     title = or_none(row['title'])
     issnl = or_none(row['ISSN-L'])
     if title is None or issnl is None:
         return None
     extra = dict(
         in_doaj=truthy(row['in_doaj']),
         in_road=truthy(row['in_road']),
         in_norwegian=truthy(row['in_norwegian']),
         language=or_none(row['lang']),
         url=or_none(row['url']),
         ISSNp=or_none(row['ISSN-print']),
         ISSNe=or_none(row['ISSN-electronic']),
         is_oa=truthy(row['is_oa']),
         is_kept=truthy(row['is_kept']),
     )
     ce = fatcat_client.ContainerEntity(issnl=issnl,
                                        name=title,
                                        publisher=or_none(row['publisher']),
                                        abbrev=None,
                                        coden=None,
                                        extra=extra)
     return ce
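Example #1 leans on two small helpers, or_none and truthy, that are not included in the listing. A minimal sketch of what they plausibly do, inferred from how the CSV row values are used above (the importer's actual implementations may differ):

def or_none(s):
    # Treat missing or whitespace-only CSV cells as absent values.
    if s is None:
        return None
    s = s.strip()
    return s if s else None

def truthy(s):
    # Interpret common textual spellings of a boolean CSV flag.
    if s is None:
        return None
    return s.strip().lower() in ('true', 't', 'yes', 'y', '1')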
Example #2
    def parse_record(self, row):
        """
        row is a python dict (parsed from JSON).

        returns a ContainerEntity (or None if invalid or couldn't parse)
        """

        if not row.get('name'):
            # Name is required (by schema)
            return None

        extra = dict()
        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
            'coden', 'aliases', 'original_name', 'first_year', 'last_year',
            'platform', 'default_license', 'road', 'mimetypes',
            'sherpa_romeo', 'kbart'):
            if row.get(key):
                extra[key] = row[key]
        # TODO: not including for now: norwegian, dois/crossref, ia

        extra_doaj = dict()
        if row.get('doaj'):
            if row['doaj'].get('as_of'):
                extra_doaj['as_of'] = row['doaj']['as_of']
            if row['doaj'].get('works'):
                extra_doaj['works'] = row['doaj']['works']
        if extra_doaj:
            extra['doaj'] = extra_doaj

        extra_ia = dict()
        # TODO: would like an ia.longtail_ia flag
        if row.get('sim'):
            # NB: keeping the None result of .get() here is ugly, but otherwise
            # extra['ia'].get('sim') would be falsy, breaking 'any_ia_sim' later on
            extra_ia['sim'] = {
                'year_spans': row['sim'].get('year_spans'),
            }
        if extra_ia:
            extra['ia'] = extra_ia

        name = clean(row.get('name'))
        if not name:
            return None

        ce = fatcat_client.ContainerEntity(
            issnl=row['issnl'],
            container_type=None, # TODO
            name=name,
            publisher=clean(row.get('publisher')),
            wikidata_qid=None, # TODO
            extra=extra)
        return ce
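Examples #2 and #4 also call a clean() helper that is not part of the listing. Roughly, it normalizes a raw string and returns None when nothing usable remains; the version below is only an assumption based on usage here (the real helper also accepts a force_xml flag, presumably controlling escaping of XML-significant characters, which this sketch ignores):

def clean(raw, force_xml=False):
    # Hypothetical stand-in: pass non-strings through, trim whitespace,
    # and collapse empty results to None.
    if raw is None:
        return None
    if not isinstance(raw, str):
        return raw
    value = raw.strip()
    return value if value else None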
Example #3
    def parse_crossref_dict(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a (ReleaseEntity, ContainerEntity) tuple; the ContainerEntity may be None
        """

        # This work is out of scope if it doesn't have authors and a title
        if ('author' not in obj) or ('title' not in obj):
            return None

        # Other ways to be out of scope (provisionally)
        if 'type' not in obj:
            return None

        # contribs
        def do_contribs(obj_list, ctype):
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am:
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: defaults back to a pseudo-null value
                    raw_name = am.get('given', '<blank>')
                extra = dict()
                if ctype == "author":
                    index = i
                else:
                    index = None
                if am.get('affiliation'):
                    # note: affiliation => affiliations
                    extra['affiliations'] = am.get('affiliation')
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['sequence'] = am.get('sequence')
                if not extra:
                    extra = None
                contribs.append(
                    fatcat_client.ReleaseContrib(creator_id=creator_id,
                                                 index=index,
                                                 raw_name=raw_name,
                                                 role=ctype,
                                                 extra=extra))
            return contribs

        contribs = do_contribs(obj['author'], "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        issn = obj.get('ISSN', [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = obj.get('publisher')

        ce = None
        if (container_id is None and self.create_containers and issnl is not None
                and obj.get('container-title')
                and len(obj['container-title']) > 0):
            ce = fatcat_client.ContainerEntity(issnl=issnl,
                                               publisher=publisher,
                                               name=obj['container-title'][0])

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # NOTE: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                year = None
            extra = rm.copy()
            if rm.get('DOI'):
                extra['doi'] = rm.get('DOI').lower()
            key = rm.get('key')
            if key and key.startswith(obj['DOI'].upper()):
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            container_name = rm.get('volume-title')
            if not container_name:
                container_name = rm.get('journal-title')
            extra.pop('DOI', None)
            extra.pop('key', None)
            extra.pop('year', None)
            extra.pop('volume-title', None)
            extra.pop('journal-title', None)
            extra.pop('title', None)
            extra.pop('first-page', None)
            extra.pop('doi-asserted-by', None)
            if extra:
                extra = dict(crossref=extra)
            else:
                extra = None
            refs.append(
                fatcat_client.ReleaseRef(
                    index=i,
                    # doing lookups would be a second import pass
                    target_release_id=None,
                    key=key,
                    year=year,
                    container_name=container_name,
                    title=rm.get('title'),
                    locator=rm.get('first-page'),
                    # TODO: just dump JSON somewhere here?
                    extra=extra))

        # abstracts
        abstracts = []
        if obj.get('abstract') is not None:
            abstracts.append(
                fatcat_client.ReleaseEntityAbstracts(
                    mimetype="application/xml+jats",
                    content=obj.get('abstract')))

        # extra fields
        extra = dict()
        for key in ('subject', 'type', 'license', 'alternative-id',
                    'container-title', 'original-title', 'subtitle', 'archive',
                    'funder', 'group-title'):
            # TODO: unpack "container-title" array
            val = obj.get(key)
            if val:
                extra[key] = val
        if extra.get('license'):
            for lic in extra['license']:
                if 'start' in lic:
                    lic['start'] = lic['start']['date-time']
        if len(obj['title']) > 1:
            extra['other-titles'] = obj['title'][1:]
        # TODO: this should be top-level
        extra['is_kept'] = len(obj.get('archive', [])) > 0

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                           'dissertation', 'book-chapter'):
            release_status = "published"
        else:
            # unknown
            release_status = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # TODO: filter out huge releases; we'll get them later (and fix bug in
        # fatcatd)
        if max(len(contribs), len(refs), len(abstracts)) > 750:
            return None

        # release date parsing is amazingly complex
        release_date = obj['issued']['date-parts'][0]
        if not release_date or not release_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_date = None
        elif len(release_date) == 3:
            release_date = datetime.datetime(year=release_date[0],
                                             month=release_date[1],
                                             day=release_date[2])
        else:
            # only the year is actually required; mangle to first day for date
            # (TODO: something better?)
            release_date = datetime.datetime(year=release_date[0],
                                             month=1,
                                             day=1)
        # convert to string ISO datetime format (if not null)
        if release_date:
            release_date = release_date.isoformat() + "Z"

        re = fatcat_client.ReleaseEntity(work_id=None,
                                         title=obj['title'][0],
                                         contribs=contribs,
                                         refs=refs,
                                         container_id=container_id,
                                         publisher=publisher,
                                         release_type=obj['type'],
                                         release_status=release_status,
                                         doi=obj['DOI'].lower(),
                                         isbn13=isbn13,
                                         core_id=extids['core_id'],
                                         pmid=extids['pmid'],
                                         pmcid=extids['pmcid'],
                                         wikidata_qid=extids['wikidata_qid'],
                                         release_date=release_date,
                                         issue=obj.get('issue'),
                                         volume=obj.get('volume'),
                                         pages=obj.get('page'),
                                         abstracts=abstracts,
                                         extra=dict(crossref=extra))
        return (re, ce)
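The 'issued' / 'date-parts' handling above is the part the comment calls "amazingly complex"; for orientation, these are the input shapes the branches distinguish (illustrative values only, matching the cases the code handles):

# Illustrative Crossref 'issued' values and how the branches above treat them:
{"issued": {"date-parts": [[2018, 5, 14]]}}  # full date  -> datetime(2018, 5, 14), serialized with "Z"
{"issued": {"date-parts": [[2018]]}}         # year only  -> mangled to January 1st of that year
{"issued": {"date-parts": [[None]]}}         # no usable year -> release_date stays None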
Example #4
    def parse_record(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a ReleaseEntity
        """

        # Ways to be out of scope (provisionally)
        # journal-issue and journal-volume map to None, but allowed for now
        if obj.get('type') in (None, 'journal', 'proceedings',
                'standard-series', 'report-series', 'book-series', 'book-set',
                'book-track', 'proceedings-series'):
            return None

        # Do require the 'title' key to exist, as release entities do
        if ('title' not in obj) or (not obj['title']):
            return None

        release_type = self.map_release_type(obj['type'])

        # contribs
        def do_contribs(obj_list, ctype):
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am:
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: can end up empty
                    raw_name = am.get('given')
                extra = dict()
                if ctype == "author":
                    index = i
                else:
                    index = None
                raw_affiliation = None
                if am.get('affiliation'):
                    if len(am.get('affiliation')) > 0:
                        raw_affiliation = am.get('affiliation')[0]['name']
                    if len(am.get('affiliation')) > 1:
                        # note: affiliation => more_affiliations
                        extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['seq'] = clean(am.get('sequence'))
                if not extra:
                    extra = None
                assert ctype in ("author", "editor", "translator")
                raw_name = clean(raw_name)
                contribs.append(fatcat_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=index,
                    raw_name=raw_name,
                    raw_affiliation=clean(raw_affiliation),
                    role=ctype,
                    extra=extra))
            return contribs
        contribs = do_contribs(obj.get('author', []), "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        issn = obj.get('ISSN', [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = clean(obj.get('publisher'))

        if (container_id is None and self.create_containers and (issnl is not None)
            and obj.get('container-title') and len(obj['container-title']) > 0):
            ce = fatcat_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=clean(obj['container-title'][0], force_xml=True))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident

        # license slug
        license_slug = None
        license_extra = []
        for l in obj.get('license', []):
            if l['content-version'] not in ('vor', 'unspecified'):
                continue
            slug = lookup_license_slug(l['URL'])
            if slug:
                license_slug = slug
            if 'start' in l:
                l['start'] = l['start']['date-time']
            license_extra.append(l)

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # TODO: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                year = None
            ref_extra = dict()
            key = rm.get('key')
            if key and key.startswith(obj['DOI'].upper()):
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            container_name = rm.get('volume-title')
            if not container_name:
                container_name = rm.get('journal-title')
            elif rm.get('journal-title'):
                ref_extra['journal-title'] = rm['journal-title']
            if rm.get('DOI'):
                ref_extra['doi'] = rm.get('DOI').lower()
            author = clean(rm.get('author'))
            if author:
                ref_extra['authors'] = [author]
            for k in ('editor', 'edition', 'authority', 'version', 'genre',
                    'url', 'event', 'issue', 'volume', 'date', 'accessed_date',
                    'issued', 'page', 'medium', 'collection_title', 'chapter_number',
                    'unstructured', 'series-title', 'volume-title'):
                if clean(rm.get(k)):
                    ref_extra[k] = clean(rm[k])
            if not ref_extra:
                ref_extra = None
            refs.append(fatcat_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=clean(container_name),
                title=clean(rm.get('article-title')),
                locator=clean(rm.get('first-page')),
                # TODO: just dump JSON somewhere here?
                extra=ref_extra))

        # abstracts
        abstracts = []
        abstract = clean(obj.get('abstract'))
        if abstract and len(abstract) > 10:
            abstracts.append(fatcat_client.ReleaseEntityAbstracts(
                mimetype="application/xml+jats",
                content=abstract))

        # extra fields
        extra = dict()
        extra_crossref = dict()
        # top-level extra keys
        if not container_id:
            if obj.get('container-title'):
                extra['container_name'] = clean(obj['container-title'][0])
        for key in ('group-title', 'subtitle'):
            val = obj.get(key)
            if val:
                if isinstance(val, list):
                    val = val[0]
                if isinstance(val, str):
                    extra[key] = clean(val)
                else:
                    extra[key] = val
        # crossref-nested extra keys
        for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
            val = obj.get(key)
            if val:
                if isinstance(val, str):
                    extra_crossref[key] = clean(val)
                else:
                    extra_crossref[key] = val
        if license_extra:
            extra_crossref['license'] = license_extra

        if len(obj['title']) > 1:
            aliases = [clean(t) for t in obj['title'][1:]]
            aliases = [t for t in aliases if t]
            if aliases:
                extra['aliases'] = aliases

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                'dissertation', 'book-chapter'):
            release_status = "published"
        else:
            # unknown
            release_status = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # filter out unreasonably huge releases
        if len(abstracts) > 100:
            return None
        if len(refs) > 2000:
            return None

        # release date parsing is amazingly complex
        raw_date = obj['issued']['date-parts'][0]
        if not raw_date or not raw_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_year = None
            release_date = None
        elif len(raw_date) == 3:
            release_year = raw_date[0]
            release_date = datetime.date(year=raw_date[0], month=raw_date[1], day=raw_date[2])
        else:
            # sometimes only the year is included, not the full date
            release_year = raw_date[0]
            release_date = None


        original_title = None
        if obj.get('original-title'):
            original_title = clean(obj.get('original-title')[0], force_xml=True)

        title = None
        if obj.get('title'):
            title = clean(obj.get('title')[0], force_xml=True)
            if not title or len(title) <= 1:
                # title can't be just a single character
                return None

        if extra_crossref:
            extra['crossref'] = extra_crossref
        if not extra:
            extra = None

        re = fatcat_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status=release_status,
            release_date=release_date,
            release_year=release_year,
            publisher=publisher,
            doi=obj['DOI'].lower(),
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            isbn13=isbn13,
            core_id=extids['core_id'],
            arxiv_id=extids['arxiv_id'],
            jstor_id=extids['jstor_id'],
            volume=clean(obj.get('volume')),
            issue=clean(obj.get('issue')),
            pages=clean(obj.get('page')),
            language=clean(obj.get('language')),
            license_slug=license_slug,
            extra=extra,
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
        )
        return re
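As a usage sketch, here is a minimal Crossref-style record that would make it through the checks in Example #4. The field names come from the code above; the values and the importer handle itself are made up, and the lookup helpers (issn2issnl, lookup_issnl, lookup_ext_ids) are assumed to be configured:

minimal_record = {
    "type": "journal-article",
    "title": ["An example article title"],
    "DOI": "10.1234/example.5678",
    "issued": {"date-parts": [[2020, 1, 15]]},
    "author": [{"given": "Ada", "family": "Lovelace", "sequence": "first"}],
    "container-title": ["Journal of Examples"],
    "ISSN": ["1234-5678"],
}
# importer.parse_record(minimal_record) would return a ReleaseEntity
# (or None if the record were out of scope).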