Beispiel #1
0
class Link(Parser):
    url = tools.RunPython('format_doi', ctx)
    # identifier will always be DOI
    type = tools.Static('doi')

    def format_doi(self, doi):
        return format_doi_as_url(self, doi)
Beispiel #2
0
class Link(Parser):
    url = tools.RunPython('format_url', ctx)
    type = tools.Static('provider')

    def format_url(self, ctx):
        return 'https://www.nsf.gov/awardsearch/showAward?AWD_ID={}'.format(
            ctx['id'])
Beispiel #3
0
class FunderAgent(Parser):
    schema = tools.GuessAgentType(tools.OneOf(ctx.funderName,
                                              ctx.contributorName),
                                  default='organization')

    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(
            tools.OneOf(ctx.funderIdentifier,
                        tools.RunPython(force_text, ctx.nameIdentifier),
                        tools.Static(None))),
                  exceptions=(ValueError, )))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)

        contributor_type = tools.Try(ctx.contributorType)
Beispiel #4
0
    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(ctx.creatorName['@givenName'],
                                 ctx.contributorName['@givenName'],
                                 tools.Static(None))
        family_name = tools.OneOf(ctx.creatorName['@familyName'],
                                  ctx.contributorName['@familyName'],
                                  tools.Static(None))
Beispiel #5
0
class CreativeWork(Parser):
    schema = tools.RunPython('get_type', ctx)

    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    rights = tools.OneOf(tools.RunPython('get_rights_url', ctx),
                         tools.RunPython('get_dd', ctx, 'Rights')['#text'],
                         tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))

    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx,
                                                       itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx,
                                                    itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx,
                                           'Characterization')['#text']

    def get_type(self, obj):
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
Beispiel #6
0
class Preprint(Parser):
    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # is_deleted
    date_published = tools.ParseDate(tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # free_to_read_type
    # free_to_read_date
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])

    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Biology'),
                         tools.Subjects(tools.Try(ctx['subject-areas'])))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx['category']),
                     tools.Try(ctx['subject-areas']))

    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            tools.Try(ctx['og:url']),
                            ctx['citation_public_url'], ctx['citation_doi'])

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx['DC.Publisher'])),
        tools.Map(tools.Delegate(Creator),
                  tools.RunPython('get_contributors', ctx)))

    # related_works

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    def get_contributors(self, link):
        authors = link.get('citation_author', []) if isinstance(
            link.get('citation_author', []),
            list) else [link['citation_author']]
        institutions = link.get(
            'citation_author_institution', []) if isinstance(
                link.get('citation_author_institution', []),
                list) else [link['citation_author_institution']]
        emails = link.get('citation_author_email', []) if isinstance(
            link.get('citation_author_email', []),
            list) else [link['citation_author_email']]

        contribs = []
        for author, email, institution in itertools.zip_longest(
                authors, emails, institutions):
            contrib = {
                'author': author,
                'institution': institution,
                'email': email,
            }
            contribs.append(contrib)

        return contribs
Beispiel #7
0
class Preprint(Parser):
    title = ctx.item['dc:title']
    description = ctx.item.description
    contributors = tools.Map(tools.Delegate(Contributor),
                             ctx.item['dc:creator'])
    date_published = ctx.item['dc:date']
    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx.item['dc:publisher'])
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.item['dc:identifier'])
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology and life sciences')))
Beispiel #8
0
class Preprint(Parser):
    title = ctx.item['dc:title']
    description = ctx.item.description
    date_published = tools.ParseDate(ctx.item['dc:date'])
    date_updated = tools.ParseDate(ctx.item['dc:date'])

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology'))
    )

    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.item['dc:identifier'])

    related_agents = tools.Concat(
        tools.Delegate(Publisher, ctx.item['dc:publisher']),
        tools.Map(tools.Delegate(Creator), ctx.item['dc:creator']),
    )
Beispiel #9
0
class Preprint(Parser):

    title = ctx.attributes.title
    description = ctx.attributes.description
    contributors = tools.Map(tools.Delegate(Contributor), ctx.contributors)
    institutions = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Institution))),
        ctx.embeds.affiliated_institutions.data)
    # rights = tools.Try(ctx.attributes.node_license)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.links.html)
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category,
                     ctx.attributes.tags)
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Engineering and technology'))

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
        date_modified = ctx.attributes.date_modified
Beispiel #10
0
class CreativeWork(Parser):
    title = ctx.attributes.title
    description = ctx.attributes.description
    is_deleted = tools.Static(False)
    # date_published =
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # free_to_read_type =
    # free_to_read_date =
    # rights = tools.Try(ctx.attributes.node_license)  Doesn't seem to have an useful information
    # language =

    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.links.html,
                            ctx.links.self)

    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category,
                     ctx.attributes.tags)

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
Beispiel #11
0
class Preprint(osf.Project):
    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)
    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         ctx.attributes.subjects)
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.links.self,
                            ctx.links.html, tools.Try(ctx.links.doi))
    tags = tools.Map(tools.Delegate(ThroughTags),
                     tools.Try(ctx.attributes.tags))
    rights = tools.Try(ctx.attributes.node_license)

    related_works = tools.Static([])
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(lambda x: x['attributes']['bibliographic'],
                         ctx.contributors)),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(lambda x: not x['attributes']['bibliographic'],
                         ctx.contributors)),
    )
Beispiel #12
0
class OAICreativeWork(Parser):
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)))

    title = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:title'])))
    description = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        'force_text',
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier'])))))))

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(tools.Try(tools.IRI(), exceptions=(ValueError, )),
                          tools.RunPython('get_relation', ctx)))))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator),
                  tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor),
                  tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython('force_text',
                            tools.Try(
                                ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]), )

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )))))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                'force_text',
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                ))),
                     deep=True))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']

        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get(
            'dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers, )
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i
                              for i in identifiers if i)

        identifiers = re.sub(
            'http|:|/', '',
            identifiers + ctx['record']['header']['identifier'])

        if isinstance(relation, dict):
            relation = (relation['#text'], )

        return [
            r for r in relation
            if r and re.sub('http|:|/', '', r) not in identifiers
        ]
Beispiel #13
0
class Preprint(EngrxivPreprint):
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Social and behavioral sciences'))
Beispiel #14
0
class Preprint(Parser):

    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    contributors = tools.Map(tools.Delegate(Contributor),
                             tools.RunPython('get_contributors', ctx))

    links = tools.Concat(
        tools.Map(tools.Delegate(ThroughLinks),
                  tools.Concat(ctx['og:url'], ctx['citation_public_url'])),
        tools.Map(tools.Delegate(DoiThroughLinks),
                  tools.Concat(ctx['citation_doi'])))

    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx['DC.Publisher'])

    date_updated = tools.ParseDate(ctx['DC.Date'])
    date_published = tools.ParseDate(ctx['article:published_time'])

    language = tools.Try(ctx['DC.Language'])
    rights = tools.Try(ctx['DC.Rights'])

    tags = tools.Map(tools.Delegate(ThroughTags),
                     tools.Concat(tools.Try(ctx['category'])))

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Static('Biology and life sciences'),
        ctx['subject-areas'],
    )

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']
        record_type = ctx['type']
        contributors = ctx['DC.Contributor']

        citation_author = ctx['citation_author']
        citation_author_institution = ctx['citation_author_institution']
        citation_author_email = ctx['citation_author_email']

    def get_contributors(self, link):
        authors = link['citation_author'] if isinstance(
            link['citation_author'], list) else [link['citation_author']]
        institutions = link['citation_author_institution'] if isinstance(
            link['citation_author_institution'],
            list) else [link['citation_author_institution']]
        emails = link['citation_author_email'] if isinstance(
            link['citation_author_email'],
            list) else [link['citation_author_email']]

        contribs = []
        for author, email, institution in itertools.zip_longest(
                authors, emails, institutions):
            contrib = {
                'author': author,
                'institution': institution,
                'email': email,
            }
            contribs.append(contrib)

        return contribs
Beispiel #15
0
class Link(Parser):
    url = tools.RunPython('format_link', tools.RunPython(force_text, ctx))
    type = tools.Static('doi')

    def format_link(self, link):
        return format_doi_as_url(self, link)
Beispiel #16
0
class AlternateLink(Parser):
    schema = 'Link'

    url = tools.RunPython(force_text, ctx)
    type = tools.Static('misc')
Beispiel #17
0
class Link(Parser):
    url = ctx
    type = tools.Static('provider')
Beispiel #18
0
class Identifier(Parser):
    url = ctx
    base_url = tools.Static('https://osf.io/')
Beispiel #19
0
class Preprint(normalizer.Project):
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static({'text': 'Social and behavioral sciences'})))