Beispiel #1
0
    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(ctx.creatorName['@givenName'],
                                 ctx.contributorName['@givenName'],
                                 tools.Static(None))
        family_name = tools.OneOf(ctx.creatorName['@familyName'],
                                  ctx.contributorName['@familyName'],
                                  tools.Static(None))
Beispiel #2
0
class CreativeWork(Parser):
    schema = tools.RunPython('get_type', ctx)

    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    rights = tools.OneOf(tools.RunPython('get_rights_url', ctx),
                         tools.RunPython('get_dd', ctx, 'Rights')['#text'],
                         tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))

    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx,
                                                       itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx,
                                                    itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx,
                                           'Characterization')['#text']

    def get_type(self, obj):
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
Beispiel #3
0
class MODSAgent(Parser):
    schema = tools.RunPython('get_agent_schema', ctx)

    name = tools.OneOf(tools.RunPython(force_text, ctx['mods:displayForm']),
                       tools.RunPython('squash_name_parts', ctx))

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    lambda x: bool(x),
                    tools.RunPython(force_text, ctx['mods:affiliation'])))))

    identifiers = tools.Map(
        tools.Delegate(MODSAgentIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Try(ctx['mods:nameIdentifier']),
                    )))))

    class Extra:
        name_type = tools.Try(ctx['@type'])
        name_part = tools.Try(ctx['mods:namePart'])
        affiliation = tools.Try(ctx['mods:affiliation'])
        description = tools.Try(ctx['mods:description'])
        display_form = tools.Try(ctx['mods:displayForm'])
        etal = tools.Try(ctx['mods:etal'])
        name_identifier = tools.Try(ctx['mods:nameIdentifier'])

    def squash_name_parts(self, name):
        name_parts = get_list(name, 'mods:namePart')
        return ' '.join([force_text(n) for n in name_parts])

    def get_agent_schema(self, obj):
        name_type = obj.get('@type')
        if name_type == 'personal':
            return 'person'
        if name_type == 'conference':
            return 'organization'
        # TODO SHARE-718
        # if name_type == 'family':
        #    return 'family'
        if name_type == 'corporate':
            return GuessAgentTypeLink(default='organization').execute(
                self.squash_name_parts(obj))
        return GuessAgentTypeLink().execute(self.squash_name_parts(obj))
Beispiel #4
0
class ContributorAgent(Parser):
    schema = tools.OneOf(
        tools.GuessAgentType(tools.RunPython(get_agent_type, ctx,
                                             person=False),
                             default='organization'),
        tools.GuessAgentType(tools.OneOf(ctx.creatorName,
                                         ctx.contributorName)))

    name = tools.OneOf(ctx.creatorName, ctx.contributorName)
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.Map(tools.IRI(ctx),
                            tools.RunPython(force_text, ctx.nameIdentifier)),
                  exceptions=(ValueError, )))
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(lambda x: bool(x),
                             tools.RunPython(force_text, ctx.affiliation)))))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(ctx.creatorName['@givenName'],
                                 ctx.contributorName['@givenName'],
                                 tools.Static(None))
        family_name = tools.OneOf(ctx.creatorName['@familyName'],
                                  ctx.contributorName['@familyName'],
                                  tools.Static(None))
Beispiel #5
0
class OAICreativeWork(Parser):
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)))

    title = tools.Join(
        tools.RunPython(force_text,
                        tools.Try(ctx.record.metadata.dc['dc:title'])))
    description = tools.Join(
        tools.RunPython(force_text,
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        force_text,
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier'])))))))

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                          tools.RunPython('get_relation', ctx)))))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator),
                  tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor),
                  tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython(force_text,
                            tools.Try(
                                ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]), )

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    force_text,
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )))))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                force_text,
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                ))),
                     deep=True))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get(
            'dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers, )
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i
                              for i in identifiers if i)

        identifiers = re.sub(
            'http|:|/', '',
            identifiers + ctx['record']['header']['identifier'])

        if isinstance(relation, dict):
            relation = (relation['#text'], )

        return [
            r for r in relation
            if r and re.sub('http|:|/', '', r) not in identifiers
        ]
Beispiel #6
0
class MODSCreativeWork(Parser):
    default_type = 'CreativeWork'
    type_map = None
    role_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(tools.RunPython(force_text, ctx['mods:genre']),
                    tools.Static(None)))

    title = tools.RunPython('join_title_info', ctx)

    # Abstracts have the optional attribute "shareable". Don't bother checking for it, because
    # abstracts that are not shareable should not have been shared with SHARE.
    description = tools.Join(
        tools.RunPython(force_text, tools.Try(ctx['mods:abstract']), '\n'))

    identifiers = tools.Map(
        tools.Delegate(MODSWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Concat(
                            tools.Try(ctx['mods:identifier']),
                            tools.Try(ctx.header['identifier']),
                            tools.Try(ctx['mods:location']['mods:url']),
                        ))))))

    related_works = tools.Concat(
        tools.Map(tools.Delegate(MODSWorkRelation),
                  tools.Try(ctx['mods:relatedItem'])))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(MODSCreator),
                  tools.RunPython('filter_names', ctx, 'creator')),
        tools.Map(tools.Delegate(MODSFunder),
                  tools.RunPython('filter_names', ctx, 'funder')),
        tools.Map(tools.Delegate(MODSHost),
                  tools.RunPython('filter_names', ctx, 'host')),
        tools.Map(tools.Delegate(MODSPublisher),
                  tools.RunPython('filter_names', ctx, 'publisher')),
        tools.Map(
            tools.Delegate(MODSContributor),
            tools.RunPython('filter_names',
                            ctx,
                            'creator',
                            'funder',
                            'host',
                            'publisher',
                            invert=True)),
        tools.Map(
            tools.Delegate(MODSSimplePublisher),
            tools.Try(ctx['mods:originInfo']['mods:publisher']),
        ),
    )

    rights = tools.RunPython(force_text,
                             tools.Try(ctx['mods:accessCondition']), '\n')

    language = tools.ParseLanguage(
        tools.Try(ctx['mods:language']['mods:languageTerm']), )

    subjects = tools.Map(
        tools.Delegate(MODSThroughSubjects),
        tools.Subjects(
            tools.Concat(tools.Try(ctx['mods:subject']['mods:topic']), )))

    tags = tools.Map(
        tools.Delegate(MODSThroughTags),
        tools.Concat(tools.Map(
            tools.RunPython('tokenize'),
            tools.Map(
                tools.RunPython(force_text),
                tools.Try(ctx.header.setSpec),
                tools.Try(ctx['mods:genre']),
                tools.Try(ctx['mods:classification']),
                tools.Try(ctx['mods:subject']['mods:topic']),
            )),
                     deep=True))

    date_updated = tools.ParseDate(tools.Try(ctx.header.datestamp))

    # TODO (in regulator) handle date ranges, uncertain dates ('1904-1941', '1890?', '1980-', '19uu', etc.)
    date_published = tools.OneOf(
        tools.ParseDate(
            tools.RunPython(
                force_text,
                tools.Try(ctx['mods:originInfo']['mods:dateIssued']))),
        tools.Static(None))

    is_deleted = tools.RunPython(lambda status: status == 'deleted',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """

        # (dc:description) http://www.loc.gov/standards/mods/userguide/abstract.html
        abstract = tools.Try(ctx['mods:abstract'])

        # (dc:rights) http://www.loc.gov/standards/mods/userguide/accesscondition.html
        accessConditions = tools.Try(ctx['mods:accessCondition'])

        # (dc:subject) http://www.loc.gov/standards/mods/userguide/classification.html
        classification = tools.Try(ctx['mods:classification'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/extension.html
        extension = tools.Try(ctx['mods:extension'])

        # SHARE type
        # (dc:type) http://www.loc.gov/standards/mods/userguide/genre.html
        genre = tools.Try(ctx['mods:genre'])

        # (dc:identifier) http://www.loc.gov/standards/mods/userguide/identifier.html
        identifier = tools.Try(ctx['mods:identifier'])

        # (dc:language) http://www.loc.gov/standards/mods/userguide/language.html
        language = tools.Try(ctx['mods:language'])

        # (dc:identifier for url) http://www.loc.gov/standards/mods/userguide/location.html
        location = tools.Try(ctx['mods:location'])

        # (dc:creator|dc:contributor) http://www.loc.gov/standards/mods/userguide/name.html
        name = tools.Try(ctx['mods:name'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/note.html
        note = tools.Try(ctx['mods:note'])

        # (dc:publisher|dc:date) http://www.loc.gov/standards/mods/userguide/origininfo.html
        originInfo = tools.Try(ctx['mods:originInfo'])

        # Extra
        # (dc:title) http://www.loc.gov/standards/mods/userguide/part.html
        part = tools.Try(ctx['mods:part'])

        # (dc:format or N/A) http://www.loc.gov/standards/mods/userguide/physicaldescription.html
        physicalDescription = tools.Try(ctx['mods:physicalDescription'])

        # Metadata information
        # (N/A) http://www.loc.gov/standards/mods/userguide/recordinfo.html
        recordInfo = tools.Try(ctx['mods:recordInfo'])

        # (dc:relation) http://www.loc.gov/standards/mods/userguide/relateditem.html
        relatedItem = tools.Try(ctx['mods:relatedItem'])

        # (dc:subject|dc:type|dc:coverage|N/A) http://www.loc.gov/standards/mods/userguide/subject.html
        subject = tools.Try(ctx['mods:subject'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/tableofcontents.html
        tableOfContents = tools.Try(ctx['mods:tableOfContents'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/targetaudience.html
        targetAudience = tools.Try(ctx['mods:targetAudience'])

        # (dc:title) http://www.loc.gov/standards/mods/userguide/titleinfo.html
        titleInfo = tools.Try(ctx['mods:titleInfo'])

        # Extra
        # (dc:type) http://www.loc.gov/standards/mods/userguide/typeofresource.html
        typeOfResource = tools.Try(ctx['mods:typeOfResource'])

    def get_schema(self, types):
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    # Map titleInfos to a string: https://www.loc.gov/standards/mods/userguide/titleinfo.html#mappings
    def join_title_info(self, obj):
        def get_part(title_info, part_name, delimiter=''):
            part = force_text(title_info.get(part_name, ''), ' ').strip()
            return delimiter + part if part else ''

        title_infos = get_list(obj, 'mods:titleInfo')
        titles = []
        for title_info in title_infos:
            title = ''
            title += get_part(title_info, 'mods:nonSort')
            title += get_part(title_info, 'mods:title')
            title += get_part(title_info, 'mods:subTitle', ': ')
            title += get_part(title_info, 'mods:partNumber', '. ')
            title += get_part(title_info, 'mods:partName', ': ')
            if title:
                titles.append(title)
        return '. '.join(titles)

    def filter_names(self, obj, *roles, invert=False):
        names = get_list(obj, 'mods:name')
        filtered = [*names] if invert else []
        for name in names:
            name_roles = get_list(name, 'mods:role')
            for role in name_roles:
                role_terms = get_list(role, 'mods:roleTerm')
                name_roles = {force_text(r).lower() for r in role_terms}
                name_roles.update({
                    self.role_map[r]
                    for r in name_roles if r in self.role_map
                })
                if name_roles.intersection(roles):
                    if invert:
                        filtered.remove(name)
                    else:
                        filtered.append(name)
        return filtered
Beispiel #7
0
class ContributorRelation(Parser):
    schema = 'Contributor'

    agent = tools.Delegate(ContributorAgent, ctx)
    cited_as = tools.OneOf(ctx.creatorName, ctx.contributorName)
Beispiel #8
0
class Contributor(Parser):
    agent = tools.Delegate(Person, ctx)
    cited_as = tools.OneOf(
        ctx.embeds.users.data.attributes.full_name,
        ctx.embeds.users.errors[0].meta.full_name,
    )