class CreativeWork(Parser):
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None)
    )
    language = tools.Try(tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython('get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
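
# Illustrative sketch (not part of the original module): `get_dd` above relies on
# BeautifulSoup's sibling navigation to read values out of a <dt>/<dd> definition
# list on the scraped HTML page. The HTML snippet and helper name below are
# invented for the example; the real parser wraps the <dd> in SoupXMLDict so the
# chaining DSL can index into it.
def _example_get_dd_lookup():
    from bs4 import BeautifulSoup

    html = '<dl><dt>Rights</dt><dd><a href="http://example.com/license">CC BY</a></dd></dl>'
    soup = BeautifulSoup(html, 'html.parser')
    dt_tag = soup.find('dt', string='Rights')     # locate the definition term
    dd_tag = dt_tag.find_next_sibling('dd')       # its value lives in the following <dd>
    return dd_tag.get_text(strip=True)            # -> 'CC BY'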
class CreativeWork(Parser):
    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython('_is_deleted', tools.Try(ctx.otherProperties))
    date_updated = tools.ParseDate(tools.Try(ctx.providerUpdatedDateTime))
    rights = tools.Join(tools.Try(ctx.licenses.uri))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.languages[0]))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(
                    tools.Try(ctx.uris.canonicalUri),
                    tools.Try(ctx.uris.providerUris),
                    tools.Try(ctx.uris.descriptorUris),
                    tools.Try(ctx.uris.objectUris)
                )
            )
        )
    )

    subjects = tools.Map(tools.Delegate(ThroughSubjects), tools.Subjects(tools.Try(ctx.subjects)))

    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx.tags), tools.Try(ctx.subjects))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        return list(sorted(set(items)))

    def _is_deleted(self, properties):
        for prop in properties or []:
            if prop['name'] == 'status':
                return 'deleted' in prop['properties'].get('status', [])
        return False
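
# Illustrative sketch (not part of the original module): the `otherProperties`
# payload that `_is_deleted` above inspects is a list of {'name', 'properties'}
# dicts; the sample record below is invented to show the shape it expects.
def _example_is_deleted_check():
    other_properties = [
        {'name': 'depth', 'properties': {'depth': 3}},
        {'name': 'status', 'properties': {'status': ['deleted']}},
    ]
    for prop in other_properties:
        if prop['name'] == 'status':
            return 'deleted' in prop['properties'].get('status', [])  # -> True
    return False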
class OAICreativeWork(Parser):
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None))
    )

    title = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:title'])))
    description = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(tools.Map(
            tools.Try(tools.IRI(), exceptions=(ValueError,)),
            tools.Filter(
                not_citation,
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.metadata.dc['dc:identifier']),
                        tools.Try(ctx.record.header['identifier'])
                    )
                )
            )
        ))
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError,)),
                tools.RunPython('get_relation', ctx)
            ))
        )
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator), tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor), tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:publisher']))
        ),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.record.metadata.dc['dc:language'][0]))

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                'force_text',
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                )
            )
        ))
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )
                )
            ),
            deep=True
        )
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)
    is_deleted = tools.RunPython('check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier'])
        )

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend([x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get('dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers,)
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i for i in identifiers if i)
        identifiers = re.sub('http|:|/', '', identifiers + ctx['record']['header']['identifier'])
        if isinstance(relation, dict):
            relation = (relation['#text'],)
        return [r for r in relation if r and re.sub('http|:|/', '', r) not in identifiers]
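
# Illustrative sketch (not part of the original module): `tokenize` above splits
# free-text setSpec / dc:type / dc:format / dc:subject values on ' - ', '.' and ','
# before they become tags and subjects. The sample input is invented.
def _example_tokenize():
    import re

    data = ['publication:article', 'Ecology - Population Dynamics, Modelling']
    tokens = []
    for item in data:
        tokens.extend(x.strip() for x in re.split(r'(?: - )|\.|,', item) if x)
    return tokens  # -> ['publication:article', 'Ecology', 'Population Dynamics', 'Modelling']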
class OAICreativeWork(Parser):
    schema = 'CreativeWork'

    ORGANIZATION_KEYWORDS = ('the', 'center')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'institute')

    title = tools.Join(tools.RunPython('force_text', tools.Try(ctx['record']['metadata']['dc']['dc:title'])))
    description = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:description'])))

    publishers = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIPublisher))),
        tools.Map(tools.RunPython('force_text'), tools.Try(ctx.record.metadata.dc['dc:publisher']))
    )

    rights = tools.Join(tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:rights'))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx['record']['metadata']['dc']['dc:language'][0]))

    contributors = tools.Map(
        tools.Delegate(OAIContributor),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'contributor'
        )
    )

    institutions = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIInstitution))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'institution'
        )
    )

    organizations = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIOrganization))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'organization'
        )
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.RunPython(
            'force_text',
            tools.Concat(
                tools.Try(ctx['record']['header']['setSpec']),
                tools.Try(ctx['record']['metadata']['dc']['dc:type']),
                tools.Try(ctx['record']['metadata']['dc']['dc:format']),
                tools.Try(ctx['record']['metadata']['dc']['dc:subject']),
            )
        )
    )

    links = tools.Map(
        tools.Delegate(OAIThroughLinks),
        tools.RunPython(
            'get_links',
            tools.Concat(
                tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:relation')
            )
        )
    )

    date_updated = tools.ParseDate(ctx['record']['header']['datestamp'])

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        # An entity responsible for making contributions to the resource.
        contributor = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage')

        # An entity primarily responsible for making the resource.
        creator = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator')

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date')

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format')

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
            tools.Maybe(ctx['record']['header'], 'identifier')
        )

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source')

        # The topic of the resource.
        subject = tools.Try(ctx.record.metadata.dc['dc:subject'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Maybe(ctx.record.header, '@status')

    def get_links(self, ctx):
        links = []
        for link in ctx:
            if not link or not isinstance(link, str):
                continue
            found_url = URL_REGEX.search(link)
            if found_url is not None:
                links.append(found_url.group())
                continue
            found_doi = DOI_REGEX.search(link)
            if found_doi is not None:
                found_doi = found_doi.group()
                if 'dx.doi.org' in found_doi:
                    links.append(found_doi)
                else:
                    links.append('http://dx.doi.org/{}'.format(found_doi.replace('doi:', '')))
        return links

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation', [])
        if isinstance(relation, dict):
            return relation['#text']
        return relation

    def get_contributors(self, options, entity):
        """
        Returns list of organization, institution, or contributor names based on entity type.
        """
        options = [o if isinstance(o, str) else o['#text'] for o in options]

        if entity == 'organization':
            organizations = [
                value for value in options
                if (
                    value
                    and not self.list_in_string(value, self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value, self.ORGANIZATION_KEYWORDS)
                )
            ]
            return organizations
        elif entity == 'institution':
            institutions = [
                value for value in options
                if value and self.list_in_string(value, self.INSTITUTION_KEYWORDS)
            ]
            return institutions
        elif entity == 'contributor':
            people = [
                value for value in options
                if (
                    value
                    and not self.list_in_string(value, self.INSTITUTION_KEYWORDS)
                    and not self.list_in_string(value, self.ORGANIZATION_KEYWORDS)
                )
            ]
            return people
        else:
            return options

    def list_in_string(self, string, list_):
        if any(word in string.lower() for word in list_):
            return True
        return False
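
# Illustrative sketch (not part of the original module): how the keyword heuristics
# in `get_contributors` / `list_in_string` above sort free-text dc:creator and
# dc:contributor names into people, institutions, and organizations. The names and
# helper are invented examples; institution keywords win over organization keywords.
def _example_classify_contributor(name):
    institution_keywords = ('school', 'university', 'institution', 'institute')
    organization_keywords = ('the', 'center')

    lowered = name.lower()
    if any(word in lowered for word in institution_keywords):
        return 'institution'    # e.g. 'Example State University'
    if any(word in lowered for word in organization_keywords):
        return 'organization'   # e.g. 'The Example Center'
    return 'contributor'        # e.g. 'Jane Doe'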
class CreativeWork(Parser):
    # Schema definitions: http://schema.datacite.org/meta/kernel-3.1/doc/DataCite-MetadataKernel_v3.1.pdf

    PEOPLE_TYPES = (
        'ContactPerson', 'DataCurator', 'Editor', 'ProjectLeader', 'ProjectManager',
        'ProjectMember', 'RelatedPerson', 'Researcher', 'Supervisor', 'WorkPackageLeader'
    )
    CHECK_TYPES = ('DataCollector', 'DataManager', 'Producer', 'RightsHolder', 'Sponsor', 'Other')
    NOT_PEOPLE_TYPES = ('Distributor', 'HostingInstitution', 'RegistrationAgency', 'RegistrationAuthority', 'ResearchGroup')

    ORGANIZATION_KEYWORDS = (THE_REGEX, 'council', 'center', 'foundation')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'college', 'institute')

    title = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
    )
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions.description[0])
    )

    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
    )

    rights = tools.Try(tools.Join(tools.RunPython(
        'text_list',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights)
    )))

    language = tools.ParseLanguage(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.language))

    contributors = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'contributor',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Contributor),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'contributor',
                'contributor'
            )
        )
    )

    institutions = tools.Concat(
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(CreatorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'institution',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(ContributorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'institution',
                'contributor'
            )
        )
    )

    organizations = tools.Concat(
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(CreatorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'organization',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(ContributorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'organization',
                'contributor'
            )
        )
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx.record, 'metadata')['oai_datacite'], 'type'),
                tools.RunPython(
                    'text_list',
                    tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
                ),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format),
                tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status')
            )
        )
    )

    links = tools.Concat(
        tools.Map(
            tools.Delegate(ThroughLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier))
        ),
        tools.Map(
            tools.Delegate(ThroughAlternateLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier))
        ),
        tools.Map(
            tools.Delegate(ThroughRelatedLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier))
        )
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    date_published = tools.ParseDate(tools.RunPython(
        'get_date_type',
        tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)),
        'Issued'
    ))

    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights['@rightsURI'])

    free_to_read_date = tools.ParseDate(tools.RunPython(
        'get_date_type',
        tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)),
        'Available'
    ))

    funders = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Funder))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
            'funder',
            'contributor'
        )
    )

    venues = tools.Map(
        tools.Delegate(ThroughVenues),
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations.geoLocation)
    )

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        status = tools.Maybe(ctx.record.header, '@status')
        datestamp = tools.ParseDate(ctx.record.header.datestamp)
        set_spec = tools.Maybe(ctx.record.header, 'setSpec')
        is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)
        schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)
        datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)
        identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)
        alternate_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier)
        titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
        publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)
        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)
        resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)
        sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)
        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)
        version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)
        rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)
        rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)
        related_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)
        description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)
        dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)
        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)
        creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)
        geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

    def get_date_type(self, date_obj, date_type):
        try:
            date = None
            for obj in date_obj:
                if obj['@dateType'] == date_type:
                    date = obj['#text']
            if date and not DATE_REGEX.search(date):
                return None
        except KeyError:
            return None
        return date

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning('#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))

    def get_contributors(self, options, entity, field=None):
        """
        Returns list of organization, institution, or contributor names based on entity type.
        """
        if entity == 'organization':
            organizations = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    if field == 'creator':
                        organizations.append(val[field + 'Name'])
                    else:
                        organizations.append(val)
                elif (
                    value[field + 'Name']
                    and not self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value[field + 'Name'], self.ORGANIZATION_KEYWORDS)
                ):
                    if field == 'creator':
                        organizations.append(value[field + 'Name'])
                    else:
                        organizations.append(value)
            return organizations
        elif entity == 'institution':
            institutions = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    institutions.append(val)
                elif (
                    value[field + 'Name']
                    and self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                ):
                    institutions.append(value)
            return institutions
        elif entity == 'contributor':
            people = []
            for value in options:
                val = self.try_contributor_type(value, self.PEOPLE_TYPES)
                if val:
                    people.append(val)
                elif (
                    value[field + 'Name']
                    and not self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                    and not self.list_in_string(value[field + 'Name'], self.ORGANIZATION_KEYWORDS)
                ):
                    people.append(value)
            return people
        elif entity == 'funder':
            funders = []
            for value in options:
                val = self.try_contributor_type(value, ['Funder'])
                if val:
                    funders.append(val)
            return funders
        else:
            return options

    def try_contributor_type(self, value, target_list_types):
        try:
            contrib_type_item = value['@contributorType']
            if contrib_type_item in target_list_types:
                return value
            return None
        except KeyError:
            return None

    def list_in_string(self, string, list_):
        for word in list_:
            if isinstance(word, str):
                if word in string.lower():
                    return True
            else:
                if word.search(string):
                    return True
        return False
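
# Illustrative sketch (not part of the original module): `get_date_type` above picks
# a date out of the parsed <dates> list by its @dateType attribute, e.g. 'Issued' for
# date_published and 'Available' for free_to_read_date. The record fragment is invented.
def _example_get_date_type():
    dates = [
        {'@dateType': 'Available', '#text': '2016-05-01'},
        {'@dateType': 'Issued', '#text': '2016-04-15'},
    ]
    date_type = 'Issued'
    date = None
    for obj in dates:
        if obj['@dateType'] == date_type:
            date = obj['#text']
    return date  # -> '2016-04-15', later fed to tools.ParseDate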
class CreativeWork(Parser):
    '''
    Documentation for Datacite's metadata:
    https://schema.labs.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf
    '''

    def get_schema(self, type):
        return {
            'dataset': 'DataSet',
            'software': 'Software',
            'text/book': 'Book',
            'text/book chapter': 'Book',
            'text/book prospectus': 'Book',
            'text/book series': 'Book',
            'text/conference abstract': 'ConferencePaper',
            'text/conference paper': 'ConferencePaper',
            'text/conference poster': 'Poster',
            'text/dissertation': 'Dissertation',
            'text/edited book': 'Book',
            'text/journal article': 'Article',
            'text/journal issue': 'Article',
            'text/patent': 'Patent',
            'text/report': 'Report',
            'text/supervised student publication': 'Thesis',
            'text/working paper': 'WorkingPaper'
            # 'audiovisual': '',
            # 'collection': '',
            # 'event': '',
            # 'image': '',
            # 'interactiveresource': '',
            # 'model': '',
            # 'physicalobject': '',
            # 'service': '',
            # 'sound': '',
            # 'text15': '',
            # 'workflow': '',
            # 'text/book review': '',
            # 'text/conference program': '',
            # 'text/dictionary entry': '',
            # 'text/disclosure': '',
            # 'text/encyclopedia entry': '',
            # 'text/Funding submission': '',
            # 'text/license': '',
            # 'text/magazine article': '',
            # 'text/manual': '',
            # 'text/newsletter article': '',
            # 'text/newspaper article': '',
            # 'text/online resource': '',
            # 'text/registered copyright': '',
            # 'text/research tool': '',
            # 'text/tenure-promotion': '',
            # 'text/test': '',
            # 'text/trademark': '',
            # 'text/translation': '',
            # 'text/university academic unit': '',
            # 'text/website': '',
        }.get(type.lower()) or 'CreativeWork'

    schema = tools.RunPython(
        'get_schema',
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral'],
            default='CreativeWork'
        )
    )

    title = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
    )
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions.description[0])
    )

    rights = tools.Try(tools.Join(tools.RunPython(
        'text_list',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights)
    )))

    language = tools.ParseLanguage(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.language))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(CreatorRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator))
        ),
        tools.Map(
            tools.Delegate(ContributorRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor))
        ),
        tools.Map(
            tools.Delegate(PublisherRelation),
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        ),
        tools.Map(
            tools.Delegate(HostRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['HostingInstitution']
            )
        ),
        # v.3 Funder is a contributor type
        # v.4 FundingReference replaces funder contributor type
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['Funder']
            )
        ),
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference))
        )
    )

    # v.4 New, free text, 'subjectScheme' attribute on subject
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.RunPython(
            'text_list',
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
        ))
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx.record, 'metadata')['oai_datacite'], 'type'),
                tools.RunPython(
                    'text_list',
                    tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
                ),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format),
                tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status')
            )
        )
    )

    identifiers = tools.Concat(
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier))
        ),
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier))
        )
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(WorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                False
            )
        ),
        tools.Map(
            tools.Delegate(InverseWorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                True
            )
        )
    )

    date_updated = tools.ParseDate(tools.Try(ctx.record.header.datestamp))

    date_published = tools.ParseDate(tools.Try(tools.RunPython(
        'get_date_type',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
        'Issued'
    )))

    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights['@rightsURI'])

    free_to_read_date = tools.ParseDate(tools.Try(tools.RunPython(
        'get_date_type',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
        'Available'
    )))

    is_deleted = tools.RunPython('check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        status = tools.Try(ctx.record.header['@status'])
        datestamp = tools.ParseDate(ctx.record.header.datestamp)
        set_spec = tools.Try(ctx.record.header.setSpec)
        is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)
        schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)
        datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)
        identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)
        alternate_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier)
        titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
        publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)
        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)
        resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)
        sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)
        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)
        version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)
        rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)
        rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)
        related_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)
        description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)
        dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)
        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)
        creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)

        # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
        geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

        funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference)

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_date_type(self, date_obj, date_type):
        date = None
        for obj in date_obj:
            if obj['@dateType'] == date_type:
                date = obj['#text']
        if date and date != '0000':
            return date
        # raise KeyError to break TryLink
        raise KeyError()

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning('#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))
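
# Illustrative sketch (not part of the original module): how the @resourceTypeGeneral
# value from a DataCite 4.0 record maps onto a work type via `get_schema` above. The
# mapping excerpt is taken from the dict in the parser; anything not listed there
# falls back to 'CreativeWork'. The helper name is invented for the example.
def _example_get_schema(resource_type_general):
    mapping = {
        'dataset': 'DataSet',
        'software': 'Software',
        'text/journal article': 'Article',
        'text/dissertation': 'Dissertation',
    }
    return mapping.get(resource_type_general.lower()) or 'CreativeWork'

# _example_get_schema('Dataset')      -> 'DataSet'
# _example_get_schema('Audiovisual')  -> 'CreativeWork' (unmapped types fall back)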