Ejemplo n.º 1
0
class CreativeWork(Parser):
    title = tools.Join(ctx.record.metadata['DIF']['Entry_Title'])
    description = tools.Maybe(ctx.record.metadata['DIF']['Summary'], 'Abstract')

    organizations = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Organization))),
        tools.Maybe(ctx.record.metadata['DIF'], 'Data_Center')
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Name'),
        tools.Maybe(ctx.record.header, 'setSpec')
    )

    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Maybe(ctx.record.metadata['DIF'], 'Related_URL')
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    class Extra:
        entry_id = ctx.record.metadata['DIF']['Entry_ID']

        metadata_name = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Name')

        metadata_version = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Version')

        last_dif_revision_date = tools.Maybe(ctx.record.metadata['DIF'], 'Last_DIF_Revision_Date')

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')
Ejemplo n.º 2
0
    class Extra:
        entry_id = ctx.record.metadata['DIF']['Entry_ID']

        metadata_name = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Name')

        metadata_version = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Version')

        last_dif_revision_date = tools.Maybe(ctx.record.metadata['DIF'], 'Last_DIF_Revision_Date')

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')
Ejemplo n.º 3
0
class Project(Parser):
    title = ctx.attributes.title
    description = ctx.attributes.description
    contributors = tools.Map(tools.Delegate(Contributor), ctx['contributors'])
    institutions = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Institution))),
        ctx.embeds.affiliated_institutions.data
    )
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category, ctx.attributes.tags)
    rights = tools.Maybe(ctx, 'attributes.node_license')
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.links.html)

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
        files = ctx.relationships.files.links.related.href
        parent = tools.Maybe(ctx, 'relationships.parent.links.related.href')
        forks = ctx.relationships.forks.links.related.href
        root = ctx.relationships.root.links.related.href
        comments = ctx.relationships.comments.links.related.href
        registrations = ctx.relationships.registrations.links.related.href
        logs = ctx.relationships.logs.links.related.href
        node_links = ctx.relationships.node_links.links.related.href
        wikis = ctx.relationships.wikis.links.related.href
        children = ctx.relationships.children.links.related.href
        fork = ctx.attributes.fork
        date_modified = ctx.attributes.date_modified
        collection = ctx.attributes.collection
        registration = ctx.attributes.registration
        type = ctx.type
        id = ctx.id
Ejemplo n.º 4
0
class Person(Parser):
    given_name = tools.ParseName(ctx.name).first
    family_name = tools.ParseName(ctx.name).last
    additional_name = tools.ParseName(ctx.name).middle
    suffix = tools.ParseName(ctx.name).suffix
    affiliations = tools.Map(
        tools.Delegate(Affiliation.using(entity=tools.Delegate(Organization))),
        tools.Maybe(ctx, 'arxiv:affiliation'))
Ejemplo n.º 5
0
class Organization(Parser):
    ORGANIZATION_KEYWORDS = (
        'the',
        'center'
    )

    name = tools.RunPython('combine_name', ctx)
    url = tools.Maybe(ctx, 'Data_Center_URL')
    # TODO: handle when personnel are organizations
    affiliations = tools.Map(
        tools.Delegate(Affiliation),
        tools.RunPython(
            'get_personnel',
            tools.Maybe(ctx, 'Personnel'),
            'person'
        )
    )

    def combine_name(self, ctx):
        return ctx['Data_Center_Name']['Short_Name'] + ' ' + ctx['Data_Center_Name']['Long_Name']

    def get_personnel(self, options, entity):
        """
        Returns list based on entity type.
        """
        if not isinstance(options, list):
            options = [options]

        if entity == 'person':
            people = [
                value for value in options if
                (
                    not self.list_in_string(value['First_Name'], self.ORGANIZATION_KEYWORDS) and
                    not self.list_in_string(value['Last_Name'], self.ORGANIZATION_KEYWORDS)
                )
            ]
            return people
        else:
            return options

    def list_in_string(self, string, list_):
        if any(word in string.lower() for word in list_):
            return True
        return False
Ejemplo n.º 6
0
class Preprint(Parser):
    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    links = tools.Map(tools.Delegate(ThroughLinks),
                      tools.Maybe(ctx.entry, 'arxiv:doi'), ctx.entry.id)
    subject = tools.Delegate(Tag, ctx.entry['arxiv:primary_category'])
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        resource_id = ctx.entry.id
        journal_ref = tools.Maybe(ctx.entry, 'arxiv:journal_ref')
        comment = tools.Maybe(ctx.entry, 'arxiv:comment')
Ejemplo n.º 7
0
 class Extra:
     date_created = tools.ParseDate(ctx.attributes.date_created)
     files = ctx.relationships.files.links.related.href
     parent = tools.Maybe(ctx, 'relationships.parent.links.related.href')
     forks = ctx.relationships.forks.links.related.href
     root = ctx.relationships.root.links.related.href
     comments = ctx.relationships.comments.links.related.href
     registrations = ctx.relationships.registrations.links.related.href
     logs = ctx.relationships.logs.links.related.href
     node_links = ctx.relationships.node_links.links.related.href
     wikis = ctx.relationships.wikis.links.related.href
     children = ctx.relationships.children.links.related.href
     fork = ctx.attributes.fork
     date_modified = ctx.attributes.date_modified
     collection = ctx.attributes.collection
     registration = ctx.attributes.registration
     type = ctx.type
     id = ctx.id
Ejemplo n.º 8
0
class OAICreativeWork(Parser):
    schema = 'CreativeWork'

    ORGANIZATION_KEYWORDS = ('the', 'center')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'institute')

    title = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(
                            ctx['record']['metadata']['dc']['dc:title'])))
    description = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    publishers = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIPublisher))),
        tools.Map(tools.RunPython('force_text'),
                  tools.Try(ctx.record.metadata.dc['dc:publisher'])))

    rights = tools.Join(
        tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:rights'))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx['record']['metadata']['dc']['dc:language'][0]), )

    contributors = tools.Map(
        tools.Delegate(OAIContributor),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'contributor'))

    institutions = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIInstitution))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'institution'))

    organizations = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIOrganization))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'organization'))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.RunPython(
            'force_text',
            tools.Concat(
                tools.Try(ctx['record']['header']['setSpec']),
                tools.Try(ctx['record']['metadata']['dc']['dc:type']),
                tools.Try(ctx['record']['metadata']['dc']['dc:format']),
                tools.Try(ctx['record']['metadata']['dc']['dc:subject']),
            )))

    links = tools.Map(
        tools.Delegate(OAIThroughLinks),
        tools.RunPython(
            'get_links',
            tools.Concat(
                tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:relation'))))

    date_updated = tools.ParseDate(ctx['record']['header']['datestamp'])

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An entity responsible for making contributions to the resource.
        contributor = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage')

        # An entity primarily responsible for making the resource.
        creator = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator')

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date')

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format')

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
            tools.Maybe(ctx['record']['header'], 'identifier'))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source')

        # The topic of the resource.
        subject = tools.Try(ctx.record.metadata.dc['dc:subject'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Maybe(ctx.record.header, '@status')

    def get_links(self, ctx):
        links = []
        for link in ctx:
            if not link or not isinstance(link, str):
                continue
            found_url = URL_REGEX.search(link)
            if found_url is not None:
                links.append(found_url.group())
                continue

            found_doi = DOI_REGEX.search(link)
            if found_doi is not None:
                found_doi = found_doi.group()
                if 'dx.doi.org' in found_doi:
                    links.append(found_doi)
                else:
                    links.append('http://dx.doi.org/{}'.format(
                        found_doi.replace('doi:', '')))
        return links

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']

        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation', [])
        if isinstance(relation, dict):
            return relation['#text']
        return relation

    def get_contributors(self, options, entity):
        """
        Returns list of organization, institutions, or contributors names based on entity type.
        """
        options = [o if isinstance(o, str) else o['#text'] for o in options]

        if entity == 'organization':
            organizations = [
                value for value in options
                if (value and
                    not self.list_in_string(value, self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value, self.ORGANIZATION_KEYWORDS))
            ]
            return organizations
        elif entity == 'institution':
            institutions = [
                value for value in options
                if (value
                    and self.list_in_string(value, self.INSTITUTION_KEYWORDS))
            ]
            return institutions
        elif entity == 'contributor':
            people = [
                value for value in options
                if (value and not self.list_in_string(
                    value, self.INSTITUTION_KEYWORDS) and
                    not self.list_in_string(value, self.ORGANIZATION_KEYWORDS))
            ]
            return people
        else:
            return options

    def list_in_string(self, string, list_):
        if any(word in string.lower() for word in list_):
            return True
        return False
Ejemplo n.º 9
0
    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An entity responsible for making contributions to the resource.
        contributor = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage')

        # An entity primarily responsible for making the resource.
        creator = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator')

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date')

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format')

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
            tools.Maybe(ctx['record']['header'], 'identifier'))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source')

        # The topic of the resource.
        subject = tools.Try(ctx.record.metadata.dc['dc:subject'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Maybe(ctx.record.header, '@status')
Ejemplo n.º 10
0
    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        status = tools.Maybe(ctx.record.header, '@status')

        datestamp = tools.ParseDate(ctx.record.header.datestamp)

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        is_reference_quality = tools.Try(
            ctx.record.metadata['oai_datacite'].isReferenceQuality)

        schema_version = tools.Try(
            ctx.record.metadata['oai_datacite'].schemaVersion)

        datacentre_symbol = tools.Try(
            ctx.record.metadata['oai_datacite'].datacentreSymbol)

        identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.identifier)

        alternate_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            alternateIdentifiers.alternateidentifier)

        titles = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

        publisher = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.publisher)

        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].
                                     payload.resource.publicationYear)

        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                            resource.subjects.subject)

        resourceType = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

        sizes = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.size)

        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                resource.formats.format)

        version = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.version)

        rights = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rights)

        rightsList = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

        related_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            relatedIdentifiers.relatedIdentifier)

        description = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

        dates = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                 resource.contributors.contributor)

        creators = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.creators)

        geolocations = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)
Ejemplo n.º 11
0
class CreativeWork(Parser):

    # Schema definitions: http://schema.datacite.org/meta/kernel-3.1/doc/DataCite-MetadataKernel_v3.1.pdf

    PEOPLE_TYPES = ('ContactPerson', 'DataCurator', 'Editor', 'ProjectLeader',
                    'ProjectManager', 'ProjectMember', 'RelatedPerson',
                    'Researcher', 'Supervisor', 'WorkPackageLeader')
    CHECK_TYPES = ('DataCollector', 'DataManager', 'Producer', 'RightsHolder',
                   'Sponsor', 'Other')

    NOT_PEOPLE_TYPES = ('Distributor', 'HostingInstitution',
                        'RegistrationAgency', 'RegistrationAuthority',
                        'ResearchGroup')

    ORGANIZATION_KEYWORDS = (THE_REGEX, 'council', 'center', 'foundation')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'college',
                            'institute')

    title = tools.RunPython(
        force_text,
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title))
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  descriptions.description[0]))

    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.publisher))

    rights = tools.Try(
        tools.Join(
            tools.RunPython(
                'text_list',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.rightsList.rights))))

    language = tools.ParseLanguage(
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.language))

    contributors = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.creators.creator)), 'contributor',
                'creator')),
        tools.Map(
            tools.Delegate(Contributor),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                'contributor', 'contributor')))

    institutions = tools.Concat(
        tools.Map(
            tools.Delegate(
                Association.using(entity=tools.Delegate(CreatorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.creators.creator)), 'institution',
                'creator')),
        tools.Map(
            tools.Delegate(
                Association.using(
                    entity=tools.Delegate(ContributorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                'institution', 'contributor')))

    organizations = tools.Concat(
        tools.Map(
            tools.Delegate(
                Association.using(entity=tools.Delegate(CreatorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.creators.creator)), 'organization',
                'creator')),
        tools.Map(
            tools.Delegate(
                Association.using(
                    entity=tools.Delegate(ContributorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                'organization', 'contributor')))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx.record, 'metadata')['oai_datacite'],
                    'type'),
                tools.RunPython('text_list', (tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.subjects.subject)))),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          formats.format),
                tools.Try(
                    ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status'))))

    links = tools.Concat(
        tools.Map(
            tools.Delegate(ThroughLinks),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          identifier))),
        tools.Map(
            tools.Delegate(ThroughAlternateLinks),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          alternateIdentifiers.alternateidentifier))),
        tools.Map(
            tools.Delegate(ThroughRelatedLinks),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          relatedIdentifiers.relatedIdentifier))))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)
    date_published = tools.ParseDate(
        tools.RunPython(
            'get_date_type',
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          dates.date)), 'Issued'))
    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                  resource.rightsList.rights['@rightsURI'])
    free_to_read_date = tools.ParseDate(
        tools.RunPython(
            'get_date_type',
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          dates.date)), 'Available'))
    funders = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Funder))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          contributors.contributor)), 'funder', 'contributor'))
    venues = tools.Map(
        tools.Delegate(ThroughVenues),
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  geoLocations.geoLocation))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        status = tools.Maybe(ctx.record.header, '@status')

        datestamp = tools.ParseDate(ctx.record.header.datestamp)

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        is_reference_quality = tools.Try(
            ctx.record.metadata['oai_datacite'].isReferenceQuality)

        schema_version = tools.Try(
            ctx.record.metadata['oai_datacite'].schemaVersion)

        datacentre_symbol = tools.Try(
            ctx.record.metadata['oai_datacite'].datacentreSymbol)

        identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.identifier)

        alternate_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            alternateIdentifiers.alternateidentifier)

        titles = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

        publisher = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.publisher)

        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].
                                     payload.resource.publicationYear)

        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                            resource.subjects.subject)

        resourceType = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

        sizes = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.size)

        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                resource.formats.format)

        version = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.version)

        rights = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rights)

        rightsList = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

        related_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            relatedIdentifiers.relatedIdentifier)

        description = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

        dates = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                 resource.contributors.contributor)

        creators = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.creators)

        geolocations = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

    def get_date_type(self, date_obj, date_type):
        try:
            date = None
            for obj in date_obj:
                if obj['@dateType'] == date_type:
                    date = obj['#text']
            if date and not DATE_REGEX.search(date):
                return None
        except KeyError:
            return None
        return date

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning(
                    '#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))

    def get_contributors(self, options, entity, field=None):
        """
        Returns list of organization, institutions, or contributors names based on entity type.
        """
        if entity == 'organization':
            organizations = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    if field == 'creator':
                        organizations.append(val[field + 'Name'])
                    else:
                        organizations.append(val)
                elif (value[field + 'Name'] and not self.list_in_string(
                        value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                      and self.list_in_string(value[field + 'Name'],
                                              self.ORGANIZATION_KEYWORDS)):
                    if field == 'creator':
                        organizations.append(value[field + 'Name'])
                    else:
                        organizations.append(value)

            return organizations
        elif entity == 'institution':
            institutions = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    institutions.append(val)
                elif (value[field + 'Name'] and self.list_in_string(
                        value[field + 'Name'], self.INSTITUTION_KEYWORDS)):
                    institutions.append(value)

            return institutions
        elif entity == 'contributor':
            people = []
            for value in options:
                val = self.try_contributor_type(value, self.PEOPLE_TYPES)
                if val:
                    people.append(val)
                elif (value[field + 'Name'] and not self.list_in_string(
                        value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                      and not self.list_in_string(value[field + 'Name'],
                                                  self.ORGANIZATION_KEYWORDS)):
                    people.append(value)
            return people
        elif entity == 'funder':
            funders = []
            for value in options:
                val = self.try_contributor_type(value, ['Funder'])
                if val:
                    funders.append(val)
            return funders
        else:
            return options

    def try_contributor_type(self, value, target_list_types):
        try:
            contrib_type_item = value['@contributorType']
            if contrib_type_item in target_list_types:
                return value
            return None
        except KeyError:
            return None

    def list_in_string(self, string, list_):
        for word in list_:
            if isinstance(word, str):
                if word in string.lower():
                    return True
            else:
                if word.search(string):
                    return True
        return False
Ejemplo n.º 12
0
class CreativeWork(Parser):
    '''
    Documentation for Datacite's metadata:
    https://schema.labs.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf
    '''
    def get_schema(self, type):
        return {
            'dataset': 'DataSet',
            'software': 'Software',
            'text/book': 'Book',
            'text/book chapter': 'Book',
            'text/book prospectus': 'Book',
            'text/book series': 'Book',
            'text/conference abstract': 'ConferencePaper',
            'text/conference paper': 'ConferencePaper',
            'text/conference poster': 'Poster',
            'text/dissertation': 'Dissertation',
            'text/edited book': 'Book',
            'text/journal article': 'Article',
            'text/journal issue': 'Article',
            'text/patent': 'Patent',
            'text/report': 'Report',
            'text/supervised student publication': 'Thesis',
            'text/working paper': 'WorkingPaper'

            # 'audiovisual': '',
            # 'collection': '',
            # 'event': '',
            # 'image': '',
            # 'interactiveresource': '',
            # 'model': '',
            # 'physicalobject': '',
            # 'service': '',
            # 'sound': '',
            # 'text15': '',
            # 'workflow': '',
            # 'text/book review': '',
            # 'text/conference program': '',
            # 'text/dictionary entry': '',
            # 'text/disclosure': '',
            # 'text/encyclopedia entry': '',
            # 'text/Funding submission': '',
            # 'text/license': '',
            # 'text/magazine article': '',
            # 'text/manual': '',
            # 'text/newsletter article': '',
            # 'text/newspaper article': '',
            # 'text/online resource': '',
            # 'text/registered copyright': '',
            # 'text/research tool': '',
            # 'text/tenure-promotion': '',
            # 'text/test': '',
            # 'text/trademark': '',
            # 'text/translation': '',
            # 'text/university academic unit': '',
            # 'text/website': '',
        }.get(type.lower()) or 'CreativeWork'

    schema = tools.RunPython(
        'get_schema',
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  resourceType['@resourceTypeGeneral'],
                  default='CreativeWork'))

    title = tools.RunPython(
        force_text,
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title))
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  descriptions.description[0]))

    rights = tools.Try(
        tools.Join(
            tools.RunPython(
                'text_list',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.rightsList.rights))))

    language = tools.ParseLanguage(
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.language))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(CreatorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          creators.creator))),
        tools.Map(
            tools.Delegate(ContributorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          contributors.contributor))),
        tools.Map(
            tools.Delegate(PublisherRelation),
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                      publisher)),
        tools.Map(
            tools.Delegate(HostRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                ['HostingInstitution'])),
        # v.3 Funder is a contributor type
        # v.4 FundingReference replaces funder contributor type
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                ['Funder'])),
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          fundingReference))))

    # v.4 New, free text, 'subjectScheme' attribute on subject
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                'text_list',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.subjects.subject), ))))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx.record, 'metadata')['oai_datacite'],
                    'type'),
                tools.RunPython('text_list', (tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.subjects.subject)))),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          formats.format),
                tools.Try(
                    ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status'))))

    identifiers = tools.Concat(
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          identifier))),
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          alternateIdentifiers.alternateidentifier))))

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(WorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.relatedIdentifiers.relatedIdentifier)),
                False)),
        tools.Map(
            tools.Delegate(InverseWorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.relatedIdentifiers.relatedIdentifier)),
                True)))

    date_updated = tools.ParseDate(tools.Try(ctx.record.header.datestamp))
    date_published = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.dates.date), 'Issued')))
    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                  resource.rightsList.rights['@rightsURI'])
    free_to_read_date = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.dates.date), 'Available')))

    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        status = tools.Try(ctx.record.header['@status'])

        datestamp = tools.ParseDate(ctx.record.header.datestamp)

        set_spec = tools.Try(ctx.record.header.setSpec)

        is_reference_quality = tools.Try(
            ctx.record.metadata['oai_datacite'].isReferenceQuality)

        schema_version = tools.Try(
            ctx.record.metadata['oai_datacite'].schemaVersion)

        datacentre_symbol = tools.Try(
            ctx.record.metadata['oai_datacite'].datacentreSymbol)

        identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.identifier)

        alternate_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            alternateIdentifiers.alternateidentifier)

        titles = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

        publisher = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.publisher)

        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].
                                     payload.resource.publicationYear)

        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                            resource.subjects.subject)

        resourceType = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

        sizes = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.size)

        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                resource.formats.format)

        version = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.version)

        rights = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rights)

        rightsList = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

        related_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            relatedIdentifiers.relatedIdentifier)

        description = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

        dates = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                 resource.contributors.contributor)

        creators = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.creators)

        # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
        geolocations = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

        funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].
                                      payload.resource.fundingReference)

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_date_type(self, date_obj, date_type):
        date = None
        for obj in date_obj:
            if obj['@dateType'] == date_type:
                date = obj['#text']
        if date and date != '0000':
            return date
        # raise KeyError to break TryLink
        raise KeyError()

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning(
                    '#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))
Ejemplo n.º 13
0
 class Extra:
     description = tools.Maybe(ctx, 'Description')
     url_content_type = tools.Maybe(ctx.URL_Content_Type, 'Type')
Ejemplo n.º 14
0
 class Extra:
     role = tools.Maybe(ctx, 'Role')
Ejemplo n.º 15
0
 class Extra:
     resource_id = ctx.entry.id
     journal_ref = tools.Maybe(ctx.entry, 'arxiv:journal_ref')
     comment = tools.Maybe(ctx.entry, 'arxiv:comment')