class CreativeWork(Parser):
    """Build a creative work from a parsed HTML page.

    Most fields are read from schema.org ``itemprop`` microdata; the remaining
    ones come from ``<dt>``/``<dd>`` pairs looked up through ``get_dd``.
    """

    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])

    # Alternatives are listed in order of preference: the URL linked from the
    # "Rights" entry, then the entry's plain text, then no rights at all.
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))

    tags = tools.Map(tools.Delegate(ThroughTags),
                     Soup(ctx, itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor),
                  Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Raw text of additional <dt>/<dd> entries, kept alongside the
        # normalized fields above.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython(
            'get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        """Return the schema name for the ``itemtype`` of the first ``<div>``.

        Unknown item types fall back to ``'CreativeWork'``.
        """
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        """Return the text of the page's ``<h1>`` without its label span.

        The label ``<span class="label">`` is removed from the parsed tree
        itself (``decompose``), so it is gone for later readers of ``obj`` too.
        """
        heading = obj.h1.soup
        label = heading.find('span', class_='label')
        # find() returns None when no label is present; the heading text is
        # then already the bare title, so there is nothing to strip.
        if label is not None:
            label.decompose()
        return heading.get_text()

    def get_dd(self, obj, dt):
        """Return the ``<dd>`` following the ``<dt>`` whose string equals *dt*.

        The element is wrapped in ``SoupXMLDict``; ``None`` is returned when
        no matching ``<dt>`` exists.
        """
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        """Return the ``href`` of the link marked with the new-window icon.

        Raises when the "Rights" entry or the icon is missing.
        NOTE(review): presumably ``tools.OneOf`` treats that as "try the next
        alternative" -- confirm against the OneOf implementation.
        """
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
class Registration(Parser):
    """Map a source record onto a registration.

    Every value is looked up on the record via ``FIELDS``, which supplies the
    record key for each readable field name used below.
    """

    title = ctx[FIELDS['title']]
    description = ctx[FIELDS['summary']]

    # Both dates are derived from the same "registration date" value.
    date_published = tools.ParseDate(
        ctx[FIELDS['registration date']].timestamp)
    date_updated = tools.ParseDate(
        ctx[FIELDS['registration date']].timestamp)

    related_agents = tools.Concat(
        tools.Delegate(
            PrincipalInvestigator, ctx[FIELDS['primary investigator']]),
        tools.Delegate(
            OtherInvestigator, ctx[FIELDS['other investigator']]),
        tools.Map(
            tools.Delegate(AdditionalInvestigator),
            tools.RunPython(
                'split_names', ctx[FIELDS['additional investigators']])))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.RunPython('get_link', ctx.id))

    class Extra:
        # Remaining record fields, stored unmodified.
        registration_date = ctx[FIELDS['registration date']]
        questions_and_objectives = ctx[FIELDS['questions and objectives']]
        study_type = ctx[FIELDS['study type']]
        study_type_detail = ctx[FIELDS['study type other']]
        contact_details = ctx[FIELDS['contact details']]
        participating_institutions = ctx[FIELDS['participating institutions']]
        countries_of_recruitment = ctx[FIELDS['countries of recruitment']]
        funders = ctx[FIELDS['funders']]
        problems_studied = ctx[
            FIELDS['health conditions or problems studied']]
        patient_population = ctx[FIELDS['patient population']]
        interventions = ctx[FIELDS['interventions']]
        inclusion_criteria = ctx[FIELDS['inclusion criteria']]
        exclusion_criteria = ctx[FIELDS['exclusion criteria']]
        control_or_comparators = ctx[FIELDS['control or comparators']]
        primary_outcomes = ctx[FIELDS['primary outcomes']]
        key_secondary_outcomes = ctx[FIELDS['key secondary outcomes']]
        target_sample_size = ctx[FIELDS['target sample size']]
        recruitment_status = ctx[FIELDS['recruitment status']]
        other_recruitment_status = ctx[FIELDS['other recruitment status']]
        first_enrollment_date = ctx[FIELDS['first enrollment date']]
        expected_enrollment_completion_date = ctx[
            FIELDS['expected enrollment completion date']]
        expected_research_completion_date = ctx[
            FIELDS['expected research completion date']]
        ethical_approval = ctx[FIELDS['ethical approval']]
        ethical_approval_details = ctx[FIELDS['ethical approval details']]
        ethical_committee_judgment = ctx[FIELDS['ethical committee judgment']]
        data = ctx[FIELDS['data']]
        published_paper = ctx[FIELDS['published paper identifier']]
        study_website = ctx[FIELDS['study website']]
        study_results = ctx[FIELDS['study results']]

    def get_link(self, id):
        """Return ``LINK_FORMAT`` filled in with the record *id*."""
        return LINK_FORMAT.format(id)

    def split_names(self, obj):
        """Split a comma-separated name string; ``None`` for empty input.

        The pieces are returned exactly as split (surrounding whitespace is
        not removed).
        """
        return obj.split(',') if obj else None
class Preprint(Parser):
    """Map a dict of page metadata tags (``DC.*``, ``citation_*``, ``og:*``)
    onto a preprint."""

    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # is_deleted
    date_published = tools.ParseDate(
        tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # free_to_read_type
    # free_to_read_date
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])

    # 'Biology' is always supplied, next to whatever 'subject-areas' yields.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Static('Biology'),
        tools.Subjects(tools.Try(ctx['subject-areas'])))
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx['category']),
        tools.Try(ctx['subject-areas']))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx['og:url']),
        ctx['citation_public_url'],
        ctx['citation_doi'])

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(Publisher),
            tools.Try(ctx['DC.Publisher'])),
        tools.Map(
            tools.Delegate(Creator),
            tools.RunPython('get_contributors', ctx)))
    # related_works

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    def get_contributors(self, link):
        """Pair up the author, e-mail and institution tags by position.

        Each of the three ``citation_author*`` entries may be missing, a
        single value or a list. The lists are zipped with ``zip_longest``, so
        shorter ones are padded with ``None``.

        Returns a list of dicts with the keys ``author``, ``institution`` and
        ``email``.
        """

        def as_list(key):
            # A missing key counts as an empty list; a lone value is wrapped.
            if isinstance(link.get(key, []), list):
                return link.get(key, [])
            return [link[key]]

        authors = as_list('citation_author')
        institutions = as_list('citation_author_institution')
        emails = as_list('citation_author_email')

        return [
            {'author': author, 'institution': institution, 'email': email}
            for author, email, institution in itertools.zip_longest(
                authors, emails, institutions)
        ]
class Extra:
    # Optional source fields kept as-is; each lookup is wrapped in Try so a
    # missing key does not fail the record.
    access_rights = tools.Try(ctx['access-rights'])
    usage_rights = tools.Try(ctx['usage-rights'])
    collection_statistics = tools.Try(ctx['collection-statistics'])
    management = tools.Try(ctx['management'])
    collection_type = tools.Try(ctx['collection-type'])
    # The only field here that is post-processed: parsed as a date.
    last_update = tools.ParseDate(
        tools.Try(ctx['last-update']))
class Award(Parser):
    """Map an award record onto an award."""

    name = ctx.title
    # The description carries the same source value as award_amount, but
    # without the integer conversion.
    description = ctx.fundsObligatedAmt
    award_amount = tools.Int(ctx.fundsObligatedAmt)
    date = tools.ParseDate(ctx.date)
    uri = tools.RunPython(format_url, ctx.id)

    class Extra:
        # Source attributes kept verbatim. Lookups wrapped in Try are the
        # ones allowed to be absent.
        funds_obligated_amt = ctx.fundsObligatedAmt
        award_id = ctx.id
        transaction_type = ctx.transType
        estimated_total_amt = tools.Try(ctx.estimatedTotalAmt)
        catalog_of_federal_domestic_assistance_number = tools.Try(
            ctx.cfdaNumber)

        date = ctx.date
        date_start = tools.Try(ctx.startDate)
        date_expiration = tools.Try(ctx.expDate)

        awardee = tools.Try(ctx.awardee)
        awardee_address = tools.Try(ctx.awardeeAddress)
        awardee_name = ctx.awardeeName
        awardee_city = tools.Try(ctx.awardeeCity)
        awardee_county = tools.Try(ctx.awardeeCounty)
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
        awardee_country_code = tools.Try(ctx.awardeeCountryCode)
        awardee_district_code = tools.Try(ctx.awardeeDistrictCode)
        awardee_zip_code = tools.Try(ctx.awardeeZipCode)
class Preprint(Parser):
    """Map a feed ``item`` with Dublin Core (``dc:*``) keys onto a preprint."""

    title = ctx.item['dc:title']
    description = ctx.item.description

    # The item exposes a single date; it is used for both fields.
    date_published = tools.ParseDate(ctx.item['dc:date'])
    date_updated = tools.ParseDate(ctx.item['dc:date'])

    # Every item is filed under the fixed subject 'Biology'.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology'))
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.item['dc:identifier'])

    related_agents = tools.Concat(
        tools.Delegate(Publisher, ctx.item['dc:publisher']),
        tools.Map(
            tools.Delegate(Creator),
            ctx.item['dc:creator']),
    )
class CreativeWork(Parser):
    """Map a normalized document (title, contributors, uris, ...) onto a
    creative work."""

    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython(
        '_is_deleted', tools.Try(ctx.otherProperties))
    date_updated = tools.ParseDate(tools.Try(ctx.providerUpdatedDateTime))
    rights = tools.Join(tools.Try(ctx.licenses.uri))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.languages[0]),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)))

    # All four URI groups are merged and de-duplicated (see ``unique``)
    # before each entry is passed through IRI().
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(
                    tools.Try(ctx.uris.canonicalUri),
                    tools.Try(ctx.uris.providerUris),
                    tools.Try(ctx.uris.descriptorUris),
                    tools.Try(ctx.uris.objectUris)))))

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.Try(ctx.subjects)))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.tags),
        tools.Try(ctx.subjects))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        """Return the distinct *items* as a sorted list."""
        return sorted(set(items))

    def _is_deleted(self, properties):
        """Tell whether the first property named 'status' lists 'deleted'.

        ``properties`` may be ``None`` or empty; the result is then ``False``,
        as it is when no property is named 'status'.
        """
        status_prop = next(
            (prop for prop in properties or [] if prop['name'] == 'status'),
            None)
        if status_prop is None:
            return False
        return 'deleted' in status_prop['properties'].get('status', [])
class Registration(Parser):
    """Map a trial record, organised in named sections such as
    'general-information' and 'additional-trial-info', onto a registration."""

    title = tools.Try(ctx['general-information']['title'])
    description = tools.Try(ctx['additional-trial-info']['abstract'])
    date_updated = tools.ParseDate(
        tools.Try(ctx['general-information']['last-updated']))
    date_published = tools.ParseDate(
        tools.Try(ctx['general-information']['published-at']))

    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.pi))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.IRI(ctx['general-information']['url'])),
    )

    # Keywords go through process_keywords before being matched as subjects.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                process_keywords,
                tools.Try(ctx['additional-trial-info']['keywords']),
            )
        )
    )

    # Tags are the processed keywords plus the trial status and JEL code.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Concat(
            tools.RunPython(
                process_keywords,
                tools.Try(ctx['additional-trial-info']['keywords']),
            ),
            tools.Try(ctx['additional-trial-info']['status']),
            tools.Try(ctx['additional-trial-info']['jel-code'])
        )
    )

    class Extra:
        # Whole sections of the record, kept unmodified when present.
        general_information = tools.Try(ctx['general-information'])
        additional_trial_information = tools.Try(ctx['additional-trial-info'])
        publication_data = tools.Try(ctx['data-publication'])
        primary_investigator = tools.Try(ctx['pi'])
        interventions = tools.Try(ctx['interventions'])
        outcomes = tools.Try(ctx['outcomes'])
        experimental_design = tools.Try(ctx['experimental-design'])
        experimental_characteristics = tools.Try(
            ctx['experimental-characteristics'])
        supporting_document_material = tools.Try(
            ctx['supporting-doc-material'])
        post_trial = tools.Try(ctx['post-trial'])
        reports_papers = tools.Try(ctx['reports-papers'])
class Award(Parser):
    """Map an award record onto an award."""

    name = ctx.title
    # Same source value as award_amount, without the integer conversion.
    description = ctx.fundsObligatedAmt
    award_amount = tools.Int(ctx.fundsObligatedAmt)
    date = tools.ParseDate(ctx.date)
    uri = tools.RunPython(format_url, ctx.id)

    class Extra:
        # Source attributes kept verbatim; Try marks the ones that may be
        # absent from a record.
        funds_obligated_amt = ctx.fundsObligatedAmt
        award_id = ctx.id

        awardee_name = tools.Try(ctx.awardeeName)
        awardee_city = ctx.awardeeCity
        awardee_state_code = tools.Try(ctx.awardeeStateCode)

        date = ctx.date
class Preprint(osf.Project):
    """Preprint variant of ``osf.Project``; the fields below are declared on
    top of whatever the base class provides."""

    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)

    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        ctx.attributes.subjects)

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.links.self,
        ctx.links.html,
        tools.Try(ctx.links.doi))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.attributes.tags))
    rights = tools.Try(ctx.attributes.node_license)

    related_works = tools.Static([])

    # Contributors are split on their 'bibliographic' flag: flagged ones
    # become creators, the rest plain contributors.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(
                lambda x: x['attributes']['bibliographic'],
                ctx.contributors)),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(
                lambda x: not x['attributes']['bibliographic'],
                ctx.contributors)),
    )
class CreativeWork(Parser):
    """Map a record onto a creative work; the whole record is handed to the
    identifier and agent-relation parsers."""

    title = ctx.title

    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx)

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(FunderRelation), ctx),
        tools.Map(tools.Delegate(ContributorRelation), ctx),
        # Only records that carry an 'awardeeName' get this relation.
        tools.Map(
            tools.Delegate(AgentWorkRelation),
            tools.Filter(lambda x: 'awardeeName' in x, ctx)))

    date_updated = tools.ParseDate(ctx.date)

    class Extra:
        public_access_mandate = ctx.publicAccessMandate
class CreativeWork(Parser):
    """Map a record exposing ``attributes`` and ``links`` onto a creative
    work. Fields that are not populated are left as commented placeholders."""

    title = ctx.attributes.title
    description = ctx.attributes.description
    is_deleted = tools.Static(False)
    # date_published =
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # free_to_read_type =
    # free_to_read_date =
    # rights = tools.Try(ctx.attributes.node_license)  Doesn't seem to have any useful information
    # language =

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.links.html,
        ctx.links.self)

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        ctx.attributes.category,
        ctx.attributes.tags)

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
class DataSet(Parser):
    """Map a harvested record whose metadata payload is a ``DIF`` document
    onto a data set."""

    title = tools.Join(tools.Try(ctx.record.metadata.DIF.Entry_Title))
    description = tools.Try(ctx.record.metadata.DIF.Summary.Abstract)

    related_agents = tools.Map(
        tools.Delegate(AgentWorkRelation),
        tools.Try(ctx.record.metadata.DIF.Data_Center)
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.record.metadata.DIF.Metadata_Name),
        tools.Try(ctx.record.header.setSpec)
    )

    # The whole DIF document is handed to WorkIdentifier.
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx.record.metadata.DIF))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython(
        'check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        status = tools.Try(ctx.record.header['@status'])

        entry_id = tools.Try(ctx.record.metadata.DIF.Entry_ID)

        metadata_name = tools.Try(ctx.record.metadata.DIF.Metadata_Name)

        metadata_version = tools.Try(ctx.record.metadata.DIF.Metadata_Version)

        last_dif_revision_date = tools.Try(
            ctx.record.metadata.DIF.Last_DIF_Revision_Date)

        # setSpec is optional in a record header (``tags`` above already
        # guards it), so guard it here too rather than failing the record.
        set_spec = tools.Try(ctx.record.header.setSpec)

    def check_status(self, status):
        """Return ``True`` only when the header status is 'deleted'."""
        return status == 'deleted'
class Extra:
    # Creation date of the record, parsed from ``attributes.date_created``.
    date_created = tools.ParseDate(
        ctx.attributes.date_created)
class CreativeWork(Parser):
    '''
    Documentation for Datacite's metadata:
    https://schema.labs.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf
    '''

    def get_schema(self, type):
        """Map a resource type (matched case-insensitively) to a schema name.

        Types without an entry below -- including the commented-out ones --
        resolve to 'CreativeWork'.
        """
        return {
            'dataset': 'DataSet',
            'software': 'Software',
            'text/book': 'Book',
            'text/book chapter': 'Book',
            'text/book prospectus': 'Book',
            'text/book series': 'Book',
            'text/conference abstract': 'ConferencePaper',
            'text/conference paper': 'ConferencePaper',
            'text/conference poster': 'Poster',
            'text/dissertation': 'Dissertation',
            'text/edited book': 'Book',
            'text/journal article': 'Article',
            'text/journal issue': 'Article',
            'text/patent': 'Patent',
            'text/report': 'Report',
            'text/supervised student publication': 'Thesis',
            'text/working paper': 'WorkingPaper'

            # 'audiovisual': '',
            # 'collection': '',
            # 'event': '',
            # 'image': '',
            # 'interactiveresource': '',
            # 'model': '',
            # 'physicalobject': '',
            # 'service': '',
            # 'sound': '',
            # 'text15': '',
            # 'workflow': '',
            # 'text/book review': '',
            # 'text/conference program': '',
            # 'text/dictionary entry': '',
            # 'text/disclosure': '',
            # 'text/encyclopedia entry': '',
            # 'text/Funding submission': '',
            # 'text/license': '',
            # 'text/magazine article': '',
            # 'text/manual': '',
            # 'text/newsletter article': '',
            # 'text/newspaper article': '',
            # 'text/online resource': '',
            # 'text/registered copyright': '',
            # 'text/research tool': '',
            # 'text/tenure-promotion': '',
            # 'text/test': '',
            # 'text/trademark': '',
            # 'text/translation': '',
            # 'text/university academic unit': '',
            # 'text/website': '',
        }.get(type.lower()) or 'CreativeWork'

    schema = tools.RunPython(
        'get_schema',
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral'],
            default='CreativeWork'))

    title = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title),
        first_str=True)

    description = tools.Try(
        tools.Join(
            tools.RunPython(
                force_text,
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions.description))))

    rights = tools.Try(
        tools.Join(
            tools.RunPython(
                force_text,
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights))))

    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.language))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(CreatorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator))),
        tools.Map(
            tools.Delegate(ContributorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor))),
        tools.Map(
            tools.Delegate(PublisherRelation),
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)),
        tools.Map(
            tools.Delegate(HostRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['HostingInstitution'])),
        # v.3 Funder is a contributor type
        # v.4 FundingReference replaces funder contributor type
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['Funder'])),
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference))))

    # v.4 New, free text, 'subjectScheme' attribute on subject
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                force_text,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject),
                ))))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx.record, 'metadata')['oai_datacite'],
                    'type'),
                tools.RunPython(force_text, (tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)))),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format),
                tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status'))))

    identifiers = tools.Concat(
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier))),
        tools.Map(
            tools.Delegate(
                WorkIdentifier,
                tools.RunPython(force_text, ctx.alternateIdentifier)),
            # alternateIdentifiers is an optional element; guard it like the
            # other optional paths so its absence does not fail the record.
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers)),
    )

    # The same related identifiers feed both maps; the trailing flag passed
    # to get_related_works selects which relations each one receives.
    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(WorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                False)),
        tools.Map(
            tools.Delegate(InverseWorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                True)))

    date_updated = tools.ParseDate(tools.Try(ctx.record.header.datestamp))

    date_published = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
                'Issued')))

    free_to_read_type = tools.Try(
        tools.IRI(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights['@rightsURI']),
        exceptions=(ValueError, ))

    free_to_read_date = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
                'Available')))

    is_deleted = tools.RunPython(
        'check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        status = tools.Try(ctx.record.header['@status'])

        datestamp = tools.ParseDate(ctx.record.header.datestamp)

        set_spec = tools.Try(ctx.record.header.setSpec)

        is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)

        schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)

        datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)

        identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)

        # The child element is 'alternateIdentifier' (camelCase), the same
        # key the identifiers parser above reads.
        alternate_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateIdentifier)

        titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

        publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)

        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)

        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)

        resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

        sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)

        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)

        version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)

        rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)

        rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

        related_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)

        description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

        dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)

        creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)

        # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
        geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

        funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference)

    def check_status(self, status):
        """Return ``True`` only when the header status is 'deleted'."""
        return status == 'deleted'

    def get_date_type(self, date_obj, date_type):
        """Return the text of the date entry whose ``@dateType`` is *date_type*.

        When several entries match, the last one wins. A missing date or the
        placeholder '0000' counts as "no date".

        Raises:
            KeyError: when no usable date of that type exists.
        """
        date = None
        for obj in date_obj:
            if obj['@dateType'] == date_type:
                date = obj['#text']
        if date and date != '0000':
            return date
        # raise KeyError to break TryLink
        raise KeyError()
class Extra:
    """
    Fields that are combined in the base parser are relisted as singular elements that match
    their original entry to preserve raw data structure.
    """
    status = tools.Try(ctx.record.header['@status'])

    datestamp = tools.ParseDate(ctx.record.header.datestamp)

    set_spec = tools.Try(ctx.record.header.setSpec)

    is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)

    schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)

    datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)

    identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)

    # DataCite names the child element 'alternateIdentifier' (camelCase);
    # the all-lowercase spelling never matched, so this extra was always empty.
    alternate_identifiers = tools.Try(
        ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateIdentifier)

    titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

    publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)

    publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)

    subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)

    resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

    sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)

    format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)

    version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)

    rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)

    rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

    related_identifiers = tools.Try(
        ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)

    description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

    dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

    contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)

    creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)

    # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
    geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

    funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference)
class OAICreativeWork(Parser):
    """Map a harvested record with Dublin Core (``dc:*``) metadata onto a
    creative work.

    ``default_type`` and ``type_map`` are both ``None`` here and are consulted
    by ``get_schema``.
    NOTE(review): presumably set per source by subclasses or configuration --
    confirm where they are assigned.
    """

    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)))

    title = tools.Join(
        tools.RunPython(force_text, tools.Try(ctx.record.metadata.dc['dc:title'])))

    description = tools.Join(
        tools.RunPython(force_text, tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        force_text,
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier'])))))))

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(
                    tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                    tools.RunPython('get_relation', ctx)))))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(OAICreator),
            tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(
            tools.Delegate(OAIContributor),
            tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython(force_text, tools.Try(ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]),
    )

    # Subjects and tags are drawn from the same four sources, each split
    # into tokens by ``tokenize``.
    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    force_text,
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )))))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    force_text,
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    ))),
            deep=True))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython(
        'check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        """Return ``True`` only when the header status is 'deleted'."""
        return status == 'deleted'

    def get_schema(self, types):
        """Return the schema for the first of *types* found in ``type_map``.

        *types* may be a single string or a list whose entries are strings or
        dicts carrying the text under '#text'; matching is case-insensitive.
        ``default_type`` is returned when *types* or ``type_map`` is empty, or
        when nothing matches.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        """Split each string in *data* on ' - ', '.' and ',' and return all
        non-empty pieces, stripped, as one flat list.

        A single string is treated as a one-element list.
        """
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        """Return the record's ``dc:relation`` values that do not merely
        repeat one of its own identifiers.

        Both ``dc:relation`` and ``dc:identifier`` may be a single string, a
        single dict (text under '#text') or a list of either. Values are
        compared with 'http', ':' and '/' removed, and a relation is dropped
        when it occurs anywhere inside the concatenated identifiers (the
        ``dc:identifier`` values plus the header identifier).

        Returns an empty list when the record has no metadata.
        """
        record = ctx['record']
        if not record.get('metadata'):
            return []
        dc = record['metadata']['dc']

        relation = dc.get('dc:relation') or []
        identifiers = dc.get('dc:identifier') or []

        if isinstance(identifiers, (dict, str)):
            identifiers = (identifiers, )
        identifiers = ''.join(
            i['#text'] if isinstance(i, dict) else i for i in identifiers if i)
        identifiers = re.sub(
            'http|:|/', '', identifiers + record['header']['identifier'])

        # A lone relation arrives as a bare string or dict rather than a
        # list; wrap it so it is not iterated character by character.
        if isinstance(relation, (dict, str)):
            relation = (relation, )
        # List entries may themselves be dicts; reduce them to their text.
        relation = [
            r['#text'] if isinstance(r, dict) else r for r in relation if r
        ]

        return [
            r for r in relation
            if r and re.sub('http|:|/', '', r) not in identifiers
        ]
class MODSCreativeWork(Parser):
    """Declarative parser that maps a MODS record onto creative-work fields.

    Record fields are read from ``mods:``-prefixed keys of the context and
    from its ``header``. ``type_map`` (lower-cased genre -> schema type) and
    ``role_map`` (lower-cased role term -> role name) default to ``None``;
    presumably subclasses supply them - verify against the subclasses.
    """
    default_type = 'CreativeWork'
    type_map = None
    role_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(
            tools.RunPython(force_text, ctx['mods:genre']),
            tools.Static(None)
        )
    )

    title = tools.RunPython('join_title_info', ctx)

    # Abstracts have the optional attribute "shareable". Don't bother checking for it, because
    # abstracts that are not shareable should not have been shared with SHARE.
    description = tools.Join(tools.RunPython(force_text, tools.Try(ctx['mods:abstract']), '\n'))

    # Candidate identifiers come from mods:identifier, the header identifier
    # and mods:location/mods:url; entries containing 'invalid' are filtered out.
    identifiers = tools.Map(
        tools.Delegate(MODSWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Concat(
                            tools.Try(ctx['mods:identifier']),
                            tools.Try(ctx.header['identifier']),
                            tools.Try(ctx['mods:location']['mods:url']),
                        )
                    )
                )
            )
        )
    )

    related_works = tools.Concat(
        tools.Map(tools.Delegate(MODSWorkRelation), tools.Try(ctx['mods:relatedItem']))
    )

    # Names are routed by role; the ``invert=True`` call picks up every name
    # that matched none of the four explicit roles.
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(MODSCreator), tools.RunPython('filter_names', ctx, 'creator')),
        tools.Map(tools.Delegate(MODSFunder), tools.RunPython('filter_names', ctx, 'funder')),
        tools.Map(tools.Delegate(MODSHost), tools.RunPython('filter_names', ctx, 'host')),
        tools.Map(tools.Delegate(MODSPublisher), tools.RunPython('filter_names', ctx, 'publisher')),
        tools.Map(
            tools.Delegate(MODSContributor),
            tools.RunPython('filter_names', ctx, 'creator', 'funder', 'host', 'publisher', invert=True)
        ),
        tools.Map(
            tools.Delegate(MODSSimplePublisher),
            tools.Try(ctx['mods:originInfo']['mods:publisher']),
        ),
    )

    rights = tools.RunPython(force_text, tools.Try(ctx['mods:accessCondition']), '\n')

    language = tools.ParseLanguage(
        tools.Try(ctx['mods:language']['mods:languageTerm']),
    )

    subjects = tools.Map(
        tools.Delegate(MODSThroughSubjects),
        tools.Subjects(
            tools.Concat(
                tools.Try(ctx['mods:subject']['mods:topic']),
            )
        )
    )

    tags = tools.Map(
        tools.Delegate(MODSThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Try(ctx.header.setSpec),
                    tools.Try(ctx['mods:genre']),
                    tools.Try(ctx['mods:classification']),
                    tools.Try(ctx['mods:subject']['mods:topic']),
                )
            ),
            deep=True
        )
    )

    date_updated = tools.ParseDate(tools.Try(ctx.header.datestamp))

    # TODO (in regulator) handle date ranges, uncertain dates ('1904-1941', '1890?', '1980-', '19uu', etc.)
    date_published = tools.OneOf(
        tools.ParseDate(tools.RunPython(force_text, tools.Try(ctx['mods:originInfo']['mods:dateIssued']))),
        tools.Static(None)
    )

    # NOTE(review): every other header lookup in this class goes through
    # ``ctx.header``; this one uses ``ctx.record.header``. Confirm that path
    # exists for MODS records - if it does not, this presumably never sees
    # 'deleted'.
    is_deleted = tools.RunPython(lambda status: status == 'deleted', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """

        # (dc:description) http://www.loc.gov/standards/mods/userguide/abstract.html
        abstract = tools.Try(ctx['mods:abstract'])

        # (dc:rights) http://www.loc.gov/standards/mods/userguide/accesscondition.html
        accessConditions = tools.Try(ctx['mods:accessCondition'])

        # (dc:subject) http://www.loc.gov/standards/mods/userguide/classification.html
        classification = tools.Try(ctx['mods:classification'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/extension.html
        extension = tools.Try(ctx['mods:extension'])

        # SHARE type
        # (dc:type) http://www.loc.gov/standards/mods/userguide/genre.html
        genre = tools.Try(ctx['mods:genre'])

        # (dc:identifier) http://www.loc.gov/standards/mods/userguide/identifier.html
        identifier = tools.Try(ctx['mods:identifier'])

        # (dc:language) http://www.loc.gov/standards/mods/userguide/language.html
        language = tools.Try(ctx['mods:language'])

        # (dc:identifier for url) http://www.loc.gov/standards/mods/userguide/location.html
        location = tools.Try(ctx['mods:location'])

        # (dc:creator|dc:contributor) http://www.loc.gov/standards/mods/userguide/name.html
        name = tools.Try(ctx['mods:name'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/note.html
        note = tools.Try(ctx['mods:note'])

        # (dc:publisher|dc:date) http://www.loc.gov/standards/mods/userguide/origininfo.html
        originInfo = tools.Try(ctx['mods:originInfo'])

        # Extra
        # (dc:title) http://www.loc.gov/standards/mods/userguide/part.html
        part = tools.Try(ctx['mods:part'])

        # (dc:format or N/A) http://www.loc.gov/standards/mods/userguide/physicaldescription.html
        physicalDescription = tools.Try(ctx['mods:physicalDescription'])

        # Metadata information
        # (N/A) http://www.loc.gov/standards/mods/userguide/recordinfo.html
        recordInfo = tools.Try(ctx['mods:recordInfo'])

        # (dc:relation) http://www.loc.gov/standards/mods/userguide/relateditem.html
        relatedItem = tools.Try(ctx['mods:relatedItem'])

        # (dc:subject|dc:type|dc:coverage|N/A) http://www.loc.gov/standards/mods/userguide/subject.html
        subject = tools.Try(ctx['mods:subject'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/tableofcontents.html
        tableOfContents = tools.Try(ctx['mods:tableOfContents'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/targetaudience.html
        targetAudience = tools.Try(ctx['mods:targetAudience'])

        # (dc:title) http://www.loc.gov/standards/mods/userguide/titleinfo.html
        titleInfo = tools.Try(ctx['mods:titleInfo'])

        # Extra
        # (dc:type) http://www.loc.gov/standards/mods/userguide/typeofresource.html
        typeOfResource = tools.Try(ctx['mods:typeOfResource'])

    def get_schema(self, types):
        """Map the record's genre value(s) onto a schema type name.

        ``types`` may be a single string, or an iterable whose items are
        strings or dicts carrying their text under ``'#text'``. The first
        item whose lower-cased text is a key of ``self.type_map`` wins;
        ``self.default_type`` is returned when nothing matches, when
        ``types`` is empty, or when no ``type_map`` is configured.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        """Split one string, or each string of an iterable, into tag tokens.

        Items are split on ``' - '``, ``'.'`` and ``','``. Every piece is
        stripped *before* the emptiness check, so whitespace-only pieces
        (e.g. the middle of ``'a, , b'``) no longer leak through as ``''``.
        """
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            pieces = (piece.strip() for piece in re.split(r'(?: - )|\.|,', item))
            tokens.extend(piece for piece in pieces if piece)
        return tokens

    # Map titleInfos to a string: https://www.loc.gov/standards/mods/userguide/titleinfo.html#mappings
    def join_title_info(self, obj):
        """Join every ``mods:titleInfo`` of ``obj`` into one title string.

        Within a titleInfo the parts are concatenated in the order nonSort,
        title, ``': '`` + subTitle, ``'. '`` + partNumber, ``': '`` +
        partName; missing or blank parts are skipped. Non-empty titles are
        then joined with ``'. '``.
        """
        def get_part(title_info, part_name, delimiter=''):
            # Blank parts contribute nothing, not even their delimiter.
            part = force_text(title_info.get(part_name, ''), ' ').strip()
            return delimiter + part if part else ''

        title_infos = get_list(obj, 'mods:titleInfo')
        titles = []
        for title_info in title_infos:
            title = ''
            # NOTE(review): nonSort is stripped and glued straight onto the
            # title, so 'The ' + 'Title' yields 'TheTitle' - confirm intent.
            title += get_part(title_info, 'mods:nonSort')
            title += get_part(title_info, 'mods:title')
            title += get_part(title_info, 'mods:subTitle', ': ')
            title += get_part(title_info, 'mods:partNumber', '. ')
            title += get_part(title_info, 'mods:partName', ': ')
            if title:
                titles.append(title)
        return '. '.join(titles)

    def filter_names(self, obj, *roles, invert=False):
        """Select the ``mods:name`` entries of ``obj`` by role.

        A name matches when any one of its ``mods:role`` elements has a
        ``mods:roleTerm`` whose lower-cased text - or that text's
        translation through ``self.role_map`` - is one of ``roles``.

        Args:
            obj: mapping holding the record's ``mods:name`` entries.
            *roles: role names to look for.
            invert: when True, return the names that match none of
                ``roles`` instead of those that match.

        Returns:
            The selected names in document order. Each name appears once,
            however many of its roles matched; the previous implementation
            appended duplicates and, with ``invert=True``, raised
            ``ValueError`` by removing the same name twice.
        """
        # role_map defaults to None on this class; treat that as "no aliases".
        role_map = self.role_map or {}
        wanted = set(roles)

        def has_wanted_role(name):
            for role in get_list(name, 'mods:role'):
                terms = {force_text(term).lower() for term in get_list(role, 'mods:roleTerm')}
                terms.update({role_map[term] for term in terms if term in role_map})
                if terms & wanted:
                    return True
            return False

        return [
            name for name in get_list(obj, 'mods:name')
            if has_wanted_role(name) != invert
        ]
class CreativeWork(Parser):
    """Declarative parser mapping one award record onto creative-work fields.

    Every attribute reads a field of the context record; the nested
    ``Extra`` class relists further record fields individually.
    """
    # https://www.research.gov/common/webapi/awardapisearch-v1.htm#request-parameters

    title = ctx.title
    description = ctx.abstractText
    # The whole record is handed to the delegate parser, not one field of it.
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx)
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(FunderRelation), ctx),
        tools.Map(tools.Delegate(PIContributorRelation), ctx),
        # Program-officer and awardee relations are only built when the
        # record actually carries the corresponding name.
        tools.Map(tools.Delegate(POContributorRelation),
                  tools.Filter(lambda x: x.get('poName') is not None, ctx)),
        tools.Map(
            tools.Delegate(AgentWorkRelation),
            tools.Filter(lambda x: x.get('awardeeName') is not None, ctx)))
    date_updated = tools.ParseDate(ctx.date)

    class Extra:
        # Most fields are wrapped in tools.Try; ``date``, ``pi_last_name`` and
        # ``pi_first_name`` are read directly (presumably because they are
        # always present - confirm against the API).
        catalog_of_federal_domestic_assistance_number = tools.Try(
            ctx.cfdaNumber)
        estimated_total_amt = tools.Try(ctx.estimatedTotalAmt)
        fund_program_name = tools.Try(ctx.fundProgramName)
        has_project_outcomes_report = tools.Try(ctx.projectOutComesReport)
        primary_program = tools.Try(ctx.primaryProgram)
        public_access_mandate = tools.Try(ctx.publicAccessMandate)
        transaction_type = tools.Try(ctx.transType)
        co_pi_name = tools.Try(
            ctx.coPDPI)  # irregular field (ex. [First Last ~<numbers>, ...])
        proj_dir_pi_name = tools.Try(ctx.pdPIName)
        duns_number = tools.Try(ctx.dunsNumber)
        parent_duns_number = tools.Try(ctx.parentDunsNumber)
        fund_agency_code = tools.Try(ctx.fundAgencyCode)
        award_agency_code = tools.Try(ctx.awardAgencyCode)
        publication_research = tools.Try(ctx.publicationResearch)
        publication_conference = tools.Try(ctx.publicationConference)
        po_name = tools.Try(ctx.poName)
        po_email = tools.Try(ctx.poEmail)
        date = ctx.date
        date_start = tools.Try(ctx.startDate)
        date_expiration = tools.Try(ctx.expDate)
        pi_last_name = ctx.piLastName
        pi_first_name = ctx.piFirstName
        # NOTE(review): 'piMiddeInitial' (no second "l") may be the field name
        # the API really uses - check the API field list before "correcting" it.
        pi_middle_initial = tools.Try(ctx.piMiddeInitial)
        pi_email = tools.Try(ctx.piEmail)
        awardee = tools.Try(ctx.awardee)
        awardee_address = tools.Try(ctx.awardeeAddress)
        awardee_city = tools.Try(ctx.awardeeCity)
        awardee_country_code = tools.Try(ctx.awardeeCountryCode)
        awardee_county = tools.Try(ctx.awardeeCounty)
        awardee_district_code = tools.Try(ctx.awardeeDistrictCode)
        awardee_name = tools.Try(ctx.awardeeName)
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
        awardee_zip_code = tools.Try(ctx.awardeeZipCode)
        performance_address = tools.Try(ctx.perfAddress)
        performance_city = tools.Try(ctx.perfCity)
        performance_country_code = tools.Try(ctx.perfCountryCode)
        performance_county = tools.Try(ctx.perfCounty)
        performance_district_code = tools.Try(ctx.perfDistrictCode)
        performance_location = tools.Try(ctx.perfLocation)
        performance_state_code = tools.Try(ctx.perfStateCode)
        performance_zip_code = tools.Try(ctx.perfZipCode)