class Extra:
    """Award and awardee fields read straight off the context record.

    Only the awardee name and state code are wrapped in ``tools.Try``; every
    other field is a plain ``ctx`` lookup.
    """
    # NOTE(review): presumably the un-wrapped lookups are expected to always
    # be present while the Try-wrapped ones are optional -- confirm.
    funds_obligated_amt = ctx.fundsObligatedAmt
    award_id = ctx.id
    awardee_name = tools.Try(ctx.awardeeName)
    awardee_city = ctx.awardeeCity
    awardee_state_code = tools.Try(ctx.awardeeStateCode)
    date = ctx.date
class CreativeWork(Parser):
    """Parser for a work page whose fields are located with ``Soup`` lookups.

    Most fields are found by ``itemprop`` attribute; a few come from
    ``<dt>``/``<dd>`` pairs resolved through :meth:`get_dd`.
    """
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    # Candidates in order: the link found by get_rights_url, the text of the
    # "Rights" <dd>, then a static None.
    rights = tools.OneOf(tools.RunPython('get_rights_url', ctx),
                         tools.RunPython('get_dd', ctx, 'Rights')['#text'],
                         tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))

    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Text of assorted <dd> entries, looked up by their <dt> label.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        """Map the ``itemtype`` of the first ``<div>`` to a work type name.

        Unrecognised item types fall back to ``'CreativeWork'``.
        """
        known_types = {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }
        itemtype = obj.soup.find('div')['itemtype']
        return known_types.get(itemtype, 'CreativeWork')

    def get_title(self, obj):
        """Return the text of the ``h1`` heading with its label span removed.

        The ``<span class="label">`` is removed from the heading in place
        before the text is read.
        """
        heading = obj.h1.soup
        label = heading.find('span', class_='label')
        label.decompose()
        return heading.get_text()

    def get_dd(self, obj, dt):
        """Return the ``<dd>`` following the ``<dt>`` whose string is *dt*.

        The ``<dd>`` is wrapped in a ``SoupXMLDict``; ``None`` is returned
        when no matching ``<dt>`` exists.
        """
        term = obj.soup.find('dt', string=dt)
        if not term:
            return None
        return SoupXMLDict(soup=term.find_next_sibling('dd'))

    def get_rights_url(self, obj):
        """Return the ``href`` of the parent of the "new window" icon under "Rights"."""
        rights_dd = self.get_dd(obj, 'Rights')
        icon = rights_dd.soup.find('i', class_='glyphicon-new-window')
        return icon.parent['href']
class Extra:
    """Name-identifier details and contributor type, each wrapped in ``tools.Try``."""
    name_identifier = tools.Try(ctx.nameIdentifier)
    # '@nameIdentifierScheme' and '@schemeURI' are keys of the nameIdentifier
    # entry itself.
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])

    contributor_type = tools.Try(ctx.contributorType)
class Person(Parser):
    """Person whose name parts are all derived from ``ctx.author`` via ``tools.ParseName``."""
    given_name = tools.ParseName(ctx.author).first
    family_name = tools.ParseName(ctx.author).last
    additional_name = tools.ParseName(ctx.author).middle
    suffix = tools.ParseName(ctx.author).suffix

    # NOTE(review): here the Try(ctx.email) chain is passed as a second
    # argument to Delegate rather than to Map -- confirm this is intended.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier, tools.Try(ctx.email)))
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith),
                               tools.Try(ctx.institution))
class Extra:
    """Record header status/set spec plus selected ``DIF`` metadata fields."""
    status = tools.Try(ctx.record.header['@status'])

    entry_id = tools.Try(ctx.record.metadata.DIF.Entry_ID)

    metadata_name = tools.Try(ctx.record.metadata.DIF.Metadata_Name)

    metadata_version = tools.Try(ctx.record.metadata.DIF.Metadata_Version)

    last_dif_revision_date = tools.Try(ctx.record.metadata.DIF.Last_DIF_Revision_Date)

    # Unlike the fields above, setSpec is read without a tools.Try wrapper.
    set_spec = ctx.record.header.setSpec
class MODSAgent(Parser):
    """Agent built from a ``mods:name``-style entry.

    The schema is chosen by :meth:`get_agent_schema` from the entry's
    ``@type``; the name is the display form when available, otherwise the
    joined name parts.
    """
    schema = tools.RunPython('get_agent_schema', ctx)

    name = tools.OneOf(tools.RunPython(force_text, ctx['mods:displayForm']),
                       tools.RunPython('squash_name_parts', ctx))

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    lambda x: bool(x),
                    tools.RunPython(force_text, ctx['mods:affiliation'])))))

    # Identifiers containing 'invalid' are dropped before text/IRI conversion.
    identifiers = tools.Map(
        tools.Delegate(MODSAgentIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Try(ctx['mods:nameIdentifier']),
                    )))))

    class Extra:
        name_type = tools.Try(ctx['@type'])
        name_part = tools.Try(ctx['mods:namePart'])
        affiliation = tools.Try(ctx['mods:affiliation'])
        description = tools.Try(ctx['mods:description'])
        display_form = tools.Try(ctx['mods:displayForm'])
        etal = tools.Try(ctx['mods:etal'])
        name_identifier = tools.Try(ctx['mods:nameIdentifier'])

    def squash_name_parts(self, name):
        """Join the text of every ``mods:namePart`` of *name* with single spaces."""
        parts = get_list(name, 'mods:namePart')
        return ' '.join(force_text(part) for part in parts)

    def get_agent_schema(self, obj):
        """Pick the agent schema from the ``@type`` attribute of *obj*.

        ``personal`` and ``conference`` map to fixed schemas. ``corporate``
        and every other value are delegated to ``GuessAgentTypeLink`` run on
        the joined name parts; ``corporate`` passes ``default='organization'``.
        """
        kind = obj.get('@type')
        if kind == 'personal':
            return 'person'
        elif kind == 'conference':
            return 'organization'
        # TODO SHARE-718
        # if name_type == 'family':
        #    return 'family'
        if kind == 'corporate':
            guesser = GuessAgentTypeLink(default='organization')
        else:
            guesser = GuessAgentTypeLink()
        return guesser.execute(self.squash_name_parts(obj))
class Extra:
    """
    Fields that are combined in the base parser are relisted as singular elements that
    match their original entry to preserve raw data structure.
    """
    # Every field is a same-named ctx lookup wrapped in tools.Try.
    freeToRead = tools.Try(ctx.freeToRead)
    languages = tools.Try(ctx.languages)
    licenses = tools.Try(ctx.licenses)
    otherProperties = tools.Try(ctx.otherProperties)
    publisher = tools.Try(ctx.publisher)
    subjects = tools.Try(ctx.subjects)
    sponsorships = tools.Try(ctx.sponsorships)
    tags = tools.Try(ctx.tags)
    uris = tools.Try(ctx.uris)
    version = tools.Try(ctx.version)
class FunderAgent(Parser):
    """Funding agent named by ``funderName`` or, failing that, ``contributorName``.

    The schema is guessed from that same name with ``'organization'`` as the
    default.
    """
    schema = tools.GuessAgentType(tools.OneOf(ctx.funderName, ctx.contributorName), default='organization')

    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    # Identifier candidates in order: funderIdentifier, the text of
    # nameIdentifier, then a static None. ValueError is listed for the Try.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(
            tools.OneOf(ctx.funderIdentifier,
                        tools.RunPython(force_text, ctx.nameIdentifier),
                        tools.Static(None))),
                  exceptions=(ValueError, )))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)

        contributor_type = tools.Try(ctx.contributorType)
class Extra:
    """Name-identifier details, contributor type and optional given/family names."""
    name_identifier = tools.Try(ctx.nameIdentifier)
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])

    contributor_type = tools.Try(ctx.contributorType)

    # v.4 new givenName and familyName properties
    # Each is read from creatorName first, then contributorName, with a
    # static None as the last candidate.
    given_name = tools.OneOf(ctx.creatorName['@givenName'],
                             ctx.contributorName['@givenName'],
                             tools.Static(None))
    family_name = tools.OneOf(ctx.creatorName['@familyName'],
                              ctx.contributorName['@familyName'],
                              tools.Static(None))
def get_root_parser(self, unwrapped, emitted_type='creativework', type_map=None, property_list=None, **kwargs):
    """Build and return a ``RootParser`` class configured for this transformer.

    Args:
        unwrapped: Not used by this method.
        emitted_type: Default type name; lower-cased onto
            ``RootParser.default_type``.
        type_map: Optional mapping whose keys are lower-cased and merged over
            the lower-cased ``self.allowed_roots`` entries.
        property_list: Optional iterable of ``dc`` property names to expose as
            extra fields on the parser.
        **kwargs: Accepted and ignored.

    Returns:
        A new ``OAICreativeWork`` subclass.
    """
    # Entries from type_map win over allowed_roots on key collisions.
    root_type_map = {
        **{r.lower(): r for r in self.allowed_roots},
        **{t.lower(): v for t, v in (type_map or {}).items()}
    }

    class RootParser(OAICreativeWork):
        default_type = emitted_type.lower()
        type_map = root_type_map

    if property_list:
        # The message uses %-style placeholders, so the values must be passed
        # as logger arguments; str.format() would leave the '%s' unfilled.
        logger.debug(
            'Attaching addition properties %s to transformer for %s',
            property_list, self.config.label)
        for prop in property_list:
            if prop in RootParser._extra:
                # Never overwrite an extra field the parser already defines.
                logger.warning('Skipping property %s, it already exists', prop)
                continue
            RootParser._extra[prop] = tools.Try(
                ctx.record.metadata.dc['dc:' + prop]).chain()[0]

    return RootParser
class Person(Parser):
    """Person read from ``ctx.embeds.users``.

    Each name part is taken from ``data.attributes`` and falls back to the
    ``meta`` of the first entry in ``errors``.
    """
    given_name = tools.OneOf(
        ctx.embeds.users.data.attributes.given_name,
        ctx.embeds.users.errors[0].meta.given_name,
    )
    family_name = tools.OneOf(
        ctx.embeds.users.data.attributes.family_name,
        ctx.embeds.users.errors[0].meta.family_name,
    )
    additional_name = tools.OneOf(
        ctx.embeds.users.data.attributes.middle_names,
        ctx.embeds.users.errors[0].meta.middle_names,
    )
    suffix = tools.OneOf(
        ctx.embeds.users.data.attributes.suffix,
        ctx.embeds.users.errors[0].meta.suffix,
    )

    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.RunPython('registered', ctx.embeds.users.data.links.html),
        tools.Try(ctx.embeds.users.data.links.profile_image),
    )

    class Extra:
        locale = tools.Try(ctx.embeds.users.data.attributes.locale)
        date_registered = tools.Try(
            ctx.embeds.users.data.attributes.date_registered)
        active = tools.Try(ctx.embeds.users.data.attributes.active)
        timezone = tools.Try(ctx.embeds.users.data.attributes.timezone)

    def registered(self, context):
        """Return *context* unless this contributor is flagged as unregistered.

        The flag is read from ``self.context['attributes']``; when it is
        truthy ``None`` is returned instead.
        """
        is_unregistered = self.context['attributes']['unregistered_contributor']
        return None if is_unregistered else context
class FundingAgent(Parser):
    """Funding agent named by ``sponsorName``; schema guessed with an organization default."""
    schema = tools.GuessAgentType(ctx.sponsorName, default='organization')

    name = ctx.sponsorName

    # NOTE(review): the Try sits inside IRI here; confirm how IRI behaves
    # when sponsorIdentifier is absent.
    identifiers = tools.Map(tools.Delegate(AgentIdentifier),
                            tools.IRI(tools.Try(ctx.sponsorIdentifier)))
class Project(CreativeWork):
    """Root ``CreativeWork`` with child works, contributors and institutions."""
    is_root = True

    related_works = tools.Map(tools.Delegate(IsPartOf), tools.Try(ctx.children))

    # Contributors are split on attributes.bibliographic: truthy entries are
    # delegated to Creator, the rest to Contributor.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.Filter(lambda x: x['attributes']['bibliographic'],
                         ctx.contributors)),
        tools.Map(
            tools.Delegate(Contributor),
            tools.Filter(lambda x: not x['attributes']['bibliographic'],
                         ctx.contributors)),
        tools.Map(tools.Delegate(AgentWorkRelation),
                  tools.Try(ctx.institutions)),
    )
class Agent(Parser):
    """Agent whose schema is guessed from ``ctx.name``."""
    schema = tools.GuessAgentType(ctx.name)

    name = ctx.name

    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith),
                               tools.Try(ctx.affiliation))

    # Both sameAs and email are passed through tools.IRI before delegation.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Map(tools.IRI(), tools.Try(ctx.sameAs), tools.Try(ctx.email)))

    class Extra:
        givenName = tools.Try(ctx.givenName)
        familyName = tools.Try(ctx.familyName)
        # NOTE(review): the attribute name is spelled 'additonalName' while it
        # reads ctx.additionalName. Renaming it would change the emitted key,
        # so it is left untouched here.
        additonalName = tools.Try(ctx.additionalName)
        name = tools.Try(ctx.name)
class Extra:
    """Awardee fields read from the context record."""
    awardee = tools.Try(ctx.awardee)
    awardee_address = tools.Try(ctx.awardeeAddress)
    # awardeeName is the only field here read without a tools.Try wrapper.
    awardee_name = ctx.awardeeName
    awardee_city = tools.Try(ctx.awardeeCity)
    awardee_county = tools.Try(ctx.awardeeCounty)
    awardee_state_code = tools.Try(ctx.awardeeStateCode)
    awardee_country_code = tools.Try(ctx.awardeeCountryCode)
    awardee_district_code = tools.Try(ctx.awardeeDistrictCode)
    awardee_zip_code = tools.Try(ctx.awardeeZipCode)
class HostAgent(Parser):
    """Agent named by ``contributorName``; schema guessed with an organization default."""
    schema = tools.GuessAgentType(ctx.contributorName, default='organization')

    name = tools.Try(ctx.contributorName)

    # The text of nameIdentifier is converted with tools.IRI; InvalidIRI is
    # listed as the exception for the surrounding Try.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(tools.RunPython(force_text, ctx.nameIdentifier)),
                  exceptions=(InvalidIRI, )))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)
class Extra:
    """The ``@type`` attribute and ``mods:``-prefixed name fields, each wrapped in ``tools.Try``."""
    name_type = tools.Try(ctx['@type'])
    name_part = tools.Try(ctx['mods:namePart'])
    affiliation = tools.Try(ctx['mods:affiliation'])
    description = tools.Try(ctx['mods:description'])
    display_form = tools.Try(ctx['mods:displayForm'])
    etal = tools.Try(ctx['mods:etal'])
    name_identifier = tools.Try(ctx['mods:nameIdentifier'])
class Registration(Parser):
    """Registration read from ``general-information`` and ``additional-trial-info``."""
    title = tools.Try(ctx['general-information']['title'])
    description = tools.Try(ctx['additional-trial-info']['abstract'])
    date_updated = tools.ParseDate(tools.Try(ctx['general-information']['last-updated']))
    date_published = tools.ParseDate(tools.Try(ctx['general-information']['published-at']))
    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.pi))
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.IRI(ctx['general-information']['url'])),
    )
    # The keywords feed both subjects (through tools.Subjects) and tags.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                process_keywords,
                tools.Try(ctx['additional-trial-info']['keywords']),
            )
        )
    )
    # Tags combine the processed keywords with the trial status and JEL code.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Concat(
            tools.RunPython(
                process_keywords,
                tools.Try(ctx['additional-trial-info']['keywords']),
            ),
            tools.Try(ctx['additional-trial-info']['status']),
            tools.Try(ctx['additional-trial-info']['jel-code'])
        )
    )

    class Extra:
        # Top-level sections of the record, each wrapped in tools.Try.
        general_information = tools.Try(ctx['general-information'])
        additional_trial_information = tools.Try(ctx['additional-trial-info'])
        publication_data = tools.Try(ctx['data-publication'])
        primary_investigator = tools.Try(ctx['pi'])
        interventions = tools.Try(ctx['interventions'])
        outcomes = tools.Try(ctx['outcomes'])
        experimental_design = tools.Try(ctx['experimental-design'])
        experimental_characteristics = tools.Try(ctx['experimental-characteristics'])
        supporting_document_material = tools.Try(ctx['supporting-doc-material'])
        post_trial = tools.Try(ctx['post-trial'])
        reports_papers = tools.Try(ctx['reports-papers'])
class FunderRelation(Parser):
    """Relation with schema ``'Funder'`` that delegates its agent to ``FunderAgent``."""
    schema = 'Funder'
    agent = tools.Delegate(FunderAgent, ctx)
    # get_award is wrapped in tools.Try; see get_award for why it can raise.
    awards = tools.Map(tools.Delegate(ThroughAwards), tools.Try(tools.RunPython('get_award', ctx)))

    def get_award(self, obj):
        """Return *obj* unchanged after checking it can be indexed by ``'awardURI'``."""
        # The lookup result is discarded: the subscript is evaluated only so
        # that an entry lacking 'awardURI' raises instead of being returned.
        obj['awardURI']
        return obj
class PIContributorAgent(Parser):
    """Person built from the ``pi*`` fields of the context record."""
    schema = 'Person'

    family_name = ctx.piLastName
    given_name = ctx.piFirstName
    # NOTE(review): 'piMiddeInitial' is spelled this way in both places it is
    # read; presumably it mirrors the upstream field name -- verify before
    # "correcting" it.
    additional_name = tools.Try(ctx.piMiddeInitial)

    # The whole record is kept as an affiliation source only when it contains
    # an 'awardeeName' key.
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith),
                               tools.Filter(lambda x: 'awardeeName' in x, ctx))

    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(ctx.piEmail), exceptions=(ValueError, )))

    class Extra:
        pi_last_name = ctx.piLastName
        pi_first_name = ctx.piFirstName
        pi_middle_initial = tools.Try(ctx.piMiddeInitial)
        pi_email = tools.Try(ctx.piEmail)
class DataCenterAgent(Parser):
    """Agent named by ``Data_Center_Name.Long_Name``; schema guessed with an organization default."""
    schema = tools.GuessAgentType(
        ctx.Data_Center_Name.Long_Name,
        default='organization'
    )

    name = ctx.Data_Center_Name.Long_Name
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith), tools.Try(ctx.Personnel))

    class Extra:
        data_center_short_name = ctx.Data_Center_Name.Short_Name
class AffiliatedAgent(Parser):
    """Agent named by ``awardeeName``; schema guessed with an organization default."""
    schema = tools.GuessAgentType(ctx.awardeeName, default='organization')

    name = ctx.awardeeName
    # Location joins the city and the state code (Try-wrapped) with ', '.
    location = tools.Join(tools.Concat(ctx.awardeeCity,
                                       tools.Try(ctx.awardeeStateCode)),
                          joiner=', ')

    class Extra:
        awardee_city = ctx.awardeeCity
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
class Extra:
    """Rights, statistics, management and type fields, each wrapped in ``tools.Try``."""
    access_rights = tools.Try(ctx['access-rights'])
    usage_rights = tools.Try(ctx['usage-rights'])
    collection_statistics = tools.Try(ctx['collection-statistics'])
    management = tools.Try(ctx['management'])
    collection_type = tools.Try(ctx['collection-type'])
    # The only field here that is additionally passed through tools.ParseDate.
    last_update = tools.ParseDate(tools.Try(ctx['last-update']))
class ContributorAgent(Parser):
    """Agent named by ``creatorName`` or, failing that, ``contributorName``."""
    # First candidate: a guess based on get_agent_type(ctx, person=False) with
    # an organization default; second: a guess from the agent's name.
    schema = tools.OneOf(
        tools.GuessAgentType(tools.RunPython(get_agent_type, ctx, person=False),
                             default='organization'),
        tools.GuessAgentType(tools.OneOf(ctx.creatorName, ctx.contributorName)))

    name = tools.OneOf(ctx.creatorName, ctx.contributorName)
    # NOTE(review): IRI is given ctx here, where sibling parsers call
    # tools.IRI() with no argument inside Map -- confirm this is intended.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.Map(tools.IRI(ctx),
                            tools.RunPython(force_text, ctx.nameIdentifier)),
                  exceptions=(ValueError, )))
    # Falsy affiliation texts are filtered out before delegation.
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(lambda x: bool(x),
                             tools.RunPython(force_text, ctx.affiliation)))))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(ctx.creatorName['@givenName'],
                                 ctx.contributorName['@givenName'],
                                 tools.Static(None))
        family_name = tools.OneOf(ctx.creatorName['@familyName'],
                                  ctx.contributorName['@familyName'],
                                  tools.Static(None))
class Preprint(osf.Project):
    """Preprint parser that overrides fields of ``osf.Project``."""
    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)
    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         ctx.attributes.subjects)
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.links.self,
                            ctx.links.html, tools.Try(ctx.links.doi))
    tags = tools.Map(tools.Delegate(ThroughTags),
                     tools.Try(ctx.attributes.tags))
    rights = tools.Try(ctx.attributes.node_license)

    # Related works are fixed to an empty list for preprints.
    related_works = tools.Static([])
    # Contributors are split on attributes.bibliographic: truthy entries are
    # delegated to osf.Creator, the rest to osf.Contributor.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(lambda x: x['attributes']['bibliographic'],
                         ctx.contributors)),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(lambda x: not x['attributes']['bibliographic'],
                         ctx.contributors)),
    )
class POContributorAgent(Parser):
    """Person built from the ``poName`` and ``poEmail`` fields."""
    schema = 'Person'

    name = ctx.poName

    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(ctx.poEmail), exceptions=(ValueError, )))

    # The whole record is kept as an affiliation source only when it contains
    # an 'awardeeName' key.
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith),
                               tools.Filter(lambda x: 'awardeeName' in x, ctx))

    class Extra:
        po_name = tools.Try(ctx.poName)
        po_email = tools.Try(ctx.poEmail)
class CreativeWork(Parser):
    """Work read from ``ctx.attributes`` with identifiers taken from ``ctx.links``.

    The commented-out names below are fields this parser does not populate.
    """
    title = ctx.attributes.title
    description = ctx.attributes.description
    is_deleted = tools.Static(False)
    # date_published =
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # free_to_read_type =
    # free_to_read_date =
    # rights = tools.Try(ctx.attributes.node_license)  (does not seem to hold any useful information)
    # language =

    # The html and self links go to SimpleWorkIdentifier; the optional
    # ctx.identifiers entries go to WorkIdentifier.
    identifiers = tools.Concat(
        tools.Map(tools.Delegate(SimpleWorkIdentifier), ctx.links.html,
                  ctx.links.self),
        tools.Map(tools.Delegate(WorkIdentifier), tools.Try(ctx.identifiers)))

    # The category is emitted as a tag alongside the record's own tags.
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category,
                     ctx.attributes.tags)

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
class Dataset(Parser):
    """Dataset whose fields are all read through ``tools.Try``."""
    title = tools.Try(ctx['title'])
    description = tools.Try(ctx['description'])
    # Rights is built by joining the access-rights and usage-rights values.
    rights = tools.Try(
        tools.Join(
            tools.Concat(tools.Try(ctx['access-rights']),
                         tools.Try(ctx['usage-rights']))))

    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.contact))

    class Extra:
        access_rights = tools.Try(ctx['access-rights'])
        usage_rights = tools.Try(ctx['usage-rights'])
        collection_statistics = tools.Try(ctx['collection-statistics'])
        management = tools.Try(ctx['management'])
        collection_type = tools.Try(ctx['collection-type'])
        last_update = tools.ParseDate(tools.Try(ctx['last-update']))
class DataSet(Parser):
    """Dataset read from a record's ``header`` and its ``metadata.DIF`` section."""
    title = tools.Join(tools.Try(ctx.record.metadata.DIF.Entry_Title))
    description = tools.Try(ctx.record.metadata.DIF.Summary.Abstract)

    related_agents = tools.Map(
        tools.Delegate(AgentWorkRelation),
        tools.Try(ctx.record.metadata.DIF.Data_Center)
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.record.metadata.DIF.Metadata_Name),
        tools.Try(ctx.record.header.setSpec)
    )

    identifiers = tools.Map(tools.Delegate(WorkIdentifier), tools.Try(ctx.record.metadata.DIF))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython('check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        status = tools.Try(ctx.record.header['@status'])

        entry_id = tools.Try(ctx.record.metadata.DIF.Entry_ID)

        metadata_name = tools.Try(ctx.record.metadata.DIF.Metadata_Name)

        metadata_version = tools.Try(ctx.record.metadata.DIF.Metadata_Version)

        last_dif_revision_date = tools.Try(ctx.record.metadata.DIF.Last_DIF_Revision_Date)

        set_spec = ctx.record.header.setSpec

    def check_status(self, status):
        """Return ``True`` only when *status* equals ``'deleted'``."""
        return True if status == 'deleted' else False
class Extra:
    """Top-level sections of the record, each wrapped in ``tools.Try``."""
    general_information = tools.Try(ctx['general-information'])
    additional_trial_information = tools.Try(ctx['additional-trial-info'])
    publication_data = tools.Try(ctx['data-publication'])
    primary_investigator = tools.Try(ctx['pi'])
    interventions = tools.Try(ctx['interventions'])
    outcomes = tools.Try(ctx['outcomes'])
    experimental_design = tools.Try(ctx['experimental-design'])
    experimental_characteristics = tools.Try(ctx['experimental-characteristics'])
    supporting_document_material = tools.Try(ctx['supporting-doc-material'])
    post_trial = tools.Try(ctx['post-trial'])
    reports_papers = tools.Try(ctx['reports-papers'])