class Extra:
    """Extra values read from the context's ``nameIdentifier`` entry.

    The enclosing parser class is outside this view.
    """

    # The identifier itself, passed through the external ``force_text`` helper.
    name_identifier = tools.Try(
        tools.RunPython(force_text, ctx.nameIdentifier))
    # The '@nameIdentifierScheme' and '@schemeURI' entries of the same element.
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])
class Extra:
    """Extra award values read directly from the context.

    The enclosing parser class is outside this view.
    """

    funds_obligated_amt = ctx.fundsObligatedAmt
    award_id = ctx.id
    # Only awardeeName and awardeeStateCode are wrapped in tools.Try --
    # presumably because they may be absent; confirm against tools.Try.
    awardee_name = tools.Try(ctx.awardeeName)
    awardee_city = ctx.awardeeCity
    awardee_state_code = tools.Try(ctx.awardeeStateCode)
    date = ctx.date
class Preprint(Parser):
    """Parser for a preprint whose data lives under ``ctx.entry``."""

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    # Links are delegated for the optional 'arxiv:doi' entry and the entry id.
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Try(ctx.entry['arxiv:doi']),
        ctx.entry.id)
    # Category terms are extracted by get_subjects() before tools.Subjects.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                'get_subjects',
                tools.Concat(tools.Try(ctx.entry.category)))))
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])

    def get_subjects(self, link):
        """Return a list with the '@term' value of every item in *link*."""
        return [category['@term'] for category in link]
class CreativeWork(Parser):
    """Parser that reads a work out of parsed markup via ``Soup`` lookups.

    Field values come from elements selected by ``itemprop`` attributes and
    from ``dt``/``dd`` pairs located by the ``get_dd`` helper.
    """

    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    # Alternatives: the rights link URL, then the 'Rights' text, then None
    # (assuming tools.OneOf uses the first alternative that succeeds).
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        """Map the first div's 'itemtype' URL to a schema name.

        Unrecognised URLs fall back to 'CreativeWork'.
        """
        schema_by_itemtype = {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }
        itemtype = obj.soup.find('div')['itemtype']
        return schema_by_itemtype.get(itemtype, 'CreativeWork')

    def get_title(self, obj):
        """Return the text of the h1 element without its 'label' span.

        The span is removed from the underlying soup in place.
        """
        heading = obj.h1.soup
        label = heading.find('span', class_='label')
        label.decompose()
        return heading.get_text()

    def get_dd(self, obj, dt):
        """Return the dd following the dt whose string is *dt*, or None.

        The dd element is wrapped in a SoupXMLDict.
        """
        dt_tag = obj.soup.find('dt', string=dt)
        if not dt_tag:
            return None
        return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))

    def get_rights_url(self, obj):
        """Return the href of the parent of the 'glyphicon-new-window' icon
        found inside the 'Rights' dd."""
        rights_dd = self.get_dd(obj, 'Rights')
        icon = rights_dd.soup.find('i', class_='glyphicon-new-window')
        return icon.parent['href']
class Extra:
    """Extra values read from the context's ``nameIdentifier`` and
    ``contributorType`` entries.

    The enclosing parser class is outside this view.
    """

    name_identifier = tools.Try(ctx.nameIdentifier)
    # The '@nameIdentifierScheme' and '@schemeURI' entries of the same element.
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])
    contributor_type = tools.Try(ctx.contributorType)
class Venue(Parser):
    """Venue populated from the awardee fields of the context."""

    name = tools.Try(ctx.awardeeName)
    # City and state code are concatenated, then joined with ', '.
    location = tools.Join(tools.Concat(ctx.awardeeCity, tools.Try(ctx.awardeeStateCode)), joiner=', ')

    class Extra:
        # The same city / state values, kept as separate entries.
        awardee_city = ctx.awardeeCity
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
class Person(Parser):
    """Person whose name parts are split out of ``ctx.author``."""

    # Each name part comes from tools.ParseName applied to the same author value.
    given_name = tools.ParseName(ctx.author).first
    family_name = tools.ParseName(ctx.author).last
    additional_name = tools.ParseName(ctx.author).middle
    suffix = tools.ParseName(ctx.author).suffix
    # NOTE(review): tools.Try(ctx.email) is passed as the second argument of
    # tools.Delegate here, whereas sibling parsers pass the source chain to
    # tools.Map instead -- confirm this form is intended.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier, tools.Try(ctx.email)))
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith), tools.Try(ctx.institution))
class Person(Parser):
    """Person whose name parts are split out of ``ctx.author``, with emails
    and affiliations delegated to other parsers."""

    # Each name part comes from tools.ParseName applied to the same author value.
    given_name = tools.ParseName(ctx.author).first
    family_name = tools.ParseName(ctx.author).last
    additional_name = tools.ParseName(ctx.author).middle
    suffix = tools.ParseName(ctx.author).suffix
    emails = tools.Map(tools.Delegate(PersonEmail), tools.Try(ctx.email))
    # Affiliation is specialised so that its ``entity`` is delegated to Organization.
    affiliations = tools.Map(
        tools.Delegate(Affiliation.using(entity=tools.Delegate(Organization))),
        tools.Try(ctx.institution))
class Funder(Parser):
    """Funder identified by its ``nameIdentifier`` entry."""

    # The '@schemeURI' and '#text' parts of nameIdentifier are concatenated
    # and joined with '/'.
    community_identifier = tools.Join(tools.Concat(
        tools.Try(ctx.nameIdentifier['@schemeURI']),
        tools.Try(ctx.nameIdentifier['#text'])), joiner='/')

    class Extra:
        name = tools.Try(ctx.contributorName)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])
class Extra:
    """Extra values read from the record header and its ``metadata.DIF`` entry.

    The enclosing parser class is outside this view.
    """

    status = tools.Try(ctx.record.header['@status'])
    entry_id = tools.Try(ctx.record.metadata.DIF.Entry_ID)
    metadata_name = tools.Try(ctx.record.metadata.DIF.Metadata_Name)
    metadata_version = tools.Try(ctx.record.metadata.DIF.Metadata_Version)
    last_dif_revision_date = tools.Try(
        ctx.record.metadata.DIF.Last_DIF_Revision_Date)
    # Unlike the entries above, setSpec is read without tools.Try.
    set_spec = ctx.record.header.setSpec
class Extra: """ Fields that are combined in the base parser are relisted as singular elements that match their original entry to preserve raw data structure. """ # An entity responsible for making contributions to the resource. contributor = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor') # The spatial or temporal topic of the resource, the spatial applicability of the resource, # or the jurisdiction under which the resource is relevant. coverage = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage') # An entity primarily responsible for making the resource. creator = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator') # A point or period of time associated with an event in the lifecycle of the resource. dates = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date') # The file format, physical medium, or dimensions of the resource. resource_format = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format') # An unambiguous reference to the resource within a given context. identifiers = tools.Concat( tools.Try(ctx['record']['metadata']['dc']['dc:identifier']), tools.Maybe(ctx['record']['header'], 'identifier')) # A related resource. relation = tools.RunPython('get_relation', ctx) # A related resource from which the described resource is derived. source = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source') # The topic of the resource. subject = tools.Try(ctx.record.metadata.dc['dc:subject']) # The nature or genre of the resource. resource_type = tools.Try(ctx.record.metadata.dc['dc:type']) set_spec = tools.Maybe(ctx.record.header, 'setSpec') # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage language = tools.Try(ctx.record.metadata.dc['dc:language']) # Status in the header, will exist if the resource is deleted status = tools.Maybe(ctx.record.header, '@status')
class Extra:
    """
    Fields that are combined in the base parser are relisted as singular elements that match
    their original entry to preserve raw data structure.
    """

    # Every entry below is the identically named context key wrapped in tools.Try.
    freeToRead = tools.Try(ctx.freeToRead)
    languages = tools.Try(ctx.languages)
    licenses = tools.Try(ctx.licenses)
    otherProperties = tools.Try(ctx.otherProperties)
    publisher = tools.Try(ctx.publisher)
    subjects = tools.Try(ctx.subjects)
    sponsorships = tools.Try(ctx.sponsorships)
    tags = tools.Try(ctx.tags)
    uris = tools.Try(ctx.uris)
    version = tools.Try(ctx.version)
class Preprint(Parser):
    """Parser for a preprint whose data lives under ``ctx.entry``."""

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)

    # Fields not mapped by this parser:
    # free_to_read_type
    # free_to_read_date
    # rights
    # language

    # Subjects and tags are both built from the '@term' of each entry category.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.Map(ctx['@term'], ctx.entry.category)),
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Map(ctx['@term'], ctx.entry.category),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), ctx.entry.author),
    )

    # related_works is not mapped by this parser.

    # Identifiers are delegated for the optional 'arxiv:doi' entry and the entry id.
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), tools.Try(ctx.entry['arxiv:doi']), ctx.entry.id)

    class Extra:
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])
class Person(Parser):
    """Person read from the embedded ``users`` entry of the context.

    Each name part lists two sources: the embedded user's attributes and the
    ``meta`` of the first embedded error.
    """

    given_name = tools.OneOf(
        ctx.embeds.users.data.attributes.given_name,
        ctx.embeds.users.errors[0].meta.given_name,
    )
    family_name = tools.OneOf(
        ctx.embeds.users.data.attributes.family_name,
        ctx.embeds.users.errors[0].meta.family_name,
    )
    additional_name = tools.OneOf(
        ctx.embeds.users.data.attributes.middle_names,
        ctx.embeds.users.errors[0].meta.middle_names,
    )
    suffix = tools.OneOf(
        ctx.embeds.users.data.attributes.suffix,
        ctx.embeds.users.errors[0].meta.suffix,
    )
    # The html link is routed through registered(), which drops it for
    # unregistered contributors.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.RunPython('registered', ctx.embeds.users.data.links.html),
        tools.Try(ctx.embeds.users.data.links.profile_image),
    )

    class Extra:
        locale = tools.Try(ctx.embeds.users.data.attributes.locale)
        date_registered = tools.Try(
            ctx.embeds.users.data.attributes.date_registered)
        active = tools.Try(ctx.embeds.users.data.attributes.active)
        timezone = tools.Try(ctx.embeds.users.data.attributes.timezone)

    def registered(self, context):
        """Return *context*, or None if the contributor is flagged as unregistered.

        The flag is read from ``self.context['attributes']['unregistered_contributor']``.
        """
        is_unregistered = self.context['attributes']['unregistered_contributor']
        return None if is_unregistered else context
class FundingAgent(Parser):
    """Agent built from the ``sponsorName`` / ``sponsorIdentifier`` entries."""

    # The agent type is derived from the sponsor name by tools.GuessAgentType,
    # which is given 'organization' as its default.
    schema = tools.GuessAgentType(ctx.sponsorName, default='organization')
    name = ctx.sponsorName
    identifiers = tools.Map(tools.Delegate(AgentIdentifier), tools.IRI(tools.Try(ctx.sponsorIdentifier)))
class Extra:
    """Extra values read from the context's name-identifier, contributor-type
    and name entries.

    The enclosing parser class is outside this view.
    """

    name_identifier = tools.Try(ctx.nameIdentifier)
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])

    contributor_type = tools.Try(ctx.contributorType)

    # v.4 new givenName and familyName properties
    # Sources listed: creatorName, then contributorName, then a static None.
    given_name = tools.OneOf(ctx.creatorName['@givenName'], ctx.contributorName['@givenName'], tools.Static(None))
    family_name = tools.OneOf(ctx.creatorName['@familyName'], ctx.contributorName['@familyName'], tools.Static(None))
class FunderAgent(Parser):
    """Agent built from either the ``funderName`` or ``contributorName`` entry."""

    # The agent type is derived from the name by tools.GuessAgentType, which
    # is given 'organization' as its default.
    schema = tools.GuessAgentType(tools.OneOf(ctx.funderName, ctx.contributorName), default='organization')

    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    # Identifier sources listed: funderIdentifier, the force_text of
    # nameIdentifier, then a static None. The tools.Try wrapper is given
    # ValueError as the exception type to handle.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(
            tools.OneOf(ctx.funderIdentifier,
                        tools.RunPython(force_text, ctx.nameIdentifier),
                        tools.Static(None))),
                  exceptions=(ValueError, )))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)

        contributor_type = tools.Try(ctx.contributorType)
class Agent(Parser):
    """Agent whose type is derived from ``ctx.name`` by tools.GuessAgentType."""

    schema = tools.GuessAgentType(ctx.name)

    name = ctx.name

    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith), tools.Try(ctx.affiliation))

    # Both sameAs and email are mapped through tools.IRI before delegation.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Map(tools.IRI(), tools.Try(ctx.sameAs), tools.Try(ctx.email)))

    class Extra:
        givenName = tools.Try(ctx.givenName)
        familyName = tools.Try(ctx.familyName)
        # NOTE(review): 'additonalName' looks like a misspelling of
        # 'additionalName' (the key it reads). It is left as-is because the
        # attribute name may be an emitted key that consumers rely on -- confirm.
        additonalName = tools.Try(ctx.additionalName)
        name = tools.Try(ctx.name)
class RelatedLink(Parser):
    """Link built from a related-identifier entry of the context."""

    schema = 'Link'
    # The URL is the context itself passed through the external force_text helper.
    url = tools.RunPython(force_text, ctx)
    # The identifier type is optional (wrapped in tools.Try), so lower() must
    # cope with receiving no value.
    type = tools.RunPython('lower', tools.Try(ctx['@relatedIdentifierType']))

    def lower(self, type):
        """Return *type* lower-cased, or None when no type was supplied.

        The parameter keeps its original name for compatibility, even though
        it shadows the ``type`` builtin inside this method.
        """
        if type is None:
            # Previously this raised AttributeError on None, which defeated
            # the tools.Try wrapper around the lookup.
            return None
        return type.lower()
class Project(CreativeWork):
    """Root-level work with child works, contributors and institutions."""

    is_root = True

    related_works = tools.Map(tools.Delegate(IsPartOf), tools.Try(ctx.children))

    # Contributors are split on their 'bibliographic' attribute: truthy ones
    # are delegated to Creator, the rest to Contributor.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.Filter(
                lambda contributor: contributor['attributes']['bibliographic'],
                ctx.contributors)),
        tools.Map(
            tools.Delegate(Contributor),
            tools.Filter(
                lambda contributor: not contributor['attributes']['bibliographic'],
                ctx.contributors)),
        tools.Map(tools.Delegate(AgentWorkRelation), tools.Try(ctx.institutions)),
    )
class Person(Parser):
    """Person read from the embedded ``users`` entry of the context."""

    # Name parts are read directly from the embedded user's attributes.
    given_name = ctx.embeds.users.data.attributes.given_name
    family_name = ctx.embeds.users.data.attributes.family_name
    additional_name = ctx.embeds.users.data.attributes.middle_names
    suffix = ctx.embeds.users.data.attributes.suffix

    # Identifier sources: the user's html link, the user's profile image, and
    # the profile image found in the meta of the first embedded error.
    identifiers = tools.Map(
        tools.Delegate(ThroughIdentifiers),
        tools.Try(ctx.embeds.users.data.links.html),
        tools.Try(ctx.embeds.users.data.links.profile_image),
        tools.Try(ctx.embeds.users.errors[0].meta.profile_image))

    class Extra:
        # These lookups are not wrapped in tools.Try.
        locale = ctx.embeds.users.data.attributes.locale
        date_registered = ctx.embeds.users.data.attributes.date_registered
        active = ctx.embeds.users.data.attributes.active
        timezone = ctx.embeds.users.data.attributes.timezone
        profile_image = ctx.embeds.users.data.links.profile_image
class Venue(Parser): name = tools.Try(tools.RunPython(force_text, ctx.geoLocationPlace)) # polygon = tools.Try(ctx.geoLocationBox) # point = tools.Try(ctx.geoLocationPoint) class Extra: polygon = tools.Try(ctx.geoLocationBox) point = tools.Try(ctx.geoLocationPoint)
class FunderRelation(Parser):
    """Relation whose agent is delegated to FunderAgent and whose awards are
    delegated to ThroughAwards."""

    schema = 'Funder'
    agent = tools.Delegate(FunderAgent, ctx)
    # get_award() raises when 'awardURI' cannot be looked up; the call is
    # wrapped in tools.Try.
    awards = tools.Map(
        tools.Delegate(ThroughAwards),
        tools.Try(tools.RunPython('get_award', ctx)))

    def get_award(self, obj):
        """Return *obj* unchanged after looking up its 'awardURI' entry.

        The lookup result is discarded; it is performed only so that a
        missing 'awardURI' raises from the lookup itself.
        """
        _ = obj['awardURI']
        return obj
class Person(Parser):
    """Person whose name parts are split out of ``ctx.name``."""

    # Each name part comes from tools.ParseName applied to the same name value.
    given_name = tools.ParseName(ctx.name).first
    family_name = tools.ParseName(ctx.name).last
    additional_name = tools.ParseName(ctx.name).middle
    suffix = tools.ParseName(ctx.name).suffix
    # Affiliation is specialised so that its ``entity`` is delegated to
    # Organization; the source is the 'arxiv:affiliation' entry.
    affiliations = tools.Map(
        tools.Delegate(Affiliation.using(entity=tools.Delegate(Organization))),
        tools.Try(ctx['arxiv:affiliation']))
class DataCenterAgent(Parser):
    """Agent built from the context's ``Data_Center_Name`` entry."""

    # The agent type is derived from the long name by tools.GuessAgentType,
    # which is given 'organization' as its default.
    schema = tools.GuessAgentType(ctx.Data_Center_Name.Long_Name, default='organization')

    name = ctx.Data_Center_Name.Long_Name
    related_agents = tools.Map(tools.Delegate(IsAffiliatedWith), tools.Try(ctx.Personnel))

    class Extra:
        data_center_short_name = ctx.Data_Center_Name.Short_Name
class AffiliatedAgent(Parser):
    """Agent built from the awardee fields of the context."""

    # The agent type is derived from the awardee name by tools.GuessAgentType,
    # which is given 'organization' as its default.
    schema = tools.GuessAgentType(ctx.awardeeName, default='organization')
    name = ctx.awardeeName
    # City and state code are concatenated, then joined with ', '.
    location = tools.Join(tools.Concat(ctx.awardeeCity, tools.Try(ctx.awardeeStateCode)), joiner=', ')

    class Extra:
        # The same city / state values, kept as separate entries.
        awardee_city = ctx.awardeeCity
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
class CreatorPerson(Parser):
    """Person whose name parts are split out of ``ctx.creatorName``."""

    schema = 'Person'

    # Each name part comes from tools.ParseName applied to the same creatorName.
    suffix = tools.ParseName(ctx.creatorName).suffix
    family_name = tools.ParseName(ctx.creatorName).last
    given_name = tools.ParseName(ctx.creatorName).first
    additional_name = tools.ParseName(ctx.creatorName).middle

    # Affiliation is specialised so that its ``entity`` is delegated to
    # CreatorOrganization; the source is the force_text of ctx.affiliation.
    affiliations = tools.Map(
        tools.Delegate(
            Affiliation.using(entity=tools.Delegate(CreatorOrganization))),
        tools.Concat(tools.Try(tools.RunPython(force_text, ctx.affiliation))))

    identifiers = tools.Map(tools.Delegate(ThroughIdentifiers), tools.Try(ctx.nameIdentifier))

    class Extra:
        name_identifier = tools.Try(
            tools.RunPython(force_text, ctx.nameIdentifier))
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])
class ContributorAgent(Parser):
    """Agent built from either the ``creatorName`` or ``contributorName`` entry."""

    # Two ways of deriving the agent type are listed: from the external
    # get_agent_type helper (given 'organization' as the GuessAgentType
    # default), then from the name itself.
    schema = tools.OneOf(
        tools.GuessAgentType(
            tools.RunPython(get_agent_type, ctx, person=False),
            default='organization'),
        tools.GuessAgentType(tools.OneOf(ctx.creatorName, ctx.contributorName)))

    name = tools.OneOf(ctx.creatorName, ctx.contributorName)

    # The tools.Try wrapper is given ValueError as the exception type to handle.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.IRI(tools.RunPython(force_text, ctx.nameIdentifier)),
            exceptions=(ValueError, )))

    # Falsy affiliation values are filtered out before delegation.
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    bool,
                    tools.RunPython(force_text, ctx.affiliation)))))

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(
            ctx.creatorName['@givenName'],
            ctx.contributorName['@givenName'],
            tools.Static(None))
        family_name = tools.OneOf(
            ctx.creatorName['@familyName'],
            ctx.contributorName['@familyName'],
            tools.Static(None))
class Preprint(osf.Project):
    """Preprint variant of ``osf.Project`` that redefines the fields below."""

    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)
    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(tools.Delegate(ThroughSubjects), ctx.attributes.subjects)
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.links.self,
        ctx.links.html,
        tools.Try(ctx.links.doi))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx.attributes.tags))
    rights = tools.Try(ctx.attributes.node_license)

    # related_works is fixed to an empty list for preprints.
    related_works = tools.Static([])
    # Contributors are split on their 'bibliographic' attribute: truthy ones
    # are delegated to osf.Creator, the rest to osf.Contributor.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(
                lambda contributor: contributor['attributes']['bibliographic'],
                ctx.contributors)),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(
                lambda contributor: not contributor['attributes']['bibliographic'],
                ctx.contributors)),
    )
class ContributorPerson(Parser):
    """Person whose name parts are split out of ``ctx.contributorName``."""

    schema = 'Person'

    # Each name part comes from tools.ParseName applied to the same contributorName.
    suffix = tools.ParseName(ctx.contributorName).suffix
    family_name = tools.ParseName(ctx.contributorName).last
    given_name = tools.ParseName(ctx.contributorName).first
    additional_name = tools.ParseName(ctx.contributorName).middle

    identifiers = tools.Map(tools.Delegate(ThroughIdentifiers), tools.Try(ctx.nameIdentifier))

    class Extra:
        # The identifier itself, passed through the external force_text helper.
        name_identifier = tools.Try(
            tools.RunPython(force_text, ctx.nameIdentifier))
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])
        contributor_type = tools.Try(ctx.contributorType)