class FunderAgent(Parser):
    """Parser for a funder entry, producing an agent record.

    The agent name is taken from ``funderName`` or, failing that,
    ``contributorName``. The same value is handed to ``GuessAgentType``
    with ``'organization'`` supplied as its default.
    """

    schema = tools.GuessAgentType(
        tools.OneOf(ctx.funderName, ctx.contributorName),
        default='organization',
    )

    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    # Candidate identifier sources, in order: funderIdentifier, the
    # force_text() of nameIdentifier, then a static None. The IRI link is
    # wrapped in Try with ValueError listed as the handled exception.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.IRI(
                tools.OneOf(
                    ctx.funderIdentifier,
                    tools.RunPython(force_text, ctx.nameIdentifier),
                    tools.Static(None),
                ),
            ),
            exceptions=(ValueError, ),
        ),
    )

    class Extra:
        # Raw source values, each wrapped in Try.
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'],
        )
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'],
        )

        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)

        contributor_type = tools.Try(ctx.contributorType)
class FundingAgent(Parser):
    """Parser for a sponsor entry, producing an agent record.

    ``sponsorName`` supplies the name and is also the input to
    ``GuessAgentType`` (default ``'organization'``).
    """

    schema = tools.GuessAgentType(
        ctx.sponsorName,
        default='organization',
    )

    name = ctx.sponsorName

    # sponsorIdentifier is read through Try before being passed to IRI.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.IRI(
            tools.Try(ctx.sponsorIdentifier),
        ),
    )
class CreativeWork(Parser):
    """Parser mapping a source document onto creative-work fields.

    Besides the declarative field mappings, two helper methods are
    referenced by name from ``RunPython`` links: ``unique`` and
    ``_is_deleted``.
    """

    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython(
        '_is_deleted',
        tools.Try(ctx.otherProperties),
    )
    date_updated = tools.ParseDate(
        tools.Try(ctx.providerUpdatedDateTime),
    )
    rights = tools.Join(
        tools.Try(ctx.licenses.uri),
    )

    # Note: only the first entry of `languages` is used when several exist.
    language = tools.ParseLanguage(
        tools.Try(ctx.languages[0]),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)),
    )

    # All four URI groups are concatenated and passed through `unique`
    # (below) before each entry goes through IRI.
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(
                    tools.Try(ctx.uris.canonicalUri),
                    tools.Try(ctx.uris.providerUris),
                    tools.Try(ctx.uris.descriptorUris),
                    tools.Try(ctx.uris.objectUris),
                ),
            ),
        ),
    )

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.Try(ctx.subjects),
        ),
    )

    # Tags are fed from both `tags` and `subjects`.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.tags),
        tools.Try(ctx.subjects),
    )

    class Extra:
        """
        Fields that the base parser combines are listed again here as
        individual elements mirroring their original entry, so the raw
        data structure is preserved.
        """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        """Return the distinct members of ``items`` as a sorted list."""
        return sorted(set(items))

    def _is_deleted(self, properties):
        """Report whether the first ``'status'`` property lists ``'deleted'``.

        ``properties`` may be falsy (treated as empty). Only the first entry
        whose ``'name'`` is ``'status'`` is consulted; if there is none the
        result is ``False``.
        """
        status_entry = next(
            (entry for entry in (properties or []) if entry['name'] == 'status'),
            None,
        )
        if status_entry is None:
            return False
        return 'deleted' in status_entry['properties'].get('status', [])
class RelatedWorkIdentifier(Parser):
    """Parser for a related-identifier entry, emitted as a ``WorkIdentifier``."""

    schema = 'WorkIdentifier'

    # The entry itself is reduced with force_text() and then passed to IRI.
    uri = tools.IRI(
        tools.RunPython(force_text, ctx),
    )

    class Extra:
        # Unlike the attributes below, this one is not wrapped in Try.
        related_identifier_type = ctx['@relatedIdentifierType']

        relation_type = tools.Try(ctx['@relationType'])
        related_metadata_scheme = tools.Try(ctx['@relatedMetadataScheme'])
        scheme_URI = tools.Try(ctx['@schemeURI'])
        scheme_type = tools.Try(ctx['@schemeType'])
class Agent(Parser):
    """Parser for an agent entry keyed on its ``name``.

    The agent type is guessed from ``name``; affiliations become related
    agents and ``sameAs`` / ``email`` values become identifiers.
    """

    schema = tools.GuessAgentType(ctx.name)

    name = ctx.name

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Try(ctx.affiliation),
    )

    # Both sameAs and email are passed through IRI before delegation.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Map(
            tools.IRI(),
            tools.Try(ctx.sameAs),
            tools.Try(ctx.email),
        ),
    )

    class Extra:
        givenName = tools.Try(ctx.givenName)
        familyName = tools.Try(ctx.familyName)
        # NOTE(review): 'additonalName' looks like a misspelling of
        # 'additionalName'. It is kept unchanged because the attribute name
        # may be used as an output key - confirm before renaming.
        additonalName = tools.Try(ctx.additionalName)
        name = tools.Try(ctx.name)
class ContributorAgent(Parser):
    """Parser for a creator/contributor entry, producing an agent record.

    The name comes from ``creatorName`` or, failing that,
    ``contributorName``.
    """

    # First choice: guess from get_agent_type(ctx, person=False) with
    # 'organization' as default; second choice: guess from the name itself.
    schema = tools.OneOf(
        tools.GuessAgentType(
            tools.RunPython(get_agent_type, ctx, person=False),
            default='organization',
        ),
        tools.GuessAgentType(
            tools.OneOf(ctx.creatorName, ctx.contributorName),
        ),
    )

    name = tools.OneOf(ctx.creatorName, ctx.contributorName)

    # The IRI link is wrapped in Try with ValueError as the handled exception.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.IRI(
                tools.RunPython(force_text, ctx.nameIdentifier),
            ),
            exceptions=(ValueError, ),
        ),
    )

    # Affiliation text is filtered on truthiness before delegation.
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    bool,
                    tools.RunPython(force_text, ctx.affiliation),
                ),
            ),
        ),
    )

    class Extra:
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'],
        )
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'],
        )

        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(
            ctx.creatorName['@givenName'],
            ctx.contributorName['@givenName'],
            tools.Static(None),
        )
        family_name = tools.OneOf(
            ctx.creatorName['@familyName'],
            ctx.contributorName['@familyName'],
            tools.Static(None),
        )
class OAICreativeWork(Parser):
    """Parser mapping a record with ``dc:*`` metadata onto creative-work fields.

    Field values are read from ``ctx.record.metadata.dc`` and
    ``ctx.record.header``. Subclasses can set ``default_type`` and
    ``type_map`` to control what ``get_schema`` returns.

    Helper methods (``get_schema``, ``force_text``, ``tokenize``,
    ``get_relation``, ``check_status``) are referenced by name from
    ``RunPython`` links.
    """

    # Schema returned by get_schema() when nothing in type_map matches.
    default_type = None
    # Mapping of lower-cased dc:type text to schema; None disables lookup.
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(
            ctx.record.metadata.dc['dc:type'],
            tools.Static(None),
        ),
    )

    title = tools.Join(
        tools.RunPython(
            'force_text',
            tools.Try(ctx.record.metadata.dc['dc:title']),
        ),
    )

    description = tools.Join(
        tools.RunPython(
            'force_text',
            tools.Try(ctx.record.metadata.dc['dc:description']),
        ),
    )

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        'force_text',
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier']),
                        ),
                    ),
                ),
            ),
        ),
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(
                    tools.Try(tools.IRI(), exceptions=(ValueError, )),
                    tools.RunPython('get_relation', ctx),
                ),
            ),
        ),
    )

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(OAICreator),
            tools.Try(ctx.record.metadata.dc['dc:creator']),
        ),
        tools.Map(
            tools.Delegate(OAIContributor),
            tools.Try(ctx.record.metadata.dc['dc:contributor']),
        ),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython(
                'force_text',
                tools.Try(ctx.record.metadata.dc['dc:publisher']),
            ),
        ),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]),
    )

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    ),
                ),
            ),
        ),
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    ),
                ),
            ),
            deep=True,
        ),
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython(
        'check_status',
        tools.Try(ctx.record.header['@status']),
    )

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']),
        )

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        """Return ``True`` only when ``status`` equals ``'deleted'``."""
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        """Pick a schema for the given ``dc:type`` value(s).

        ``types`` may be a string, a dict carrying the text under ``'#text'``,
        or an iterable of either. Each entry is lower-cased and looked up in
        ``self.type_map``; the first hit wins. ``self.default_type`` is
        returned when ``types`` or ``type_map`` is empty or nothing matches.
        """
        if not types or not self.type_map:
            return self.default_type
        # A lone dict must be wrapped as well, otherwise the loop below would
        # walk its keys instead of looking at its '#text' value.
        if isinstance(types, (str, dict)):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        """Reduce ``data`` to plain text.

        A dict yields its ``'#text'`` value and a string is returned as is.
        Anything else is treated as an iterable (falsy means empty) and a
        list of texts is returned: ``None`` entries are dropped, dicts
        without ``'#text'`` are skipped with a warning, and any entry that
        is neither dict nor string raises ``Exception``.
        """
        if isinstance(data, dict):
            return data['#text']

        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    # Logger.warn is a deprecated alias; use warning().
                    logger.warning('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        """Split text on ``' - '``, ``'.'`` and ``','`` into stripped tokens.

        ``data`` is a string or an iterable of strings. Pieces that are empty
        once surrounding whitespace is removed are dropped.
        """
        if isinstance(data, str):
            data = [data]

        tokens = []
        for item in data:
            for piece in re.split(r'(?: - )|\.|,', item):
                # Strip first, then test: a piece such as ' ' would otherwise
                # survive the emptiness check and become an empty token.
                piece = piece.strip()
                if piece:
                    tokens.append(piece)
        return tokens

    def get_relation(self, ctx):
        """Return the ``dc:relation`` values that do not refer to the record itself.

        The record's own identifiers (``dc:identifier`` plus the header
        identifier) are joined into one string with ``http``, ``:`` and ``/``
        removed. A relation is kept when it is non-empty and its similarly
        stripped form is not a substring of that string. An empty list is
        returned when the record has no metadata.
        """
        record = ctx['record']
        if not record.get('metadata'):
            return []

        dc = record['metadata']['dc']
        relation = dc.get('dc:relation') or []
        identifiers = dc.get('dc:identifier') or []

        if isinstance(identifiers, (dict, str)):
            identifiers = (identifiers, )
        identifiers = ''.join(
            i['#text'] if isinstance(i, dict) else i
            for i in identifiers if i
        )
        identifiers = re.sub('http|:|/', '', identifiers + record['header']['identifier'])

        # A single relation may arrive as a bare string or a dict rather than
        # a list; wrap it so a string is not iterated character by character.
        if isinstance(relation, (dict, str)):
            relation = (relation, )

        relations = []
        for r in relation:
            # Entries may be dicts holding the text under '#text'.
            if isinstance(r, dict):
                r = r.get('#text')
            if r and re.sub('http|:|/', '', r) not in identifiers:
                relations.append(r)
        return relations
class AgentIdentifier(Parser):
    """Parser for an agent identifier: the context value is passed to ``IRI``."""

    uri = tools.IRI(
        ctx,
    )
class WorkIdentifier(Parser):
    """Parser for a work identifier: the context value is passed to ``IRI``."""

    uri = tools.IRI(
        ctx,
    )
class WorkIdentifier(Parser):
    """Parser for a work identifier whose text sits under the ``'#text'`` key."""

    uri = tools.IRI(
        ctx['#text'],
    )
class Award(Parser):
    """Parser for an award entry: name from ``awardName``, URI from ``awardIdentifier``."""

    name = ctx.awardName

    # awardIdentifier is read through Try before being passed to IRI.
    uri = tools.IRI(
        tools.Try(ctx.awardIdentifier),
    )