class CreativeWork(Parser):
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None)
    )
    language = tools.Try(tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython('get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
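
# Illustrative sketch (not part of the original module): `get_dd` above relies on
# BeautifulSoup's sibling navigation to read values out of a <dt>/<dd> definition
# list on the scraped HTML page. The HTML snippet and helper name below are
# invented for the example; the real parser wraps the <dd> in SoupXMLDict so the
# chaining DSL can index into it.
def _example_get_dd_lookup():
    from bs4 import BeautifulSoup

    html = '<dl><dt>Rights</dt><dd><a href="http://example.com/license">CC BY</a></dd></dl>'
    soup = BeautifulSoup(html, 'html.parser')
    dt_tag = soup.find('dt', string='Rights')     # locate the definition term
    dd_tag = dt_tag.find_next_sibling('dd')       # its value lives in the following <dd>
    return dd_tag.get_text(strip=True)            # -> 'CC BY'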
class CreativeWork(Parser):
    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython('_is_deleted', tools.Try(ctx.otherProperties))
    date_updated = tools.ParseDate(tools.Try(ctx.providerUpdatedDateTime))
    rights = tools.Join(tools.Try(ctx.licenses.uri))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.languages[0]))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(
                    tools.Try(ctx.uris.canonicalUri),
                    tools.Try(ctx.uris.providerUris),
                    tools.Try(ctx.uris.descriptorUris),
                    tools.Try(ctx.uris.objectUris)
                )
            )
        )
    )

    subjects = tools.Map(tools.Delegate(ThroughSubjects), tools.Subjects(tools.Try(ctx.subjects)))

    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx.tags), tools.Try(ctx.subjects))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        return list(sorted(set(items)))

    def _is_deleted(self, properties):
        for prop in properties or []:
            if prop['name'] == 'status':
                return 'deleted' in prop['properties'].get('status', [])
        return False
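
# Illustrative sketch (not part of the original module): the `otherProperties`
# payload that `_is_deleted` above inspects is a list of {'name', 'properties'}
# dicts; the sample record below is invented to show the shape it expects.
def _example_is_deleted_check():
    other_properties = [
        {'name': 'depth', 'properties': {'depth': 3}},
        {'name': 'status', 'properties': {'status': ['deleted']}},
    ]
    for prop in other_properties:
        if prop['name'] == 'status':
            return 'deleted' in prop['properties'].get('status', [])  # -> True
    return False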
class OAICreativeWork(Parser):
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None))
    )

    title = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:title'])))
    description = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(tools.Map(
            tools.Try(tools.IRI(), exceptions=(ValueError,)),
            tools.Filter(
                not_citation,
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.metadata.dc['dc:identifier']),
                        tools.Try(ctx.record.header['identifier'])
                    )
                )
            )
        ))
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError,)),
                tools.RunPython('get_relation', ctx)
            ))
        )
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator), tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor), tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:publisher']))
        ),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.record.metadata.dc['dc:language'][0]))

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                'force_text',
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                )
            )
        ))
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )
                )
            ),
            deep=True
        )
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)
    is_deleted = tools.RunPython('check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier'])
        )

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend([x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get('dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers,)
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i for i in identifiers if i)
        identifiers = re.sub('http|:|/', '', identifiers + ctx['record']['header']['identifier'])
        if isinstance(relation, dict):
            relation = (relation['#text'],)
        return [r for r in relation if r and re.sub('http|:|/', '', r) not in identifiers]
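
# Illustrative sketch (not part of the original module): `tokenize` above splits
# free-text setSpec / dc:type / dc:format / dc:subject values on ' - ', '.' and ','
# before they become tags and subjects. The sample input is invented.
def _example_tokenize():
    import re

    data = ['publication:article', 'Ecology - Population Dynamics, Modelling']
    tokens = []
    for item in data:
        tokens.extend(x.strip() for x in re.split(r'(?: - )|\.|,', item) if x)
    return tokens  # -> ['publication:article', 'Ecology', 'Population Dynamics', 'Modelling']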
class OAICreativeWork(Parser):
    schema = 'CreativeWork'

    ORGANIZATION_KEYWORDS = ('the', 'center')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'institute')

    title = tools.Join(tools.RunPython('force_text', tools.Try(ctx['record']['metadata']['dc']['dc:title'])))
    description = tools.Join(tools.RunPython('force_text', tools.Try(ctx.record.metadata.dc['dc:description'])))

    publishers = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIPublisher))),
        tools.Map(tools.RunPython('force_text'), tools.Try(ctx.record.metadata.dc['dc:publisher']))
    )

    rights = tools.Join(tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:rights'))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx['record']['metadata']['dc']['dc:language'][0]))

    contributors = tools.Map(
        tools.Delegate(OAIContributor),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'contributor'
        )
    )

    institutions = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIInstitution))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'institution'
        )
    )

    organizations = tools.Map(
        tools.Delegate(OAIAssociation.using(entity=tools.Delegate(OAIOrganization))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator'),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')
            ),
            'organization'
        )
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.RunPython(
            'force_text',
            tools.Concat(
                tools.Try(ctx['record']['header']['setSpec']),
                tools.Try(ctx['record']['metadata']['dc']['dc:type']),
                tools.Try(ctx['record']['metadata']['dc']['dc:format']),
                tools.Try(ctx['record']['metadata']['dc']['dc:subject']),
            )
        )
    )

    links = tools.Map(
        tools.Delegate(OAIThroughLinks),
        tools.RunPython(
            'get_links',
            tools.Concat(
                tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
                tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:relation')
            )
        )
    )

    date_updated = tools.ParseDate(ctx['record']['header']['datestamp'])

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        # An entity responsible for making contributions to the resource.
        contributor = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage')

        # An entity primarily responsible for making the resource.
        creator = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator')

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date')

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format')

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
            tools.Maybe(ctx['record']['header'], 'identifier')
        )

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source')

        # The topic of the resource.
        subject = tools.Try(ctx.record.metadata.dc['dc:subject'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Maybe(ctx.record.header, '@status')

    def get_links(self, ctx):
        links = []
        for link in ctx:
            if not link or not isinstance(link, str):
                continue
            found_url = URL_REGEX.search(link)
            if found_url is not None:
                links.append(found_url.group())
                continue
            found_doi = DOI_REGEX.search(link)
            if found_doi is not None:
                found_doi = found_doi.group()
                if 'dx.doi.org' in found_doi:
                    links.append(found_doi)
                else:
                    links.append('http://dx.doi.org/{}'.format(found_doi.replace('doi:', '')))
        return links

    def force_text(self, data):
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data

        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def get_relation(self, ctx):
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation', [])
        if isinstance(relation, dict):
            return relation['#text']
        return relation

    def get_contributors(self, options, entity):
        """
        Returns list of organization, institution, or contributor names based on entity type.
        """
        options = [o if isinstance(o, str) else o['#text'] for o in options]

        if entity == 'organization':
            organizations = [
                value for value in options
                if (
                    value
                    and not self.list_in_string(value, self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value, self.ORGANIZATION_KEYWORDS)
                )
            ]
            return organizations
        elif entity == 'institution':
            institutions = [
                value for value in options
                if value and self.list_in_string(value, self.INSTITUTION_KEYWORDS)
            ]
            return institutions
        elif entity == 'contributor':
            people = [
                value for value in options
                if (
                    value
                    and not self.list_in_string(value, self.INSTITUTION_KEYWORDS)
                    and not self.list_in_string(value, self.ORGANIZATION_KEYWORDS)
                )
            ]
            return people
        else:
            return options

    def list_in_string(self, string, list_):
        if any(word in string.lower() for word in list_):
            return True
        return False
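
# Illustrative sketch (not part of the original module): how the keyword heuristics
# in `get_contributors` / `list_in_string` above sort free-text dc:creator and
# dc:contributor names into people, institutions, and organizations. The names and
# helper are invented examples; institution keywords win over organization keywords.
def _example_classify_contributor(name):
    institution_keywords = ('school', 'university', 'institution', 'institute')
    organization_keywords = ('the', 'center')

    lowered = name.lower()
    if any(word in lowered for word in institution_keywords):
        return 'institution'    # e.g. 'Example State University'
    if any(word in lowered for word in organization_keywords):
        return 'organization'   # e.g. 'The Example Center'
    return 'contributor'        # e.g. 'Jane Doe'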
class CreativeWork(Parser):
    # Schema definitions: http://schema.datacite.org/meta/kernel-3.1/doc/DataCite-MetadataKernel_v3.1.pdf

    PEOPLE_TYPES = (
        'ContactPerson', 'DataCurator', 'Editor', 'ProjectLeader', 'ProjectManager',
        'ProjectMember', 'RelatedPerson', 'Researcher', 'Supervisor', 'WorkPackageLeader'
    )
    CHECK_TYPES = ('DataCollector', 'DataManager', 'Producer', 'RightsHolder', 'Sponsor', 'Other')
    NOT_PEOPLE_TYPES = ('Distributor', 'HostingInstitution', 'RegistrationAgency', 'RegistrationAuthority', 'ResearchGroup')

    ORGANIZATION_KEYWORDS = (THE_REGEX, 'council', 'center', 'foundation')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'college', 'institute')

    title = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
    )
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions.description[0])
    )

    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
    )

    rights = tools.Try(tools.Join(tools.RunPython(
        'text_list',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights)
    )))

    language = tools.ParseLanguage(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.language))

    contributors = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'contributor',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Contributor),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'contributor',
                'contributor'
            )
        )
    )

    institutions = tools.Concat(
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(CreatorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'institution',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(ContributorInstitution))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'institution',
                'contributor'
            )
        )
    )

    organizations = tools.Concat(
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(CreatorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator)),
                'organization',
                'creator'
            )
        ),
        tools.Map(
            tools.Delegate(Association.using(entity=tools.Delegate(ContributorOrganization))),
            tools.RunPython(
                'get_contributors',
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                'organization',
                'contributor'
            )
        )
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx.record, 'metadata')['oai_datacite'], 'type'),
                tools.RunPython(
                    'text_list',
                    tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
                ),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format),
                tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status')
            )
        )
    )

    links = tools.Concat(
        tools.Map(
            tools.Delegate(ThroughLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier))
        ),
        tools.Map(
            tools.Delegate(ThroughAlternateLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier))
        ),
        tools.Map(
            tools.Delegate(ThroughRelatedLinks),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier))
        )
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    date_published = tools.ParseDate(tools.RunPython(
        'get_date_type',
        tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)),
        'Issued'
    ))

    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights['@rightsURI'])

    free_to_read_date = tools.ParseDate(tools.RunPython(
        'get_date_type',
        tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)),
        'Available'
    ))

    funders = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Funder))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
            'funder',
            'contributor'
        )
    )

    venues = tools.Map(
        tools.Delegate(ThroughVenues),
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations.geoLocation)
    )

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        status = tools.Maybe(ctx.record.header, '@status')
        datestamp = tools.ParseDate(ctx.record.header.datestamp)
        set_spec = tools.Maybe(ctx.record.header, 'setSpec')
        is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)
        schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)
        datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)
        identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)
        alternate_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier)
        titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
        publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)
        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)
        resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)
        sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)
        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)
        version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)
        rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)
        rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)
        related_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)
        description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)
        dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)
        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)
        creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)
        geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

    def get_date_type(self, date_obj, date_type):
        try:
            date = None
            for obj in date_obj:
                if obj['@dateType'] == date_type:
                    date = obj['#text']
            if date and not DATE_REGEX.search(date):
                return None
        except KeyError:
            return None
        return date

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning('#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))

    def get_contributors(self, options, entity, field=None):
        """
        Returns list of organization, institution, or contributor names based on entity type.
        """
        if entity == 'organization':
            organizations = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    if field == 'creator':
                        organizations.append(val[field + 'Name'])
                    else:
                        organizations.append(val)
                elif (
                    value[field + 'Name']
                    and not self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value[field + 'Name'], self.ORGANIZATION_KEYWORDS)
                ):
                    if field == 'creator':
                        organizations.append(value[field + 'Name'])
                    else:
                        organizations.append(value)
            return organizations
        elif entity == 'institution':
            institutions = []
            for value in options:
                val = self.try_contributor_type(value, self.NOT_PEOPLE_TYPES)
                if val:
                    institutions.append(val)
                elif (
                    value[field + 'Name']
                    and self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                ):
                    institutions.append(value)
            return institutions
        elif entity == 'contributor':
            people = []
            for value in options:
                val = self.try_contributor_type(value, self.PEOPLE_TYPES)
                if val:
                    people.append(val)
                elif (
                    value[field + 'Name']
                    and not self.list_in_string(value[field + 'Name'], self.INSTITUTION_KEYWORDS)
                    and not self.list_in_string(value[field + 'Name'], self.ORGANIZATION_KEYWORDS)
                ):
                    people.append(value)
            return people
        elif entity == 'funder':
            funders = []
            for value in options:
                val = self.try_contributor_type(value, ['Funder'])
                if val:
                    funders.append(val)
            return funders
        else:
            return options

    def try_contributor_type(self, value, target_list_types):
        try:
            contrib_type_item = value['@contributorType']
            if contrib_type_item in target_list_types:
                return value
            return None
        except KeyError:
            return None

    def list_in_string(self, string, list_):
        for word in list_:
            if isinstance(word, str):
                if word in string.lower():
                    return True
            else:
                if word.search(string):
                    return True
        return False
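
# Illustrative sketch (not part of the original module): `get_date_type` above picks
# a date out of the parsed <dates> list by its @dateType attribute, e.g. 'Issued' for
# date_published and 'Available' for free_to_read_date. The record fragment is invented.
def _example_get_date_type():
    dates = [
        {'@dateType': 'Available', '#text': '2016-05-01'},
        {'@dateType': 'Issued', '#text': '2016-04-15'},
    ]
    date_type = 'Issued'
    date = None
    for obj in dates:
        if obj['@dateType'] == date_type:
            date = obj['#text']
    return date  # -> '2016-04-15', later fed to tools.ParseDate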
class CreativeWork(Parser):
    '''
    Documentation for Datacite's metadata:
    https://schema.labs.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf
    '''

    def get_schema(self, type):
        return {
            'dataset': 'DataSet',
            'software': 'Software',
            'text/book': 'Book',
            'text/book chapter': 'Book',
            'text/book prospectus': 'Book',
            'text/book series': 'Book',
            'text/conference abstract': 'ConferencePaper',
            'text/conference paper': 'ConferencePaper',
            'text/conference poster': 'Poster',
            'text/dissertation': 'Dissertation',
            'text/edited book': 'Book',
            'text/journal article': 'Article',
            'text/journal issue': 'Article',
            'text/patent': 'Patent',
            'text/report': 'Report',
            'text/supervised student publication': 'Thesis',
            'text/working paper': 'WorkingPaper'
            # 'audiovisual': '',
            # 'collection': '',
            # 'event': '',
            # 'image': '',
            # 'interactiveresource': '',
            # 'model': '',
            # 'physicalobject': '',
            # 'service': '',
            # 'sound': '',
            # 'text15': '',
            # 'workflow': '',
            # 'text/book review': '',
            # 'text/conference program': '',
            # 'text/dictionary entry': '',
            # 'text/disclosure': '',
            # 'text/encyclopedia entry': '',
            # 'text/Funding submission': '',
            # 'text/license': '',
            # 'text/magazine article': '',
            # 'text/manual': '',
            # 'text/newsletter article': '',
            # 'text/newspaper article': '',
            # 'text/online resource': '',
            # 'text/registered copyright': '',
            # 'text/research tool': '',
            # 'text/tenure-promotion': '',
            # 'text/test': '',
            # 'text/trademark': '',
            # 'text/translation': '',
            # 'text/university academic unit': '',
            # 'text/website': '',
        }.get(type.lower()) or 'CreativeWork'

    schema = tools.RunPython(
        'get_schema',
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral'],
            default='CreativeWork'
        )
    )

    title = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
    )
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions.description[0])
    )

    rights = tools.Try(tools.Join(tools.RunPython(
        'text_list',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights)
    )))

    language = tools.ParseLanguage(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.language))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(CreatorRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators.creator))
        ),
        tools.Map(
            tools.Delegate(ContributorRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor))
        ),
        tools.Map(
            tools.Delegate(PublisherRelation),
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        ),
        tools.Map(
            tools.Delegate(HostRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['HostingInstitution']
            )
        ),
        # v.3 Funder is a contributor type
        # v.4 FundingReference replaces funder contributor type
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)),
                ['Funder']
            )
        ),
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference))
        )
    )

    # v.4 New, free text, 'subjectScheme' attribute on subject
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.RunPython(
            'text_list',
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
        ))
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(tools.Maybe(ctx.record, 'metadata')['oai_datacite'], 'type'),
                tools.RunPython(
                    'text_list',
                    tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject))
                ),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format),
                tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status')
            )
        )
    )

    identifiers = tools.Concat(
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier))
        ),
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier))
        )
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(WorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                False
            )
        ),
        tools.Map(
            tools.Delegate(InverseWorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)),
                True
            )
        )
    )

    date_updated = tools.ParseDate(tools.Try(ctx.record.header.datestamp))

    date_published = tools.ParseDate(tools.Try(tools.RunPython(
        'get_date_type',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
        'Issued'
    )))

    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList.rights['@rightsURI'])

    free_to_read_date = tools.ParseDate(tools.Try(tools.RunPython(
        'get_date_type',
        tools.Concat(ctx.record.metadata['oai_datacite'].payload.resource.dates.date),
        'Available'
    )))

    is_deleted = tools.RunPython('check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular elements
        that match their original entry to preserve raw data structure.
        """
        status = tools.Try(ctx.record.header['@status'])
        datestamp = tools.ParseDate(ctx.record.header.datestamp)
        set_spec = tools.Try(ctx.record.header.setSpec)
        is_reference_quality = tools.Try(ctx.record.metadata['oai_datacite'].isReferenceQuality)
        schema_version = tools.Try(ctx.record.metadata['oai_datacite'].schemaVersion)
        datacentre_symbol = tools.Try(ctx.record.metadata['oai_datacite'].datacentreSymbol)
        identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.identifier)
        alternate_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.alternateIdentifiers.alternateidentifier)
        titles = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.titles.title)
        publisher = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publisher)
        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.publicationYear)
        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.subjects.subject)
        resourceType = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.resourceType)
        sizes = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.size)
        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.formats.format)
        version = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.version)
        rights = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rights)
        rightsList = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.rightsList)
        related_identifiers = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.relatedIdentifiers.relatedIdentifier)
        description = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.descriptions)
        dates = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.dates.date)
        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.contributors.contributor)
        creators = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.creators)

        # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
        geolocations = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

        funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.fundingReference)

    def check_status(self, status):
        if status == 'deleted':
            return True
        return False

    def get_date_type(self, date_obj, date_type):
        date = None
        for obj in date_obj:
            if obj['@dateType'] == date_type:
                date = obj['#text']
        if date and date != '0000':
            return date
        # raise KeyError to break TryLink
        raise KeyError()

    def text_list(self, data):
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning('#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))
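
# Illustrative sketch (not part of the original module): how the @resourceTypeGeneral
# value from a DataCite 4.0 record maps onto a work type via `get_schema` above. The
# mapping excerpt is taken from the dict in the parser; anything not listed there
# falls back to 'CreativeWork'. The helper name is invented for the example.
def _example_get_schema(resource_type_general):
    mapping = {
        'dataset': 'DataSet',
        'software': 'Software',
        'text/journal article': 'Article',
        'text/dissertation': 'Dissertation',
    }
    return mapping.get(resource_type_general.lower()) or 'CreativeWork'

# _example_get_schema('Dataset')      -> 'DataSet'
# _example_get_schema('Audiovisual')  -> 'CreativeWork' (unmapped types fall back)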