class Link(Parser): url = tools.RunPython('format_doi', ctx) # identifier will always be DOI type = tools.Static('doi') def format_doi(self, doi): return format_doi_as_url(self, doi)
class Link(Parser):
    """Provider link pointing at the NSF award-search page for a record."""

    url = tools.RunPython('format_url', ctx)
    type = tools.Static('provider')

    def format_url(self, record):
        """Build the NSF award-search URL from the record's award id."""
        award_id = record['id']
        return 'https://www.nsf.gov/awardsearch/showAward?AWD_ID={}'.format(award_id)
class FunderAgent(Parser):
    # Parses a funder/sponsor agent; falls back from funderName to
    # contributorName when the former is absent.
    schema = tools.GuessAgentType(
        tools.OneOf(ctx.funderName, ctx.contributorName),
        default='organization')

    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    # Prefer funderIdentifier, then nameIdentifier (normalized to text),
    # then nothing; IRI parse failures are swallowed by Try.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.IRI(
                tools.OneOf(ctx.funderIdentifier,
                            tools.RunPython(force_text, ctx.nameIdentifier),
                            tools.Static(None))),
            exceptions=(ValueError, )))

    class Extra:
        # Raw identifier fields preserved alongside the parsed agent.
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])

        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)

        contributor_type = tools.Try(ctx.contributorType)
class Extra:
    # Raw nameIdentifier fields preserved to keep the original record shape.
    name_identifier = tools.Try(ctx.nameIdentifier)
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(ctx.nameIdentifier['@schemeURI'])

    contributor_type = tools.Try(ctx.contributorType)

    # v.4 new givenName and familyName properties
    given_name = tools.OneOf(ctx.creatorName['@givenName'],
                             ctx.contributorName['@givenName'],
                             tools.Static(None))
    family_name = tools.OneOf(ctx.creatorName['@familyName'],
                              ctx.contributorName['@familyName'],
                              tools.Static(None))
class CreativeWork(Parser):
    # Parses a scraped HTML page (schema.org microdata) into a work.
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    # First chain that succeeds wins: an explicit rights link, then the
    # text of the "Rights" <dd>, then nothing.
    rights = tools.OneOf(tools.RunPython('get_rights_url', ctx),
                         tools.RunPython('get_dd', ctx, 'Rights')['#text'],
                         tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(tools.Delegate(ThroughTags),
                     Soup(ctx, itemprop='keywords'))
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor),
                  Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Definition-list fields specific to this repository's pages.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx,
                                           'Characterization')['#text']

    def get_type(self, obj):
        # Map the page's schema.org itemtype onto a SHARE schema name;
        # unknown types fall back to the generic CreativeWork.
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        # The <h1> contains a "label" span that is not part of the title;
        # decompose() mutates the parsed tree in place before extracting text.
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        # Return the <dd> following the <dt> whose text equals `dt`,
        # wrapped for dict-style access, or None when the <dt> is absent.
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        # NOTE(review): when no "Rights" <dd> exists this raises
        # AttributeError on `dd.soup`; presumably the surrounding
        # tools.OneOf chain relies on that to fall through — confirm
        # before changing.
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i',
                            class_='glyphicon-new-window').parent['href']
class Preprint(Parser):
    # Parses preprint metadata scraped from page <meta> tags (DC.* /
    # citation_* / OpenGraph keys).
    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # is_deleted
    date_published = tools.ParseDate(tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # free_to_read_type
    # free_to_read_date
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Biology'),
                         tools.Subjects(tools.Try(ctx['subject-areas'])))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx['category']),
                     tools.Try(ctx['subject-areas']))
    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            tools.Try(ctx['og:url']),
                            ctx['citation_public_url'], ctx['citation_doi'])
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx['DC.Publisher'])),
        tools.Map(tools.Delegate(Creator),
                  tools.RunPython('get_contributors', ctx)))
    # related_works

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    def _meta_list(self, link, key):
        """Return link[key] coerced to a list ([] when the key is absent).

        Meta tags that appear once parse as a scalar, repeated tags as a
        list; this normalizes both shapes with a single lookup.
        """
        value = link.get(key, [])
        return value if isinstance(value, list) else [value]

    def get_contributors(self, link):
        """Zip the parallel citation_author* meta fields into contributor
        dicts; shorter lists are padded with None by zip_longest."""
        authors = self._meta_list(link, 'citation_author')
        institutions = self._meta_list(link, 'citation_author_institution')
        emails = self._meta_list(link, 'citation_author_email')
        return [
            {'author': author, 'institution': institution, 'email': email}
            for author, email, institution in itertools.zip_longest(
                authors, emails, institutions)
        ]
class Preprint(Parser):
    # Maps an RSS/OAI-style item (dc:* keys) onto a preprint.
    title = ctx.item['dc:title']
    description = ctx.item.description
    contributors = tools.Map(tools.Delegate(Contributor),
                             ctx.item['dc:creator'])
    date_published = ctx.item['dc:date']
    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx.item['dc:publisher'])
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.item['dc:identifier'])
    # Every record from this source gets the same fixed subject.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology and life sciences')))
class Preprint(Parser):
    # Maps an RSS/OAI-style item (dc:* keys) onto a preprint.
    title = ctx.item['dc:title']
    description = ctx.item.description
    # Only one date is available, so it serves as both published and updated.
    date_published = tools.ParseDate(ctx.item['dc:date'])
    date_updated = tools.ParseDate(ctx.item['dc:date'])
    # Every record from this source gets the same fixed subject.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology'))
    )
    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            ctx.item['dc:identifier'])
    related_agents = tools.Concat(
        tools.Delegate(Publisher, ctx.item['dc:publisher']),
        tools.Map(tools.Delegate(Creator), ctx.item['dc:creator']),
    )
class Preprint(Parser):
    # Maps a JSON-API style record (attributes/embeds/links) onto a preprint.
    title = ctx.attributes.title
    description = ctx.attributes.description
    contributors = tools.Map(tools.Delegate(Contributor), ctx.contributors)
    institutions = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Institution))),
        ctx.embeds.affiliated_institutions.data)
    # rights = tools.Try(ctx.attributes.node_license)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.links.html)
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category,
                     ctx.attributes.tags)
    # Every record from this source gets the same fixed subject.
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Engineering and technology'))

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
        date_modified = ctx.attributes.date_modified
class CreativeWork(Parser):
    # Maps a JSON-API style record (attributes/links) onto a generic work.
    title = ctx.attributes.title
    description = ctx.attributes.description
    is_deleted = tools.Static(False)
    # date_published =
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # free_to_read_type =
    # free_to_read_date =
    # rights = tools.Try(ctx.attributes.node_license)  # doesn't seem to have any useful information
    # language =
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.links.html,
                            ctx.links.self)
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.attributes.category,
                     ctx.attributes.tags)

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
class Preprint(osf.Project):
    # Specializes the OSF project parser for preprint records.
    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)
    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         ctx.attributes.subjects)
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx.links.self,
                            ctx.links.html, tools.Try(ctx.links.doi))
    tags = tools.Map(tools.Delegate(ThroughTags),
                     tools.Try(ctx.attributes.tags))
    rights = tools.Try(ctx.attributes.node_license)
    related_works = tools.Static([])
    # Contributors are split by the bibliographic flag: bibliographic ones
    # become Creators, the rest plain Contributors.
    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(lambda x: x['attributes']['bibliographic'],
                         ctx.contributors)),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(lambda x: not x['attributes']['bibliographic'],
                         ctx.contributors)),
    )
class OAICreativeWork(Parser):
    """Base parser for OAI-PMH Dublin Core records.

    Subclasses may set ``type_map`` (lowercase dc:type -> schema name) and
    ``default_type`` to control which schema ``get_schema`` resolves to.
    """

    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)))

    title = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:title'])))

    description = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    # Identifiers come from both dc:identifier and the OAI header; values
    # that are citations rather than URIs are filtered out, the rest are
    # normalized to IRIs (parse failures dropped) and de-duplicated.
    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        'force_text',
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier'])))))))

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(tools.Try(tools.IRI(), exceptions=(ValueError, )),
                          tools.RunPython('get_relation', ctx)))))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator),
                  tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor),
                  tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython('force_text',
                            tools.Try(ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]), )

    # Subjects and tags are both derived from the same tokenized pool of
    # setSpec / dc:type / dc:format / dc:subject values.
    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )))))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                'force_text',
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                ))),
            deep=True))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure. """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        """Return True when the OAI header marks the record as deleted."""
        return status == 'deleted'

    def get_schema(self, types):
        """Resolve the record's dc:type value(s) to a schema name.

        Accepts a single string, a list of strings, or ``{'#text': ...}``
        dicts; the first value found in ``type_map`` wins, otherwise
        ``default_type`` is returned.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        """Normalize XML-parsed values to plain text.

        A dict is unwrapped to its '#text' key, a string passes through
        unchanged, and a list (or None) is flattened the same way; None
        entries and dicts lacking '#text' are skipped with a warning.
        """
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data
        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    # logger.warn is deprecated; logger.warning is the
                    # supported spelling.
                    logger.warning('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                # Anything other than None/str/dict is unexpected parser
                # output; a TypeError is more precise than bare Exception
                # and still caught by handlers catching Exception.
                raise TypeError(datum)
        return fixed

    def tokenize(self, data):
        """Split free-text values on ' - ', '.' and ',' into stripped tokens."""
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        """Return dc:relation values that do not duplicate the record's
        own identifiers (compared with scheme/separator characters
        stripped)."""
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get(
            'dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers, )
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i
                              for i in identifiers if i)
        identifiers = re.sub(
            'http|:|/', '',
            identifiers + ctx['record']['header']['identifier'])
        if isinstance(relation, dict):
            relation = (relation['#text'], )
        return [
            r for r in relation
            if r and re.sub('http|:|/', '', r) not in identifiers
        ]
class Preprint(EngrxivPreprint):
    # Identical to the engrXiv preprint parser except for the fixed
    # top-level subject.
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Social and behavioral sciences'))
class Preprint(Parser):
    # Parses preprint metadata scraped from page <meta> tags (DC.* /
    # citation_* / OpenGraph keys).
    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    contributors = tools.Map(tools.Delegate(Contributor),
                             tools.RunPython('get_contributors', ctx))
    links = tools.Concat(
        tools.Map(tools.Delegate(ThroughLinks),
                  tools.Concat(ctx['og:url'], ctx['citation_public_url'])),
        tools.Map(tools.Delegate(DoiThroughLinks),
                  tools.Concat(ctx['citation_doi'])))
    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx['DC.Publisher'])
    date_updated = tools.ParseDate(ctx['DC.Date'])
    date_published = tools.ParseDate(ctx['article:published_time'])
    language = tools.Try(ctx['DC.Language'])
    rights = tools.Try(ctx['DC.Rights'])
    tags = tools.Map(tools.Delegate(ThroughTags),
                     tools.Concat(tools.Try(ctx['category'])))
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Static('Biology and life sciences'),
        ctx['subject-areas'],
    )

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']
        record_type = ctx['type']
        contributors = ctx['DC.Contributor']
        citation_author = ctx['citation_author']
        citation_author_institution = ctx['citation_author_institution']
        citation_author_email = ctx['citation_author_email']

    def _as_list(self, value):
        """Coerce a scalar meta value to a one-element list; lists pass
        through unchanged. Meta tags that appear once parse as a scalar,
        repeated tags as a list."""
        return value if isinstance(value, list) else [value]

    def get_contributors(self, link):
        """Zip the parallel citation_author* meta fields into contributor
        dicts; shorter lists are padded with None by zip_longest.

        Missing keys raise KeyError, matching the original behavior (the
        Extra class also accesses these keys unconditionally).
        """
        authors = self._as_list(link['citation_author'])
        institutions = self._as_list(link['citation_author_institution'])
        emails = self._as_list(link['citation_author_email'])
        return [
            {'author': author, 'institution': institution, 'email': email}
            for author, email, institution in itertools.zip_longest(
                authors, emails, institutions)
        ]
class Link(Parser):
    # The raw value is normalized to text first, then formatted as a DOI URL.
    url = tools.RunPython('format_link', tools.RunPython(force_text, ctx))
    type = tools.Static('doi')

    def format_link(self, link):
        # Delegate to the module-level helper; note that format_doi_as_url
        # takes the parser instance as its first argument.
        return format_doi_as_url(self, link)
class AlternateLink(Parser):
    # Emitted as a Link record despite the distinct parser name.
    schema = 'Link'
    url = tools.RunPython(force_text, ctx)
    type = tools.Static('misc')
class Link(Parser):
    # The context value is used verbatim as a provider link.
    url = ctx
    type = tools.Static('provider')
class Identifier(Parser):
    # The context value is the identifier itself, resolved against osf.io.
    url = ctx
    base_url = tools.Static('https://osf.io/')
class Preprint(normalizer.Project):
    # Overrides the base project parser with a single fixed subject.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static({'text': 'Social and behavioral sciences'})))