class PersonnelAgent(Parser):
    """Agent built from a personnel record with First_Name/Last_Name and a contact address."""
    schema = tools.GuessAgentType(
        tools.RunPython('combine_first_last_name', ctx))
    name = tools.RunPython('combine_first_last_name', ctx)
    location = tools.RunPython('get_address', ctx['Contact_Address'])

    class Extra:
        # Raw source fields preserved alongside the parsed agent.
        role = tools.Try(ctx.Role)
        url = tools.Try(ctx.Data_Center_URL)

    def combine_first_last_name(self, ctx):
        """Join First_Name and Last_Name into a single display name."""
        return ctx['First_Name'] + ' ' + ctx['Last_Name']

    def get_address(self, ctx):
        """Build a formatted address from a Contact_Address dict.

        'Address' may be either a single string or a list of address lines.
        """
        address = ctx['Address']
        if isinstance(address, list):
            address1 = address[0]
            # Guard single-line lists; the original unconditionally read index 1.
            address2 = address[1] if len(address) > 1 else None
            return format_address(
                address1=address1,
                address2=address2,
                city=ctx['City'],
                state_or_province=ctx['Province_or_State'],
                postal_code=ctx['Postal_Code'],
                country=ctx['Country'])
        # BUG FIX: the original passed an undefined local `address2` here,
        # raising NameError whenever 'Address' was a plain string. A single
        # address string has no second line, so pass None.
        return format_address(
            address1=address,
            address2=None,
            city=ctx['City'],
            state_or_province=ctx['Province_or_State'],
            postal_code=ctx['Postal_Code'],
            country=ctx['Country'])
class CreativeWork(Parser):
    """Work scraped from an HTML landing page (navigated via Soup/SoupXMLDict).

    Metadata comes from schema.org microdata (`itemprop` attributes) plus a
    <dt>/<dd> definition list of labelled fields on the page.
    """
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])
    # Prefer the external rights URL; fall back to the plain <dd> text, else None.
    rights = tools.OneOf(tools.RunPython('get_rights_url', ctx),
                         tools.RunPython('get_dd', ctx, 'Rights')['#text'],
                         tools.Static(None))
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Raw <dd> values preserved verbatim.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        # Map the page's schema.org itemtype to a work type, defaulting to
        # the generic CreativeWork for unrecognized itemtypes.
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        # The <h1> contains a span with class 'label' that is removed
        # (decomposed) before taking the remaining text as the title.
        title = obj.h1.soup
        title.find('span', class_='label').decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        """Return the <dd> sibling of the <dt> whose text equals `dt`, or None."""
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        # NOTE(review): raises AttributeError when the Rights <dd> or its
        # external-link icon is absent — presumably swallowed by the enclosing
        # tools.OneOf, which then falls through to the next option; confirm.
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i',
                            class_='glyphicon-new-window').parent['href']
class Extra:
    # Raw labelled <dd> values (looked up by their <dt> caption via get_dd)
    # preserved verbatim alongside the parsed work.
    gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
    related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
    previous_publication_information = tools.RunPython(
        'get_dd', ctx, 'Previous Publication Information')['#text']
    depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
    characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']
class RelatedLink(Parser):
    """Link to a related resource, typed by its relatedIdentifierType attribute."""
    schema = 'Link'
    url = tools.RunPython(force_text, ctx)
    type = tools.RunPython('lower', tools.Try(ctx['@relatedIdentifierType']))

    def lower(self, type):
        # Normalize the identifier type, e.g. 'DOI' -> 'doi'.
        lowered = type.lower()
        return lowered
class Registration(Parser):
    """Study registration; every field is looked up through the FIELDS mapping
    of human-readable captions to source keys."""
    title = ctx[FIELDS['title']]
    description = ctx[FIELDS['summary']]
    # Both dates come from the same registration timestamp.
    date_published = tools.ParseDate(
        ctx[FIELDS['registration date']].timestamp)
    date_updated = tools.ParseDate(ctx[FIELDS['registration date']].timestamp)
    related_agents = tools.Concat(
        tools.Delegate(PrincipalInvestigator,
                       ctx[FIELDS['primary investigator']]),
        tools.Delegate(OtherInvestigator, ctx[FIELDS['other investigator']]),
        tools.Map(
            tools.Delegate(AdditionalInvestigator),
            tools.RunPython('split_names',
                            ctx[FIELDS['additional investigators']])))
    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            tools.RunPython('get_link', ctx.id))

    class Extra:
        # Raw registration fields preserved verbatim.
        registration_date = ctx[FIELDS['registration date']]
        questions_and_objectives = ctx[FIELDS['questions and objectives']]
        study_type = ctx[FIELDS['study type']]
        study_type_detail = ctx[FIELDS['study type other']]
        contact_details = ctx[FIELDS['contact details']]
        participating_institutions = ctx[FIELDS['participating institutions']]
        countries_of_recruitment = ctx[FIELDS['countries of recruitment']]
        funders = ctx[FIELDS['funders']]
        problems_studied = ctx[FIELDS['health conditions or problems studied']]
        patient_population = ctx[FIELDS['patient population']]
        interventions = ctx[FIELDS['interventions']]
        inclusion_criteria = ctx[FIELDS['inclusion criteria']]
        exclusion_criteria = ctx[FIELDS['exclusion criteria']]
        control_or_comparators = ctx[FIELDS['control or comparators']]
        primary_outcomes = ctx[FIELDS['primary outcomes']]
        key_secondary_outcomes = ctx[FIELDS['key secondary outcomes']]
        target_sample_size = ctx[FIELDS['target sample size']]
        recruitment_status = ctx[FIELDS['recruitment status']]
        other_recruitment_status = ctx[FIELDS['other recruitment status']]
        first_enrollment_date = ctx[FIELDS['first enrollment date']]
        expected_enrollment_completion_date = ctx[
            FIELDS['expected enrollment completion date']]
        expected_research_completion_date = ctx[
            FIELDS['expected research completion date']]
        ethical_approval = ctx[FIELDS['ethical approval']]
        ethical_approval_details = ctx[FIELDS['ethical approval details']]
        ethical_committee_judgment = ctx[FIELDS['ethical committee judgment']]
        data = ctx[FIELDS['data']]
        published_paper = ctx[FIELDS['published paper identifier']]
        study_website = ctx[FIELDS['study website']]
        study_results = ctx[FIELDS['study results']]

    def get_link(self, id):
        """Build the registration's permanent link from its id."""
        return LINK_FORMAT.format(id)

    def split_names(self, obj):
        """Split a comma-separated investigator string; None for empty input."""
        if not obj:
            return None
        return obj.split(',')
class CreativeWork(Parser):
    """Work parsed from a normalized provider document (contributors, uris, etc.)."""
    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython('_is_deleted', tools.Try(ctx.otherProperties))
    date_updated = tools.ParseDate(tools.Try(ctx.providerUpdatedDateTime))
    rights = tools.Join(tools.Try(ctx.licenses.uri))
    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.languages[0]))
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)))
    # Every known URI for the work, deduplicated and turned into identifiers.
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(tools.Try(ctx.uris.canonicalUri),
                             tools.Try(ctx.uris.providerUris),
                             tools.Try(ctx.uris.descriptorUris),
                             tools.Try(ctx.uris.objectUris)))))
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Subjects(tools.Try(ctx.subjects)))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx.tags),
                     tools.Try(ctx.subjects))

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match their original entry to preserve raw data structure. """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        """Return the distinct items in sorted order.

        Idiom fix: sorted() already returns a new list, so the original
        `list(sorted(...))` wrapper was redundant (ruff C413).
        """
        return sorted(set(items))

    def _is_deleted(self, properties):
        """True when the 'status' entry of otherProperties marks the work deleted."""
        for prop in properties or []:
            if prop['name'] == 'status':
                return 'deleted' in prop['properties'].get('status', [])
        return False
class Organization(Parser):
    """Organization whose name and location share one comma-separated string."""
    schema = tools.GuessAgentType(ctx)
    name = tools.RunPython('get_name', ctx)
    location = tools.RunPython('get_location', ctx)

    def get_name(self, context):
        """Everything before the first comma is the organization name."""
        return context.split(',')[0]

    def get_location(self, context):
        """Everything after the first comma, or None when no comma is present.

        BUG FIX: str.partition always returns a 3-tuple, so the original
        `len(spl) > 1` check was always true — the `return None` branch was
        dead and '' was returned for comma-less input. Checking the separator
        element restores the intended None.
        """
        _, sep, location = context.partition(',')
        if sep:
            return location
        return None
class OAIAgentWorkRelation(Parser):
    """Relation between an OAI agent and a work, keeping the raw cited-as text."""
    schema = 'AgentWorkRelation'
    agent = tools.Delegate(OAIAgent, tools.RunPython('force_text', ctx))
    cited_as = tools.RunPython('force_text', ctx)

    def force_text(self, data):
        # Accept either a plain string or an XML-style dict holding '#text';
        # anything else is a malformed record.
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            return data['#text']
        raise TypeError(data)
class Extra:
    # Raw nameIdentifier element: its text content, scheme, and scheme URI.
    name_identifier = tools.Try(
        tools.RunPython(force_text, ctx.nameIdentifier))
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme'])
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI'])
class Link(Parser):
    """DOI link — the identifier handled here is always a DOI."""
    url = tools.RunPython('format_doi', ctx)
    type = tools.Static('doi')

    def format_doi(self, doi):
        # Delegate URL construction to the shared helper.
        return format_doi_as_url(self, doi)
class Link(Parser):
    """Provider link pointing back at the NSF award-search page for this award."""
    url = tools.RunPython('format_url', ctx)
    type = tools.Static('provider')

    def format_url(self, ctx):
        # Interpolate the award id into the public award-search URL.
        template = 'https://www.nsf.gov/awardsearch/showAward?AWD_ID={}'
        return template.format(ctx['id'])
class FunderAgent(Parser):
    """Funding agent; name may come from either funderName or contributorName."""
    schema = tools.GuessAgentType(tools.OneOf(ctx.funderName,
                                              ctx.contributorName),
                                  default='organization')
    name = tools.OneOf(ctx.funderName, ctx.contributorName)
    # Build an IRI identifier from whichever identifier field exists;
    # ValueError from IRI parsing is tolerated (no identifier emitted).
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(tools.IRI(
            tools.OneOf(ctx.funderIdentifier,
                        tools.RunPython(force_text, ctx.nameIdentifier),
                        tools.Static(None))),
                  exceptions=(ValueError, )))

    class Extra:
        # Raw identifier fields preserved verbatim.
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme'])
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI'])
        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)
        contributor_type = tools.Try(ctx.contributorType)
class Identifier(Parser):
    """An identifier URL plus its scheme://host base."""
    url = ctx
    base_url = tools.RunPython('get_base_url', ctx)

    def get_base_url(self, url):
        # Reduce the full URL down to just its scheme and host.
        parsed = furl.furl(url)
        return '{}://{}'.format(parsed.scheme, parsed.host)
class Person(Parser):
    """Contributor person; user attributes come from the embedded users
    relationship, falling back to the embed's errors[0].meta blob."""
    given_name = tools.OneOf(
        ctx.embeds.users.data.attributes.given_name,
        ctx.embeds.users.errors[0].meta.given_name,
    )
    family_name = tools.OneOf(
        ctx.embeds.users.data.attributes.family_name,
        ctx.embeds.users.errors[0].meta.family_name,
    )
    additional_name = tools.OneOf(
        ctx.embeds.users.data.attributes.middle_names,
        ctx.embeds.users.errors[0].meta.middle_names,
    )
    suffix = tools.OneOf(
        ctx.embeds.users.data.attributes.suffix,
        ctx.embeds.users.errors[0].meta.suffix,
    )
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        # The profile html link is suppressed for unregistered contributors
        # (see `registered` below).
        tools.RunPython('registered', ctx.embeds.users.data.links.html),
        tools.Try(ctx.embeds.users.data.links.profile_image),
    )

    class Extra:
        locale = tools.Try(ctx.embeds.users.data.attributes.locale)
        date_registered = tools.Try(
            ctx.embeds.users.data.attributes.date_registered)
        active = tools.Try(ctx.embeds.users.data.attributes.active)
        timezone = tools.Try(ctx.embeds.users.data.attributes.timezone)

    def registered(self, context):
        # Unregistered contributors get no html-link identifier.
        if self.context['attributes']['unregistered_contributor']:
            return None
        return context
class Preprint(Parser):
    """Preprint parsed from an arXiv Atom feed entry."""
    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    links = tools.Map(tools.Delegate(ThroughLinks),
                      tools.Try(ctx.entry['arxiv:doi']), ctx.entry.id)
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython('get_subjects',
                            tools.Concat(tools.Try(ctx.entry.category)))))
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        # Raw arXiv-specific entry fields preserved verbatim.
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])

    def get_subjects(self, link):
        """Extract the '@term' attribute from each <category> element.

        Idiom fix: `list(map(lambda ...))` replaced by the equivalent
        list comprehension (ruff C417).
        """
        return [category['@term'] for category in link]
class Link(Parser):
    """Link whose type is inferred from its text, with DOIs expanded to URLs."""
    url = tools.RunPython('format_link', ctx)
    type = tools.RunPython('get_link_type', ctx)

    def get_link_type(self, link):
        """Classify a raw link as 'doi', 'provider', or 'misc'."""
        if 'doi' in link:
            return 'doi'
        home_page = self.config.home_page
        if home_page and home_page in link:
            return 'provider'
        return 'misc'

    def format_link(self, link):
        """Expand bare DOIs into full URLs; all other links pass through."""
        if self.get_link_type(link) == 'doi':
            return format_doi_as_url(self, link)
        return link
class Person(Parser):
    """Person from a First_Name/Last_Name record; name parts are derived by
    running ParseName over the combined full name."""
    suffix = tools.ParseName(
        tools.RunPython('combine_first_last_name', ctx)
    ).suffix
    family_name = tools.ParseName(
        tools.RunPython('combine_first_last_name', ctx)
    ).last
    given_name = tools.ParseName(
        tools.RunPython('combine_first_last_name', ctx)
    ).first
    additional_name = tools.ParseName(
        tools.RunPython('combine_first_last_name', ctx)
    ).middle
    location = tools.RunPython('get_address', ctx['Contact_Address'])

    class Extra:
        role = tools.Maybe(ctx, 'Role')

    def combine_first_last_name(self, ctx):
        """Join First_Name and Last_Name into a single display name."""
        return ctx['First_Name'] + ' ' + ctx['Last_Name']

    def get_address(self, ctx):
        """Build a formatted address from a Contact_Address dict.

        'Address' may be a single string or a list of address lines.
        BUG FIX: the non-list branch passed an undefined local `address2`,
        raising NameError whenever 'Address' was a plain string; it now
        passes None.
        """
        address = ctx['Address']
        if isinstance(address, list):
            address1 = address[0]
            # Guard single-line lists; the original unconditionally read index 1.
            address2 = address[1] if len(address) > 1 else None
            return format_address(
                self,
                address1=address1,
                address2=address2,
                city=ctx['City'],
                state_or_province=ctx['Province_or_State'],
                postal_code=ctx['Postal_Code'],
                country=ctx['Country']
            )
        return format_address(
            self,
            address1=address,
            address2=None,
            city=ctx['City'],
            state_or_province=ctx['Province_or_State'],
            postal_code=ctx['Postal_Code'],
            country=ctx['Country']
        )
class Venue(Parser):
    """Geographic location attached to a work."""
    name = tools.Try(tools.RunPython(force_text, ctx.geoLocationPlace))

    class Extra:
        # Raw bounding-box and point coordinates, preserved verbatim.
        polygon = tools.Try(ctx.geoLocationBox)
        point = tools.Try(ctx.geoLocationPoint)
class Agent(Parser):
    """Person or organization scraped from a schema.org itemscope element."""
    schema = tools.RunPython('get_type', ctx)
    name = Soup(ctx, itemprop='name')['#text']

    def get_type(self, obj):
        # Translate the element's schema.org itemtype into an agent type.
        # An unrecognized itemtype raises KeyError (no default).
        mapping = {
            'http://schema.org/Person': 'Person',
            'http://schema.org/Organization': 'Organization',
        }
        return mapping[obj.soup['itemtype']]
class Organization(Parser):
    # Personnel whose first or last name contains one of these words are
    # treated as organizations rather than people.
    ORGANIZATION_KEYWORDS = (
        'the',
        'center'
    )
    name = tools.RunPython('combine_name', ctx)
    url = tools.Maybe(ctx, 'Data_Center_URL')
    # TODO: handle when personnel are organizations
    affiliations = tools.Map(
        tools.Delegate(Affiliation),
        tools.RunPython(
            'get_personnel',
            tools.Maybe(ctx, 'Personnel'),
            'person'
        )
    )

    def combine_name(self, ctx):
        """Join the data center's short and long names."""
        return ctx['Data_Center_Name']['Short_Name'] + ' ' + ctx['Data_Center_Name']['Long_Name']

    def get_personnel(self, options, entity):
        """ Returns list based on entity type. """
        if not isinstance(options, list):
            options = [options]
        if entity == 'person':
            # Keep only entries whose names do not look like organizations.
            return [
                value for value in options
                if not self.list_in_string(value['First_Name'], self.ORGANIZATION_KEYWORDS)
                and not self.list_in_string(value['Last_Name'], self.ORGANIZATION_KEYWORDS)
            ]
        return options

    def list_in_string(self, string, list_):
        """True if any word in `list_` occurs in `string`, case-insensitively.

        Idiom fix: `if any(...): return True / return False` collapsed to a
        single `return any(...)` (ruff SIM103).
        """
        return any(word in string.lower() for word in list_)
class WorkIdentifier(Parser):
    """Permanent Earth System Grid URL derived from the dataset's Entry_ID."""
    uri = tools.RunPython('get_ncar_identifier', ctx)

    class Extra:
        # Raw Related_URL metadata preserved verbatim.
        description = tools.Try(ctx.Related_URL.Description)
        url_content_type = tools.Try(ctx.Related_URL.URL_Content_Type.Type)

    def get_ncar_identifier(self, ctx):
        # Interpolate the entry id into the canonical dataset URL.
        entry_id = ctx['Entry_ID']
        return 'https://www.earthsystemgrid.org/dataset/{}.html'.format(entry_id)
class FunderRelation(Parser):
    """Funder relation; awards are only emitted when the record has an awardURI."""
    schema = 'Funder'
    agent = tools.Delegate(FunderAgent, ctx)
    awards = tools.Map(tools.Delegate(ThroughAwards),
                       tools.Try(tools.RunPython('get_award', ctx)))

    def get_award(self, obj):
        # The bare subscript is a presence check, not a no-op: it raises
        # KeyError when 'awardURI' is missing, so no award is emitted.
        # NOTE(review): this relies on the enclosing tools.Try swallowing
        # the KeyError — confirm before simplifying.
        obj['awardURI']
        return obj
class Link(Parser):
    """Raw link classified by well-known URL fragments."""
    url = ctx
    type = tools.RunPython('get_link_type', ctx)

    def get_link_type(self, link):
        # First matching fragment wins; anything unrecognized is 'misc'.
        for fragment, link_type in (('dx.doi.org', 'doi'),
                                    ('biorxiv.org', 'provider')):
            if fragment in link:
                return link_type
        return 'misc'
class Preprint(Parser):
    """Preprint parsed from bioRxiv-style HTML meta tags (DC.* / citation_*)."""
    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # is_deleted
    date_published = tools.ParseDate(tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # free_to_read_type
    # free_to_read_date
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Biology'),
                         tools.Subjects(tools.Try(ctx['subject-areas'])))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx['category']),
                     tools.Try(ctx['subject-areas']))
    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            tools.Try(ctx['og:url']),
                            ctx['citation_public_url'], ctx['citation_doi'])
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx['DC.Publisher'])),
        tools.Map(tools.Delegate(Creator),
                  tools.RunPython('get_contributors', ctx)))
    # related_works

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    def get_contributors(self, link):
        """Zip the citation_author* meta tags into contributor dicts.

        Each citation_* value is a single string when the page has one author
        and a list otherwise; missing keys yield empty lists, and zip_longest
        pads the shorter sequences with None.
        """
        authors = self._meta_list(link, 'citation_author')
        institutions = self._meta_list(link, 'citation_author_institution')
        emails = self._meta_list(link, 'citation_author_email')
        return [
            {'author': author, 'institution': institution, 'email': email}
            for author, email, institution in itertools.zip_longest(
                authors, emails, institutions)
        ]

    def _meta_list(self, link, key):
        # Coerce a meta-tag value to a list. The original repeated this
        # ternary (with a redundant second dict lookup) three times inline.
        value = link.get(key, [])
        return value if isinstance(value, list) else [value]
class RelatedWorkIdentifier(Parser):
    """Identifier of a related work, normalized to an IRI."""
    schema = 'WorkIdentifier'
    uri = tools.IRI(tools.RunPython(force_text, ctx))

    class Extra:
        # Raw relatedIdentifier attributes preserved verbatim.
        related_identifier_type = ctx['@relatedIdentifierType']
        relation_type = tools.Try(ctx['@relationType'])
        related_metadata_scheme = tools.Try(ctx['@relatedMetadataScheme'])
        scheme_URI = tools.Try(ctx['@schemeURI'])
        scheme_type = tools.Try(ctx['@schemeType'])
class Link(Parser):
    """Link built from a Related_URL record; DOIs are expanded to full URLs."""
    url = tools.RunPython('format_link', ctx.URL)
    type = tools.RunPython('get_link_type', ctx.URL)

    class Extra:
        # Raw Related_URL metadata preserved verbatim.
        description = tools.Maybe(ctx, 'Description')
        url_content_type = tools.Maybe(ctx.URL_Content_Type, 'Type')

    def get_link_type(self, link):
        """Classify a raw link as 'doi', 'provider', or 'misc'."""
        if 'dx.doi.org' in link:
            return 'doi'
        home_page = self.config.home_page
        if home_page and home_page in link:
            return 'provider'
        return 'misc'

    def format_link(self, link):
        """Expand bare DOIs into full URLs; other links pass through."""
        if self.get_link_type(link) != 'doi':
            return link
        return format_doi_as_url(self, link)
class Extra: """ Fields that are combined in the base parser are relisted as singular elements that match their original entry to preserve raw data structure. """ # An entity responsible for making contributions to the resource. contributor = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor') # The spatial or temporal topic of the resource, the spatial applicability of the resource, # or the jurisdiction under which the resource is relevant. coverage = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage') # An entity primarily responsible for making the resource. creator = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator') # A point or period of time associated with an event in the lifecycle of the resource. dates = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date') # The file format, physical medium, or dimensions of the resource. resource_format = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format') # An unambiguous reference to the resource within a given context. identifiers = tools.Concat( tools.Try(ctx['record']['metadata']['dc']['dc:identifier']), tools.Maybe(ctx['record']['header'], 'identifier')) # A related resource. relation = tools.RunPython('get_relation', ctx) # A related resource from which the described resource is derived. source = tools.Maybe( tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source') # The topic of the resource. subject = tools.Try(ctx.record.metadata.dc['dc:subject']) # The nature or genre of the resource. resource_type = tools.Try(ctx.record.metadata.dc['dc:type']) set_spec = tools.Maybe(ctx.record.header, 'setSpec') # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage language = tools.Try(ctx.record.metadata.dc['dc:language']) # Status in the header, will exist if the resource is deleted status = tools.Maybe(ctx.record.header, '@status')
class OAILink(Parser):
    """Link harvested from an OAI record, classified and normalized."""
    schema = 'Link'
    url = tools.RunPython('format_link', ctx)
    type = tools.RunPython('get_link_type', ctx)

    # TODO: account for other types of links, i.e. ISBN
    def get_link_type(self, link):
        """Classify a raw link as 'doi', 'provider', or 'misc'."""
        if 'dx.doi.org' in link:
            return 'doi'
        home_page = self.config.home_page
        if home_page and home_page in link:
            return 'provider'
        return 'misc'

    def format_link(self, link):
        """Expand bare DOIs into full URLs.

        A DOI that already contains 'http' is assumed resolved and passes
        through untouched, as do all non-DOI links.
        """
        if self.get_link_type(link) == 'doi' and 'http' not in link:
            return format_doi_as_url(self, link)
        return link
class Award(Parser):
    """Award record from the NSF award-search API."""
    name = ctx.title
    # NOTE(review): the obligated-funds amount is used as the description as
    # well as the amount below — looks intentional upstream; confirm.
    description = ctx.fundsObligatedAmt
    award_amount = tools.Int(ctx.fundsObligatedAmt)
    date = tools.ParseDate(ctx.date)
    uri = tools.RunPython(format_url, ctx.id)

    class Extra:
        # Raw API fields preserved verbatim.
        funds_obligated_amt = ctx.fundsObligatedAmt
        award_id = ctx.id
        awardee_name = tools.Try(ctx.awardeeName)
        awardee_city = ctx.awardeeCity
        awardee_state_code = tools.Try(ctx.awardeeStateCode)
        date = ctx.date
class Award(Parser):
    """Award record linking back to the NSF award-search page."""
    description = ctx.fundsObligatedAmt
    url = tools.RunPython('format_url', ctx)

    def format_url(self, ctx):
        """Build the public NSF award-search URL for this award id."""
        return 'https://www.nsf.gov/awardsearch/showAward?AWD_ID={}'.format(
            ctx['id'])

    class Extra:
        # Raw API fields preserved verbatim.
        # FIX: the original assigned awardee_city twice; duplicate removed.
        awardee_city = ctx.awardeeCity
        funds_obligated_amt = ctx.fundsObligatedAmt
        name = tools.Try(ctx.awardeeName)
        awardee_state_code = tools.Try(ctx.awardeeStateCode)