def schema(self):
    """Map NIH ExPORTER XML fields onto the normalized scrAPI document schema."""
    return {
        "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
        "uris": {
            "canonicalUri": ("//APPLICATION_ID/node()",
                             compose(self.construct_project_url, single_result)),
            "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                               self.construct_descriptor_uris)
        },
        # NOTE(review): this path lacks the leading '//' used everywhere else --
        # confirm it still matches AWARD_NOTICE_DATE in the source XML.
        "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()",
                                    compose(datetime_formatter, single_result)),
        "title": ('//PROJECT_TITLE/node()', single_result),
        "tags": '//PROJECT_TERMSX/TERM/node()',
        "otherProperties": build_properties(
            ("applicationID", "//APPLICATION_ID/node()"),
            ('activity', '//ACTIVITY/node()'),
            ('administeringIC', '//ADMINISTERING_IC/node()'),
            ('arraFunded', '//ARRA_FUNDED/node()'),
            ('budgetStart', '//BUDGET_START/node()'),
            ('budgetEnd', '//BUDGET_END/node()'),
            ('FOANumber', '//FOA_NUMBER/node()'),
            ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
            ('fundingICs', '//FUNDING_ICs/node()'),
            ('fiscalYear', '//FY/node()'),
            ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
            ('organizationCity', '//ORG_CITY/node()'),
            # NOTE(review): 'ORG_CONTRY' looks like a typo for ORG_COUNTRY --
            # verify against the ExPORTER XML field names before changing.
            ('organizationCountry', '//ORG_CONTRY/node()'),
            ('organizationDistrict', '//ORG_DISTRICT/node()'),
            ('organizationDUNS', '//ORG_DUNS/node()'),
            ('organizationDept', '//ORG_DEPT/node()'),
            ('organizationFIPS', '//ORG_FIPS/node()'),
            ('organizationState', '//ORG_STATE/node()'),
            ('organizationZipcode', '//ORG_ZIPCODE/node()'),
            ('ICName', '//IC_NAME/node()'),
            ('organizationName', '//ORG_NAME/node()'),
            ('projectStart', '//PROJECT_START/node()'),
            ('projectEnd', '//PROJECT_END/node()'),
            ('PHR', '//PHR/node()'),
            ('serialNumber', '//SERIAL_NUMBER/node()'),
            ('studySection', '//STUDY_SECTION/node()'),
            ('studySectionName', '//STUDY_SECTION_NAME/node()'),
            ('supportYear', '//SUPPORT_YEAR/node()'),
            ('suffix', '//SUFFIX/node()'),
            ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
            ('totalCost', '//TOTAL_COST/node()'),
            ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
            ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
            ('CFDACode', '//CFDA_CODE/node()'),
            ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
            ('edInstType', '//ED_INST_TYPE/node()'),
            ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
            ('fundingMechanism', '//FUNDING_MECHANISM/node()'))
    }
def schema(self):
    """Crosswalk NIH ExPORTER XML into the scrAPI document schema."""
    # Pass-through fields carried under otherProperties, one (name, xpath) each.
    extra_fields = [
        ("applicationID", "//APPLICATION_ID/node()"),
        ('activity', '//ACTIVITY/node()'),
        ('administeringIC', '//ADMINISTERING_IC/node()'),
        ('arraFunded', '//ARRA_FUNDED/node()'),
        ('budgetStart', '//BUDGET_START/node()'),
        ('budgetEnd', '//BUDGET_END/node()'),
        ('FOANumber', '//FOA_NUMBER/node()'),
        ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
        ('fundingICs', '//FUNDING_ICs/node()'),
        ('fiscalYear', '//FY/node()'),
        ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
        ('organizationCity', '//ORG_CITY/node()'),
        # NOTE(review): 'ORG_CONTRY' looks like a typo for ORG_COUNTRY --
        # verify against the ExPORTER XML before changing.
        ('organizationCountry', '//ORG_CONTRY/node()'),
        ('organizationDistrict', '//ORG_DISTRICT/node()'),
        ('organizationDUNS', '//ORG_DUNS/node()'),
        ('organizationDept', '//ORG_DEPT/node()'),
        ('organizationFIPS', '//ORG_FIPS/node()'),
        ('organizationState', '//ORG_STATE/node()'),
        ('organizationZipcode', '//ORG_ZIPCODE/node()'),
        ('ICName', '//IC_NAME/node()'),
        ('organizationName', '//ORG_NAME/node()'),
        ('projectStart', '//PROJECT_START/node()'),
        ('projectEnd', '//PROJECT_END/node()'),
        ('PHR', '//PHR/node()'),
        ('serialNumber', '//SERIAL_NUMBER/node()'),
        ('studySection', '//STUDY_SECTION/node()'),
        ('studySectionName', '//STUDY_SECTION_NAME/node()'),
        ('supportYear', '//SUPPORT_YEAR/node()'),
        ('suffix', '//SUFFIX/node()'),
        ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
        ('totalCost', '//TOTAL_COST/node()'),
        ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
        ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
        ('CFDACode', '//CFDA_CODE/node()'),
        ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
        ('edInstType', '//ED_INST_TYPE/node()'),
        ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
        ('fundingMechanism', '//FUNDING_MECHANISM/node()'),
    ]
    return {
        "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
        "uris": {
            "canonicalUri": ("//APPLICATION_ID/node()",
                             compose(self.construct_project_url, single_result)),
            "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                               self.construct_descriptor_uris)
        },
        # NOTE(review): path has no leading '//' unlike the other fields -- confirm intended.
        "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()",
                                    compose(datetime_formatter, single_result)),
        "title": ('//PROJECT_TITLE/node()', single_result),
        "tags": '//PROJECT_TERMSX/TERM/node()',
        "otherProperties": build_properties(*extra_fields)
    }
def schema(self):
    """Schema mapping for this provider's JSON records."""
    doi_template = 'http://dx.doi.org/{}'
    return {
        'contributors': ('/authors', process_contributors),
        'uris': {
            # Keep only non-None URLs for each pair of source fields.
            'objectUris': ('/url', '/full_dataset_url',
                           compose(filter_none, lambda x, y: [x, y])),
            'descriptorUris': ('/DOI', '/paper_url',
                               compose(filter_none,
                                       lambda x, y: [(doi_template.format(x) if x else None), y])),
            'canonicalUri': '/url',
        },
        'title': '/name',
        'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
        'description': '/description',
        'otherProperties': build_properties(
            ('owner_name', '/owner_name'),
        )
    }
def schema(self):
    """Extend the base OAI schema with ICPSR-specific URI extraction."""
    overrides = {
        "uris": {
            "canonicalUri": ('//dc:identifier/node()',
                             helpers.compose(create_icpsr_url, helpers.single_result)),
            # NOTE(review): helper name 'icpsr_exttract_doi' is misspelled at its
            # definition site; renaming must happen there first.
            "objectUris": [('//dc:identifier/node()', icpsr_exttract_doi)]
        }
    }
    return helpers.updated_schema(self._schema, overrides)
def schema(self):
    """Schema mapping for Springer-style JSON records."""
    def creator_names(authors):
        # Flatten the list of {'creator': name} dicts down to bare names.
        return [author['creator'] for author in authors]

    return {
        'contributors': ('/creators', compose(default_name_parser, creator_names)),
        'uris': ('/url', process_urls),
        'title': '/title',
        'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
        'description': '/abstract',
        'freeToRead': {
            # Only open-access records expose a start date.
            'startDate': ('/openaccess', '/publicationDate',
                          lambda openaccess, published: published if openaccess == 'true' else None)
        },
        'publisher': {
            'name': '/publisher'
        },
        'subjects': ('/genre', lambda genre: [genre] if genre else []),
        'otherProperties': build_properties(
            ('url', '/url'),
            ('doi', '/doi'),
            ('isbn', '/isbn'),
            ('printIsbn', '/printIsbn'),
            ('electronicIsbn', '/electronicIsbn'),
            ('volume', '/volume'),
            ('number', '/number'),
            ('startingPage', '/startingPage'),
            ('copyright', '/copyright'),
            ('identifier', '/identifier'))
    }
def schema(self):
    """Schema mapping for Springer-style JSON records."""
    extra = build_properties(
        ('url', '/url'),
        ('doi', '/doi'),
        ('isbn', '/isbn'),
        ('printIsbn', '/printIsbn'),
        ('electronicIsbn', '/electronicIsbn'),
        ('volume', '/volume'),
        ('number', '/number'),
        ('startingPage', '/startingPage'),
        ('copyright', '/copyright'),
        ('identifier', '/identifier'))
    return {
        'contributors': ('/creators',
                         compose(default_name_parser,
                                 lambda authors: [author['creator'] for author in authors])),
        'uris': ('/url', process_urls),
        'title': '/title',
        'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
        'description': '/abstract',
        'freeToRead': {
            # Start date applies only when the record is flagged open access.
            'startDate': ('/openaccess', '/publicationDate',
                          lambda x, y: y if x == 'true' else None)
        },
        'publisher': {'name': '/publisher'},
        'subjects': ('/genre', lambda x: [x] if x else []),
        'otherProperties': extra
    }
def format_property(self, property):
    """Build the (name, (dc-xpath, ns0-xpath, transform)) schema entry for *property*."""
    if property == "date":
        # Dates are coerced to a list and each value run through the
        # datetime formatter; failures are mapped to None.
        transform = compose(
            lambda values: list(map(null_on_error(datetime_formatter), values)),
            coerce_to_list,
            self.resolve_property,
        )
    else:
        transform = self.resolve_property
    paths = (
        "//dc:{}/node()".format(property),
        "//ns0:{}/node()".format(property),
        transform,
    )
    return (property, paths)
def schema(self):
    """Override the base schema's description and DOI-based URI extraction."""
    return updated_schema(self._schema, {
        "description": ("//dc:description/node()", get_second_description),
        "uris": {
            "canonicalUri": ('//dc:identifier/node()',
                            compose(single_result, oai_extract_dois)),
            "objectUris": ('//dc:identifier/node()', oai_extract_dois)
        }
    })
def schema(self):
    """Schema for ISO 19115 (gmd) metadata records."""
    identification = './gmd:identificationInfo/gmd:MD_DataIdentification/'
    citation = identification + 'gmd:citation/gmd:CI_Citation/'
    responsible_party = citation + 'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty'
    return {
        'title': (citation + 'gmd:title', compose(xml_text_only, single_result)),
        'description': (identification + 'gmd:abstract', compose(xml_text_only, single_result)),
        'contributors': (responsible_party, compose(parse_contributors, filter_to_contributors)),
        'uris': {
            'canonicalUri': ('./gmd:fileIdentifier',
                             compose(lambda ident: str(self.canonical_base_url).format(ident),
                                     xml_text_only, single_result)),
        },
        'publisher': (responsible_party,
                      compose(extract_organization, single_result, filter_to_publishers)),
        'providerUpdatedDateTime': ('./gmd:dateStamp/gco:DateTime/node()',
                                    compose(datetime_formatter, single_result)),
        'languages': ('./gmd:language/gmd:LanguageCode',
                      compose(language_codes, xml_text_only_list, coerce_to_list)),
        'subjects': (identification + 'gmd:descriptiveKeywords/gmd:MD_Keywords',
                     lambda keywords: filter_keywords(keywords)),
    }
def format_property(self, property):
    """Return the schema entry for *property*; dates get list/formatter handling."""
    transform = self.resolve_property
    if property == 'date':
        # Dates: resolve, coerce to list, then format each (errors -> None).
        transform = compose(
            lambda xs: list(map(null_on_error(datetime_formatter), xs)),
            coerce_to_list,
            transform)
    return (
        property,
        ('//dc:{}/node()'.format(property), '//ns0:{}/node()'.format(property), transform)
    )
class DailyssrnHarvester(XMLHarvester):
    """Harvester for the dailySSRN RSS feed."""
    short_name = 'dailyssrn'
    long_name = 'Social Science Research Network'
    url = 'http://papers.ssrn.com/'

    schema = {
        "description": ('//description/node()', compose(lambda x: x.strip(), single_result)),
        "title": ('//title/node()', compose(lambda x: x.strip(), single_result)),
        # Strip, parse the RSS pubDate, then emit ISO 8601.
        "providerUpdatedDateTime": ('//pubDate/node()',
                                    compose(lambda x: x.isoformat(), parse,
                                            lambda x: x.strip(), single_result)),
        "contributors": '//contributors/node()',
        "uris": {
            "canonicalUri": ('//link/node()', compose(lambda x: x.strip(), single_result)),
        }
    }

    def harvest(self, start_date=None, end_date=None):
        """Fetch the full RSS feed and wrap each <item> as a RawDocument.

        The feed has no date filtering, so start/end dates are ignored.
        """
        feed_url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'
        response = requests.get(feed_url, force=True)
        feed = etree.XML(response.content)

        documents = []
        for item in feed.xpath('channel/item'):
            item_id = parse_id_from_url(item.xpath('link/node()'))
            documents.append(RawDocument({
                'doc': etree.tostring(item),
                'source': self.short_name,
                'docID': copy_to_unicode(item_id),
                'filetype': 'xml'
            }))
        return documents
def schema(self):
    """Point the canonical URI at the PubMed Central record URL."""
    canonical = ('//ns0:header/ns0:identifier/node()',
                 helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result))
    return helpers.updated_schema(self._schema, {"uris": {"canonicalUri": canonical}})
def schema(self):
    """Schema mapping for CrossRef works records."""
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        # date-parts is a list of [year, month, day] lists; join the first one.
        'providerUpdatedDateTime': ('/issued/date-parts',
                                    compose(datetime_formatter,
                                            lambda x: ' '.join([str(part) for part in x[0]]))),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', compose(
            lambda x: [
                process_contributor(*[
                    '{} {}'.format(entry.get('given'), entry.get('family')),
                    entry.get('ORCID')
                ]) for entry in x
            ],
            lambda x: x or [])),
        'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
        'tags': ('/subject', '/container-title',
                 lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
        'subjects': ('/subject', '/container-title',
                     lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
        # Fix: ('issue', '/issue') and ('volume', '/volume') were listed twice;
        # the redundant duplicates have been removed.
        'otherProperties': build_properties(
            ('journalTitle', '/container-title'),
            ('volume', '/volume'),
            ('issue', '/issue'),
            ('publisher', '/publisher'),
            ('type', '/type'),
            ('ISSN', '/ISSN'),
            ('ISBN', '/ISBN'),
            ('member', '/member'),
            ('score', '/score'),
            ('issued', '/issued'),
            ('deposited', '/deposited'),
            ('indexed', '/indexed'),
            ('page', '/page'),
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp'))
    }
def format_property(self, property):
    """Return the (name, (dc-path, ns0-path, transform)) entry for *property*."""
    if property != 'date':
        transform = self.resolve_property
    else:
        # Each date value is run through date_formatter; errors map to None.
        # Note the lazy map object is preserved (not materialized to a list).
        transform = compose(
            lambda values: map(null_on_error(date_formatter), values),
            coerce_to_list,
            self.resolve_property)
    dc_path = '//dc:{}/node()'.format(property)
    ns0_path = '//ns0:{}/node()'.format(property)
    return (property, (dc_path, ns0_path, transform))
def schema(self):
    """Override description handling and DOI-based URI extraction."""
    overrides = {
        "description": ("//dc:description/node()", get_second_description),
        "uris": {
            "canonicalUri": ('//dc:identifier/node()',
                             compose(single_result, oai_extract_dois)),
            "objectUris": ('//dc:identifier/node()', oai_extract_dois)
        }
    }
    return updated_schema(self._schema, overrides)
def schema(self):
    """Schema mapping for this provider's JSON records."""
    uris = {
        # Drop None entries from each URL pair.
        'objectUris': ('/url', '/full_dataset_url',
                       compose(filter_none, lambda x, y: [x, y])),
        'descriptorUris': ('/DOI', '/paper_url',
                           compose(filter_none,
                                   lambda x, y: [('http://dx.doi.org/{}'.format(x) if x else None), y])),
        'canonicalUri': '/url',
    }
    return {
        'contributors': ('/authors', process_contributors),
        'uris': uris,
        'title': '/name',
        'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
        'description': '/description',
        'otherProperties': build_properties(
            ('owner_name', '/owner_name'),
        )
    }
def schema(self):
    """Layer ICPSR-specific URI handling over the base OAI schema."""
    canonical = ('//dc:identifier/node()',
                 helpers.compose(create_icpsr_url, helpers.single_result))
    # NOTE(review): 'icpsr_exttract_doi' is misspelled where it is defined;
    # a rename has to start there.
    object_uris = [('//dc:identifier/node()', icpsr_exttract_doi)]
    return helpers.updated_schema(self._schema, {
        "uris": {
            "canonicalUri": canonical,
            "objectUris": object_uris
        }
    })
def schema(self):
    """Schema mapping for CrossRef works records."""
    # Tags and subjects share the same source fields and lowercasing rule.
    lowered_tags = ('/subject', '/container-title',
                    lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        # date-parts is a list of [year, month, day] lists; join the first one.
        'providerUpdatedDateTime': ('/issued/date-parts',
                                    compose(datetime_formatter,
                                            lambda x: ' '.join([str(part) for part in x[0]]))),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', compose(
            lambda x: [
                process_contributor(*[
                    '{} {}'.format(entry.get('given'), entry.get('family')),
                    entry.get('ORCID')
                ]) for entry in x
            ],
            lambda x: x or [])),
        'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
        'tags': lowered_tags,
        'subjects': lowered_tags,
        # Fix: ('issue', '/issue') and ('volume', '/volume') were listed twice;
        # the redundant duplicates have been removed.
        'otherProperties': build_properties(
            ('journalTitle', '/container-title'),
            ('volume', '/volume'),
            ('issue', '/issue'),
            ('publisher', '/publisher'),
            ('type', '/type'),
            ('ISSN', '/ISSN'),
            ('ISBN', '/ISBN'),
            ('member', '/member'),
            ('score', '/score'),
            ('issued', '/issued'),
            ('deposited', '/deposited'),
            ('indexed', '/indexed'),
            ('page', '/page'),
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp'))
    }
class PubMedCentralHarvester(OAIHarvester):
    """OAI-PMH harvester for PubMed Central."""
    short_name = 'pubmedcentral'
    long_name = 'PubMed Central'
    url = 'http://www.ncbi.nlm.nih.gov/pmc/'
    base_url = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'

    # Rewrite the canonical URI so it points at the PMC article page.
    schema = helpers.updated_schema(schemas.OAISCHEMA, {
        "uris": {
            "canonicalUri": (
                '//ns0:header/ns0:identifier/node()',
                helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result)
            )
        }
    })

    property_list = [
        'type', 'source', 'rights',
        'format', 'setSpec', 'date', 'identifier'
    ]
def schema(self):
    """Schema mapping for CrossRef works records (double-quoted variant)."""
    return {
        "title": ("/title", lambda x: x[0] if x else ""),
        "description": ("/subtitle", lambda x: x[0] if (isinstance(x, list) and x) else x or ""),
        # date-parts is a list of [year, month, day] lists; join and parse the first.
        "providerUpdatedDateTime": (
            "/issued/date-parts",
            lambda x: parse(" ".join([str(part) for part in x[0]])).date().isoformat(),
        ),
        "uris": {"canonicalUri": "/URL"},
        "contributors": (
            "/author",
            compose(
                lambda x: [
                    process_contributor(
                        *["{} {}".format(entry.get("given"), entry.get("family")), entry.get("ORCID")]
                    )
                    for entry in x
                ],
                lambda x: x or [],
            ),
        ),
        "sponsorships": ("/funder", lambda x: process_sponsorships(x) if x else []),
        # Fix: ('issue', '/issue') and ('volume', '/volume') were listed twice;
        # the redundant duplicates have been removed.
        "otherProperties": build_properties(
            ("journalTitle", "/container-title"),
            ("volume", "/volume"),
            ("tags", ("/subject", "/container-title",
                      lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])),
            ("issue", "/issue"),
            ("publisher", "/publisher"),
            ("type", "/type"),
            ("ISSN", "/ISSN"),
            ("ISBN", "/ISBN"),
            ("member", "/member"),
            ("score", "/score"),
            ("issued", "/issued"),
            ("deposited", "/deposited"),
            ("indexed", "/indexed"),
            ("page", "/page"),
            ("referenceCount", "/reference-count"),
            ("updatePolicy", "/update-policy"),
            ("depositedTimestamp", "/deposited/timestamp"),
        ),
    }
def schema(self):
    """Schema mapping for OSF project metadata."""
    return {
        'contributors': ('/contributors', process_contributors),
        'title': ('/title', lambda title: title or ''),
        'providerUpdatedDateTime': ('/date_registered', datetime_formatter),
        'description': '/description',
        'uris': {
            'canonicalUri': ('/url', url_from_guid),
            'providerUris': ('/url', compose(coerce_to_list, url_from_guid))
        },
        'tags': '/tags',
        'otherProperties': build_properties(
            ('parent_title', '/parent_title'),
            ('category', '/category'),
            ('wiki_link', '/wiki_link'),
            ('is_component', '/is_component'),
            ('is_registration', '/is_registration'),
            ('parent_url', '/parent_url'),
            # NOTE(review): key and path contain a space -- looks odd but is
            # preserved as written; confirm the source field name.
            ('journal Id', '/journal Id')
        )
    }
def schema(self):
    """Schema for ISO 19115 (gmd) metadata records."""
    id_stanza = './gmd:identificationInfo/gmd:MD_DataIdentification/'
    cite_stanza = id_stanza + 'gmd:citation/gmd:CI_Citation/'
    party_path = cite_stanza + 'gmd:citedResponsibleParty/gmd:CI_ResponsibleParty'

    # Shared "first node's text" extractor.
    first_text = compose(xml_text_only, single_result)

    return {
        'title': (cite_stanza + 'gmd:title', first_text),
        'description': (id_stanza + 'gmd:abstract', first_text),
        'contributors': (party_path, compose(parse_contributors, filter_to_contributors)),
        'uris': {
            'canonicalUri': (
                './gmd:fileIdentifier',
                compose(lambda x: str(self.canonical_base_url).format(x),
                        xml_text_only, single_result)
            ),
        },
        'publisher': (party_path,
                      compose(extract_organization, single_result, filter_to_publishers)),
        'providerUpdatedDateTime': ('./gmd:dateStamp/gco:DateTime/node()',
                                    compose(datetime_formatter, single_result)),
        'languages': ('./gmd:language/gmd:LanguageCode',
                      compose(language_codes, xml_text_only_list, coerce_to_list)),
        'subjects': (id_stanza + 'gmd:descriptiveKeywords/gmd:MD_Keywords',
                     lambda x: filter_keywords(x)),
    }
def schema(self):
    """Canonical URIs for PMC come from the OAI header identifier."""
    return helpers.updated_schema(self._schema, {
        "uris": {
            "canonicalUri": (
                '//ns0:header/ns0:identifier/node()',
                helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result)
            )
        }
    })
class ClinicalTrialsHarvester(XMLHarvester):
    """Harvester for ClinicalTrials.gov study records."""

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    # Set per-run from the first HTTP response's encoding (see harvest()).
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors": ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        # NOTE(review): path lacks the leading '//' used elsewhere -- confirm intended.
        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(datetime_formatter, single_result)),
        # Prefer the official title, falling back to the brief title.
        "title": ('//official_title/node()', '//brief_title/node()', lambda x, y: single_result(x) or single_result(y)),
        # NOTE(review): both paths are identical here, so the fallback is a no-op.
        "description": ('//brief_summary/textblock/node()', '//brief_summary/textblock/node()', lambda x, y: single_result(x) or single_result(y)),
        "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [{
            "sponsor": {
                "sponsorName": ("//sponsors/lead_sponsor/agency/node()", single_result)
            }
        }, {
            "sponsor": {
                "sponsorName": ("//sponsors/collaborator/agency/node()", single_result)
            }
        }],
        # NOTE(review): verificationDate and lastChanged appear twice below --
        # the duplicates look redundant.
        "otherProperties": build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            # Keep only element nodes, then serialize each to a plain dict.
            ('eligibility', ('//eligibility/node()', compose(lambda x: list(map(element_to_dict, x)), lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party', '//responsible_party/responsible_party_full_name/node()'))
    }

    @property
    def namespaces(self):
        # The study XML uses no namespaces.
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        # The search endpoint wants the window as MM/DD/YYYY components.
        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        # count is already an int here; the extra int() is redundant but harmless.
        if int(count) > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(
                len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    # Best-effort: skip this study after backing off.
                    logger.info('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml',
                }))
                official_count += 1
                count += 1
                # Progress log every 100 fetched studies.
                if count % 100 == 0:
                    logger.info("You've requested {} studies, keep going!".format(official_count))
                    count = 0

        return xml_list
# Fix: logging was used below (logging.getLogger) but never imported.
import logging
from datetime import date, timedelta

import xmltodict
from lxml import etree

from scrapi import requests
from scrapi import settings
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.helpers import (compose, single_result, build_properties,
                                 datetime_formatter, default_name_parser)

logger = logging.getLogger(__name__)

# Serialize an lxml element to XML text, then parse it into a plain dict.
element_to_dict = compose(xmltodict.parse, etree.tostring)


def non_string(item):
    """Return True when *item* is not a str (filters lxml text/element mixes)."""
    return not isinstance(item, str)


class ClinicalTrialsHarvester(XMLHarvester):
    # NOTE(review): only these class attributes are visible in this chunk;
    # the remainder of the class body presumably follows elsewhere.
    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None
class DataOneHarvester(XMLHarvester):
    """Harvester for the DataONE Solr search index."""

    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties': build_properties(
            ('authorGivenName', ("str[@name='authorGivenName']/node()")),
            ('authorSurName', ("str[@name='authorSurName']/node()")),
            ('authoritativeMN', ("str[@name='authoritativeMN']/node()")),
            ('checksum', ("str[@name='checksum']/node()")),
            ('checksumAlgorithm', ("str[@name='checksumAlgorithm']/node()")),
            ('datasource', ("str[@name='datasource']/node()")),
            ('datePublished', ("date[@name='datePublished']/node()")),
            ('dateUploaded', ("date[@name='dateUploaded']/node()")),
            ('pubDate', ("date[@name='pubDate']/node()")),
            ('updateDate', ("date[@name='updateDate']/node()")),
            ('fileID', ("str[@name='fileID']/node()")),
            ('formatId', ("str[@name='formatId']/node()")),
            ('formatType', ("str[@name='formatType']/node()")),
            ('identifier', ("str[@name='identifier']/node()")),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate', "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed', ("bool[@name='replicationAllowed']/node()")),
            ('numberReplicas', ("int[@name='numberReplicas']/node()")),
            ('preferredReplicationMN', "arr[@name='preferredReplicationMN']/str/node()"),
            ('rightsHolder', ("str[@name='rightsHolder']/node()")),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', ("long[@name='size']/node()")),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()"),
            ('sku', "str[@name='sku']/node()")),
        'freeToRead': {
            # Only public records get a start date (from dateModified).
            'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()",
                          lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()",
                         "arr[@name='origin']/str/node()",
                         "arr[@name='investigator']/str/node()",
                         process_contributors),
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()",
                 "arr[@name='resourceMap']/str/node()",
                 partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(datetime_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):
        """Fetch records modified in the window and keep only METADATA-type ones."""
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            # NOTE(review): serialization goes through ElementTree here (not
            # lxml.etree as in sibling harvesters) -- confirm the import.
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info('Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
            else:
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))
        return xml_list

    def get_records(self, start_date, end_date):
        '''Yield Solr <doc> elements for the date window.

        Issues a 1-row probe to learn the total hit count, then pages
        through the results 1000 rows at a time.
        '''
        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': 0,
            'rows': 1
        })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT, params={
                'q': query,
                'start': n,
                'rows': 1000
            })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
# NOTE(review): logging, xmltodict and lxml.etree are referenced below but not
# imported in this visible span -- presumably imported earlier in the file.
from scrapi import settings
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.helpers import (
    compose,
    single_result,
    build_properties,
    datetime_formatter,
    default_name_parser
)

logger = logging.getLogger(__name__)

# Serialize an lxml element to XML text, then parse it into a plain dict.
element_to_dict = compose(xmltodict.parse, etree.tostring)


def non_string(item):
    # True for non-str items; used to filter lxml text/element mixes.
    return not isinstance(item, str)


class ClinicalTrialsHarvester(XMLHarvester):
    # NOTE(review): only these class attributes are visible here; the class
    # body appears to continue beyond this chunk.
    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None
class PlosHarvester(XMLHarvester):
    """Harvester for the Public Library of Science search (Solr) API."""
    short_name = 'plos'
    long_name = 'Public Library of Science'
    url = 'http://www.plos.org/'

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'http://api.plos.org/search'

    def fetch_rows(self, start_date, end_date):
        """Yield every Solr <doc> element published between the given ISO dates."""
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

        # First request fetches only the result count (rows=0).
        resp = requests.get(self.BASE_URL, params={
            'q': query,
            'rows': '0',
            'api_key': PLOS_API_KEY,
        })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        # Page through the results MAX_ROWS_PER_REQUEST at a time.
        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL, throttle=5, params={
                'q': query,
                'start': current_row,
                'api_key': PLOS_API_KEY,
                'rows': self.MAX_ROWS_PER_REQUEST,
            })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST

    def harvest(self, start_date=None, end_date=None):
        """Return RawDocuments for rows that have an abstract or author list."""
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        # Without an API key there is nothing to harvest.
        if not PLOS_API_KEY:
            return []

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(row),
                'docID': row.xpath("str[@name='id']")[0].text,
            })
            for row in self.fetch_rows(start_date.isoformat(), end_date.isoformat())
            if row.xpath("arr[@name='abstract']")
            or row.xpath("str[@name='author_display']")
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//str[@name="id"]/node()',
                             compose('http://dx.doi.org/{}'.format, single_result)),
        },
        'contributors': ('//arr[@name="author_display"]/str/node()', default_name_parser),
        # Fix: the Solr field is 'publication_date'; it was previously
        # misspelled 'publication_data', which never matched anything.
        'providerUpdatedDateTime': ('//date[@name="publication_date"]/node()',
                                    compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ('//str[@name="title_display"]/node()', single_result),
        'description': ('//arr[@name="abstract"]/str/node()', single_result),
        'publisher': {
            'name': ('//str[@name="journal"]/node()', single_result)
        },
        'otherProperties': build_properties(
            ('eissn', '//str[@name="eissn"]/node()'),
            ('articleType', '//str[@name="article_type"]/node()'),
            ('score', '//float[@name="score"]/node()'))
    }
class ELifeHarvester(XMLHarvester):
    """Harvests eLife article XML out of the elife-article-xml GitHub repo."""

    short_name = 'elife'
    long_name = 'eLife Sciences'
    url = 'http://elifesciences.org/'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits?'
    BASE_COMMIT_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits/{}'
    BASE_DATA_URL = 'https://raw.githubusercontent.com/elifesciences/elife-article-xml/master/{}'

    def harvest(self, start_date=None, end_date=None):
        """Collect article XML changed by commits in the date window."""
        start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
        end_date = end_date or datetime.date.today()

        # Commits in the window -> changed file names (deduplicated).
        shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())

        files = list(set(chain.from_iterable([
            fetch_file_names(self.BASE_COMMIT_URL, sha)
            for sha in shas])))

        # Keep only XML article files.
        files = filter(lambda filename: filename.endswith('.xml'), files)

        xml_records = [
            fetch_xml(self.BASE_DATA_URL, filename)
            for filename in files
        ]

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(record),
                'docID': record.xpath('//article-id[@*]')[0].text,
            }) for record in xml_records
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result)),
            # NOTE(review): key is 'objectUri' (singular); sibling harvesters
            # use 'objectUris' -- confirm which the normalizer expects.
            'objectUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result))
        },
        'contributors': ('//article-meta/contrib-group/contrib/name/*[not(self::suffix)]/node()', elife_name_parser),
        'providerUpdatedDateTime': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()', compose(datetime_formatter, elife_date_parser)),
        'title': ('//article-meta/title-group/article-title//text()', collapse_list),
        # First paragraph of the abstract, skipping executive summaries.
        'description': ('//abstract[not(@abstract-type="executive-summary")]/p[1]//text()', collapse_list),
        'publisher': {
            'name': ('//publisher-name/node()', single_result)
        },
        'subjects': '//article-meta/article-categories/descendant::text()',
        'freeToRead': {
            'startDate': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()', elife_date_parser)
        },
        'tags': '//kwd/text()',
        'otherProperties': build_properties(
            ('rights', ('//permissions/license/license-p/ext-link/text()', single_result))
        )
    }