class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors": ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(datetime_formatter, single_result)),
        "title": ('//official_title/node()', '//brief_title/node()',
                  lambda x, y: single_result(x) or single_result(y)),
        "description": ('//brief_summary/textblock/node()', '//brief_summary/textblock/node()',
                        lambda x, y: single_result(x) or single_result(y)),
        "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [
            {
                "sponsor": {
                    "sponsorName": ("//sponsors/lead_sponsor/agency/node()", single_result)
                }
            },
            {
                "sponsor": {
                    "sponsorName": ("//sponsors/collaborator/agency/node()", single_result)
                }
            }
        ],
        "otherProperties": build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            ('eligibility', ('//eligibility/node()',
                             compose(lambda x: list(map(element_to_dict, x)),
                                     lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party', '//responsible_party/responsible_party_full_name/node()')
        )
    }

    @property
    def namespaces(self):
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study URLs, then
        fetch the XML for each study and save it into a list of documents
        along with the other identifying information. """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.format(
            start_month, start_day, start_year, end_month, end_day, end_year
        )

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if count > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml',
                }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info("You've requested {} studies, keep going!".format(official_count))
                    count = 0

        return xml_list
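# A minimal, self-contained sketch of the date-window query that harvest() builds
# above: format the lup_s/lup_e parameters and read //search_results/@count from
# the XML response. The helper name `count_recent_studies` is hypothetical and not
# part of this module; it assumes the legacy ct2 XML interface is still reachable.
def count_recent_studies(start_date, end_date):
    """Return the number of studies updated between start_date and end_date."""
    url = ('http://clinicaltrials.gov/ct2/results?lup_s={}%2F{}%2F{}'
           '&lup_e={}%2F{}%2F{}&displayxml=true').format(
        start_date.strftime('%m'), start_date.strftime('%d'), start_date.strftime('%Y'),
        end_date.strftime('%m'), end_date.strftime('%d'), end_date.strftime('%Y'))
    resp = requests.get(url)
    return int(etree.XML(resp.content).xpath('//search_results/@count')[0])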
    'description': 'This study seeks to understand how humans impact'
                   ' the dietary patterns of eight free-ranging vervet monkey'
                   ' (Chlorocebus pygerythrus) groups in South Africa using stable'
                   ' isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()", "//dc:title/node()",
                    lambda x, y: single_result(x) + single_result(y).lower()))
    )
})

TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}

TEST_XML_DOC = b'''
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:dc="http://purl.org/dc/elements/1.1/"
         xmlns:dcq="http://purl.org/dc/terms/">
    <records count="97" morepages="true" start="1" end="10">
        <record rownumber="1">
class DataOneHarvester(XMLHarvester):
    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties': build_properties(
            ('authorGivenName', "str[@name='authorGivenName']/node()"),
            ('authorSurName', "str[@name='authorSurName']/node()"),
            ('authoritativeMN', "str[@name='authoritativeMN']/node()"),
            ('checksum', "str[@name='checksum']/node()"),
            ('checksumAlgorithm', "str[@name='checksumAlgorithm']/node()"),
            ('dataUrl', "str[@name='dataUrl']/node()"),
            ('datasource', "str[@name='datasource']/node()"),
            ('dateModified', "date[@name='dateModified']/node()"),
            ('datePublished', "date[@name='datePublished']/node()"),
            ('dateUploaded', "date[@name='dateUploaded']/node()"),
            ('pubDate', "date[@name='pubDate']/node()"),
            ('updateDate', "date[@name='updateDate']/node()"),
            ('fileID', "str[@name='fileID']/node()"),
            ('formatId', "str[@name='formatId']/node()"),
            ('formatType', "str[@name='formatType']/node()"),
            ('identifier', "str[@name='identifier']/node()"),
            ('investigator', "arr[@name='investigator']/str/node()"),
            ('origin', "arr[@name='origin']/str/node()"),
            ('isPublic', "bool[@name='isPublic']/node()"),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate', "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed', "bool[@name='replicationAllowed']/node()"),
            ('numberReplicas', "int[@name='numberReplicas']/node()"),
            ('preferredReplicationMN', "arr[@name='preferredReplicationMN']/str/node()"),
            ('resourceMap', "arr[@name='resourceMap']/str/node()"),
            ('rightsHolder', "str[@name='rightsHolder']/node()"),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', "long[@name='size']/node()"),
            ('sku', "str[@name='sku']/node()"),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()")
        ),
        'freeToRead': {
            'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()",
                          lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()",
                         "arr[@name='origin']/str/node()", process_contributors),
        'uris': {
            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()",
                             lambda x, y: y[0] if 'http' in single_result(y)
                             else x[0] if 'http' in single_result(x) else ''),
        },
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()",
                                    compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list

    def get_records(self, start_date, end_date):
        ''' Helper function to get a response from the DataONE API,
        with the specified number of rows.
        Returns an etree element with results. '''
        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': 0,
            'rows': 1
        })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT, params={
                'q': query,
                'start': n,
                'rows': 1000
            })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
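# A minimal sketch of the Solr paging pattern that get_records() uses above:
# ask once for the total hit count, then walk the result set in fixed-size
# windows via the `start` and `rows` parameters. The helper name
# `iter_solr_pages` and its page_size argument are hypothetical additions,
# not part of this module; it reuses the module-level DATAONE_SOLR_ENDPOINT.
def iter_solr_pages(query, page_size=1000):
    """Yield each <doc> element matching `query`, one Solr page at a time."""
    # rows=0 returns only the hit count in //result/@numFound
    first = requests.get(DATAONE_SOLR_ENDPOINT, params={'q': query, 'start': 0, 'rows': 0})
    total = int(etree.XML(first.content).xpath("//result/@numFound")[0])
    for start in range(0, total, page_size):
        page = requests.get(DATAONE_SOLR_ENDPOINT,
                            params={'q': query, 'start': start, 'rows': page_size})
        for doc in etree.XML(page.content).xpath('//doc'):
            yield doc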