def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    url = furl(self.base_url)
    url.args['verb'] = 'ListRecords'
    url.args['metadataPrefix'] = 'oai_dc'
    url.args['from'] = start_date
    url.args['until'] = end_date

    records = self.get_records(url.url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath('ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(
            RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return rawdoc_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    url = furl(self.base_url)
    url.args['verb'] = 'ListRecords'
    url.args['metadataPrefix'] = 'oai_dc'
    url.args['from'] = start_date
    url.args['until'] = end_date

    records = self.get_records(url.url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath(
            'ns0:header/ns0:identifier',
            namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(RawDocument({
            'doc': record,
            'source': util.copy_to_unicode(self.short_name),
            'docID': util.copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return rawdoc_list
def process_raw(self, raw_doc):
    self.send_to_database(source=copy_to_unicode(raw_doc['source']),
                          docID=copy_to_unicode(raw_doc['docID']),
                          doc=six.text_type(raw_doc['doc']).encode('utf-8'),
                          filetype=copy_to_unicode(raw_doc['filetype']),
                          timestamps=raw_doc.get('timestamps', {})).save()
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    records_url = self.base_url + self.RECORDS_URL
    request_url = records_url + self.META_PREFIX_DATE.format(start_date, end_date)
    records = self.get_records(request_url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath(
            'ns0:header/ns0:identifier',
            namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(RawDocument({
            'doc': record,
            'source': util.copy_to_unicode(self.short_name),
            'docID': util.copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return rawdoc_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    records_url = self.base_url + self.RECORDS_URL
    request_url = records_url + self.META_PREFIX_DATE.format(
        start_date, end_date)
    records = self.get_records(request_url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath('ns0:header/ns0:identifier',
                              namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(
            RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return rawdoc_list
def process_raw(self, raw_doc):
    self.send_to_database(
        source=copy_to_unicode(raw_doc['source']),
        docID=copy_to_unicode(raw_doc['docID']),
        doc=six.text_type(raw_doc['doc']).encode('utf-8'),
        filetype=copy_to_unicode(raw_doc['filetype']),
        timestamps=raw_doc.get('timestamps', {})
    ).save()
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'
    url = base_url.format(start_date, end_date)

    data = requests.get(url)
    doc = etree.XML(data.content)

    records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

    xml_list = []
    for record in records:
        # Use a relative XPath so each record yields its own identifier,
        # not the first identifier in the whole document
        doc_id = record.xpath('.//OAI-PMH:header/OAI-PMH:identifier/node()',
                              namespaces=self.namespaces)[0]
        record = etree.tostring(record)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
    url = base_url.format('1', format_date_with_slashes(start_date), format_date_with_slashes(end_date))
    initial_data = requests.get(url)
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)

    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    # Re-request with all rows, keeping the same MM/DD/YYYY date format
    url = base_url.format(num_results,
                          format_date_with_slashes(start_date),
                          format_date_with_slashes(end_date))
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('records/record')

    xml_list = []
    for record in records:
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        # This ID is unique per data package, but won't unify multiple packages for the same project
        doc_id = record.xpath("str[@name='id']")[0].text
        format_type = record.xpath("str[@name='formatType']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        if format_type.lower() != 'metadata':
            logger.info(
                'Not normalizing record with ID {}, type {}'.format(
                    doc_id, format_type))
        else:
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

    return xml_list
def harvest(self, start_date=None, end_date=None): """ Return a list of RawDocuments """ start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) end_date = end_date or date.today() base_url = 'http://exporter.nih.gov/' table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/' # get ExPORTER page html and rows storing records html = requests.get(table_url).content soup = BeautifulSoup(html, 'lxml') table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData") rows = table.find_all('tr', class_="row_bg") urls = [i for i in construct_urls(base_url, start_date, end_date, rows)] return [ RawDocument({ 'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING), 'source': self.short_name, 'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]), 'filetype': 'xml' }) for record in xml_records(get_xml_files(urls)) ]
def harvest(self, start_date=None, end_date=None): """ Return a list of RawDocuments """ start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) end_date = end_date or date.today() base_url = 'http://exporter.nih.gov/' table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/' # get ExPORTER page html and rows storing records html = requests.get(table_url).content soup = BeautifulSoup(html, 'lxml') table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData") rows = table.find_all('tr', class_="row_bg") urls = [ i for i in construct_urls(base_url, start_date, end_date, rows) ] return [ RawDocument({ 'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING), 'source': self.short_name, 'docID': copy_to_unicode( record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]), 'filetype': 'xml' }) for record in xml_records(get_xml_files(urls)) ]
def harvest(self, start_date=None, end_date=None):
    ''' First, get a list of all recently updated study urls,
    then get the xml one by one and save it into a list
    of docs including other information '''

    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    # grab each of those urls for full content
    xml_list = []
    xml_base_url = self.canonical_base_url + '&view=xml'
    for dataset_id in self.query_by_date(start_date, end_date):
        try:
            item_url = str(xml_base_url).format(dataset_id)
            content = requests.get(item_url, throttle=2)
        except exceptions.ConnectionError as e:
            logger.info('Connection error: {}, wait a bit...'.format(e))
            time.sleep(30)
            content = requests.get(item_url)
        doc = etree.XML(content.content)
        record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)

        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(dataset_id),
            'filetype': 'xml',
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    ''' First, get a list of all recently updated study urls,
    then get the xml one by one and save it into a list
    of docs including other information '''

    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    # grab each of those urls for full content
    xml_list = []
    xml_base_url = self.canonical_base_url + '&view=xml'
    for dataset_id in self.query_by_date(start_date, end_date):
        try:
            item_url = str(xml_base_url).format(dataset_id)
            content = requests.get(item_url, throttle=2)
        except exceptions.ConnectionError as e:
            logger.info('Connection error: {}, wait a bit...'.format(e))
            time.sleep(30)
            content = requests.get(item_url)
        doc = etree.XML(content.content)
        record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)

        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(dataset_id),
                'filetype': 'xml',
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'
    url = base_url.format(start_date, end_date)

    data = requests.get(url)
    doc = etree.XML(data.content)

    records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

    xml_list = []
    for record in records:
        # Use a relative XPath so each record yields its own identifier,
        # not the first identifier in the whole document
        doc_id = record.xpath('.//OAI-PMH:header/OAI-PMH:identifier/node()',
                              namespaces=self.namespaces)[0]
        record = etree.tostring(record)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, days_back=1):
    today = date.today()
    start_date = today - timedelta(days_back)

    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
    url = base_url.format('1', start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
    initial_data = requests.get(url)
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)

    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    url = base_url.format(num_results, start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('records/record')

    xml_list = []
    for record in records:
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, days_back=1):
    records = self.get_records(days_back)

    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def get_ids(self, result, doc):
    """
    Gather the DOI and url from identifiers, if possible.
    Tries to save the DOI alone without a url extension.
    Tries to save a link to the original content at the source,
    instead of direct to a PDF, which is usually linked with
    viewcontent.cgi? in the url field
    """
    serviceID = doc.get('docID')
    url = 'http://core.tdar.org/document/' + serviceID.replace('oai:tdar.org:Resource:', '')

    return {
        'serviceID': serviceID,
        'url': util.copy_to_unicode(url),
        'doi': ''
    }
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        xml_list.append(
            RawDocument(
                {"doc": record, "source": self.short_name, "docID": copy_to_unicode(doc_id), "filetype": "xml"}
            )
        )

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        # This ID is unique per data package, but won't unify multiple packages for the same project
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

    data = requests.get(url)
    doc = etree.XML(data.content)

    records = doc.xpath('channel/item')

    xml_list = []
    for record in records:
        doc_id = parse_id_from_url(record.xpath('link/node()'))
        record = etree.tostring(record)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

    data = requests.get(url, force=True)
    doc = etree.XML(data.content)

    records = doc.xpath('channel/item')

    xml_list = []
    for record in records:
        doc_id = parse_id_from_url(record.xpath('link/node()'))
        record = etree.tostring(record)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        # This ID is unique per data package, but won't unify multiple packages for the same project
        doc_id = record.xpath("str[@name='id']")[0].text
        format_type = record.xpath("str[@name='formatType']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        if format_type.lower() != 'metadata':
            logger.info('Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
        else:
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def process_normalized(self, raw_doc, normalized):
    self.send_to_database(
        source=copy_to_unicode(raw_doc['source']),
        docID=copy_to_unicode(raw_doc['docID']),
        contributors=copy_to_unicode(json.dumps(normalized['contributors'])),
        description=copy_to_unicode(normalized.get('description')),
        uris=copy_to_unicode(json.dumps(normalized['uris'])),
        providerUpdatedDateTime=parse(normalized['providerUpdatedDateTime']).replace(tzinfo=None),
        freeToRead=copy_to_unicode(json.dumps(normalized.get('freeToRead', {}))),
        languages=normalized.get('language'),
        licenses=copy_to_unicode(json.dumps(normalized.get('licenseRef', []))),
        publisher=copy_to_unicode(json.dumps(normalized.get('publisher', {}))),
        sponsorships=copy_to_unicode(json.dumps(normalized.get('sponsorship', []))),
        title=copy_to_unicode(normalized['title']),
        version=copy_to_unicode(json.dumps(normalized.get('version', {}))),
        otherProperties=copy_to_unicode(json.dumps(normalized.get('otherProperties', {}))),
        shareProperties=copy_to_unicode(json.dumps(normalized['shareProperties']))
    ).save()
def harvest(self, start_date=None, end_date=None): """ First, get a list of all recently updated study urls, then get the xml one by one and save it into a list of docs including other information """ start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) end_date = end_date or date.today() end_month = end_date.strftime('%m') end_day = end_date.strftime('%d') end_year = end_date.strftime('%Y') start_month = start_date.strftime('%m') start_day = start_date.strftime('%d') start_year = start_date.strftime('%Y') base_url = 'http://clinicaltrials.gov/ct2/results?lup_s=' url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\ format(start_month, start_day, start_year, end_month, end_day, end_year) url = base_url + url_end # grab the total number of studies initial_request = requests.get(url) record_encoding = initial_request.encoding initial_request_xml = etree.XML(initial_request.content) count = int(initial_request_xml.xpath('//search_results/@count')[0]) xml_list = [] if int(count) > 0: # get a new url with all results in it url = url + '&count=' + str(count) total_requests = requests.get(url) initial_doc = etree.XML(total_requests.content) # make a list of urls from that full list of studies study_urls = [] for study in initial_doc.xpath('//clinical_study'): study_urls.append( study.xpath('url/node()')[0] + '?displayxml=true') # grab each of those urls for full content logger.info("There are {} urls to harvest - be patient...".format( len(study_urls))) count = 0 official_count = 0 for study_url in study_urls: try: content = requests.get(study_url) except requests.exceptions.ConnectionError as e: logger.info( 'Connection error: {}, wait a bit...'.format(e)) time.sleep(30) continue doc = etree.XML(content.content) record = etree.tostring(doc, encoding=record_encoding) doc_id = doc.xpath('//nct_id/node()')[0] xml_list.append( RawDocument({ 'doc': record, 'source': self.short_name, 'docID': copy_to_unicode(doc_id), 'filetype': 'xml', })) official_count += 1 count += 1 if count % 100 == 0: logger.info( "You've requested {} studies, keep going!".format( official_count)) count = 0 return xml_list
def harvest(self, start_date=None, end_date=None): """ First, get a list of all recently updated study urls, then get the xml one by one and save it into a list of docs including other information """ start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) end_date = end_date or date.today() end_month = end_date.strftime('%m') end_day = end_date.strftime('%d') end_year = end_date.strftime('%Y') start_month = start_date.strftime('%m') start_day = start_date.strftime('%d') start_year = start_date.strftime('%Y') base_url = 'http://clinicaltrials.gov/ct2/results?lup_s=' url_end = '{}%2F{}%2F{}%2F&lup_e={}%2F{}%2F{}&displayxml=true'.\ format(start_month, start_day, start_year, end_month, end_day, end_year) url = base_url + url_end # grab the total number of studies initial_request = requests.get(url) record_encoding = initial_request.encoding initial_request_xml = etree.XML(initial_request.content) count = int(initial_request_xml.xpath('//search_results/@count')[0]) xml_list = [] if int(count) > 0: # get a new url with all results in it url = url + '&count=' + str(count) total_requests = requests.get(url) initial_doc = etree.XML(total_requests.content) # make a list of urls from that full list of studies study_urls = [] for study in initial_doc.xpath('//clinical_study'): study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true') # grab each of those urls for full content logger.info("There are {} urls to harvest - be patient...".format(len(study_urls))) count = 0 official_count = 0 for study_url in study_urls: try: content = requests.get(study_url) except requests.exceptions.ConnectionError as e: logger.info('Connection error: {}, wait a bit...'.format(e)) time.sleep(30) continue doc = etree.XML(content.content) record = etree.tostring(doc, encoding=record_encoding) doc_id = doc.xpath('//nct_id/node()')[0] xml_list.append(RawDocument({ 'doc': record, 'source': self.short_name, 'docID': copy_to_unicode(doc_id), 'filetype': 'xml', })) official_count += 1 count += 1 if count % 100 == 0: logger.info("You've requested {} studies, keep going!".format(official_count)) count = 0 return xml_list
def test_copy_to_unicode(self):
    converted = util.copy_to_unicode('test')
    assert converted == u'test'
    assert isinstance(converted, unicode)
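# For context, a hypothetical sketch only: the project's real copy_to_unicode
# lives in its util module and is not reproduced here. This illustrates behavior
# consistent with the test above (the result is a unicode object equal to the
# input text), assuming a six-based Python 2/3 helper.
import six


def copy_to_unicode(element, encoding='utf-8'):
    # Decode byte strings and copy text strings so the result is always unicode
    if isinstance(element, six.binary_type):
        return element.decode(encoding)
    return six.text_type(element)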