def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    base_url = 'http://api.crossref.org/v1/works?filter=from-pub-date:{},until-pub-date:{}&rows={{}}&offset={{}}'.format(
        start_date.isoformat(), end_date.isoformat())

    total = requests.get(base_url.format('0', '0')).json()['message']['total-results']
    logger.info('{} documents to be harvested'.format(total))

    doc_list = []
    for i in xrange(0, total, 1000):
        records = requests.get(base_url.format(1000, i)).json()['message']['items']
        logger.info('Harvested {} documents'.format(i + len(records)))

        for record in records:
            doc_id = record['DOI']
            doc_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': doc_id,
                    'filetype': 'json'
                }))

    return doc_list
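# A minimal standalone sketch of the same rows/offset pagination against the public
# CrossRef works endpoint. Names such as crossref_page and page_size are illustrative;
# the harvester above wraps this pattern in its own class and settings.
import requests


def crossref_page(from_date, until_date, page_size=100, offset=0):
    """Fetch one page of works published between from_date and until_date (YYYY-MM-DD)."""
    url = 'http://api.crossref.org/v1/works'
    params = {
        'filter': 'from-pub-date:{},until-pub-date:{}'.format(from_date, until_date),
        'rows': page_size,
        'offset': offset,
    }
    message = requests.get(url, params=params).json()['message']
    return message['total-results'], message['items']

# Usage: a rows=0 request returns only the total, which drives the offset loop above.
total, _ = crossref_page('2015-03-01', '2015-03-02', page_size=0)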
def harvest(self, start_date=None, end_date=None): """ Return a list of RawDocuments """ start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) end_date = end_date or date.today() base_url = 'http://exporter.nih.gov/' table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/' # get ExPORTER page html and rows storing records html = requests.get(table_url).content soup = BeautifulSoup(html, 'lxml') table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData") rows = table.find_all('tr', class_="row_bg") urls = [ i for i in construct_urls(base_url, start_date, end_date, rows) ] return [ RawDocument({ 'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING), 'source': self.short_name, 'docID': copy_to_unicode( record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]), 'filetype': 'xml' }) for record in xml_records(get_xml_files(urls)) ]
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    delta = end_date - start_date
    date_strings = []
    for i in range(delta.days + 1):
        date_strings.append(start_date + timedelta(days=i))

    search_urls = []
    for adate in date_strings:
        self.URL.args['q'] = 'date:{}'.format(adate)
        search_urls.append(self.URL.url)

    records = self.get_records(search_urls)

    records_list = []
    for record in records:
        format_type = record['publisher']
        if format_type.lower() != "biomed central":
            if format_type.lower() != "springer":
                logger.info(
                    'Found non-springer source in springer api: {}'.format(format_type))
            records_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': record['identifier'],
                    'filetype': 'json'
                }))

    return records_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date if start_date else date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)

    search_url = '{0}{1}&dateEnd={2}'.format(
        self.URL,
        start_date.strftime('%m/%d/%Y'),
        end_date.strftime('%m/%d/%Y'))

    records = self.get_records(search_url)

    record_list = []
    for record in records:
        doc_id = record['id']
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': six.text_type(doc_id),
                'filetype': 'json'
            }))

    return record_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    query = furl.furl(self.URL)
    query.args['type'] = self.TYPE
    query.args['per_page'] = self.MAX_ITEMS_PER_REQUEST
    query.args['key'] = HARVARD_DATAVERSE_API_KEY
    query.args['sort'] = 'date'
    query.args['order'] = 'asc'
    query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

    records = self.get_records(query.url)

    record_list = []
    for record in records:
        doc_id = record['global_id']
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id,
                'filetype': 'json'
            }))

    return record_list
def consume(days_back=5):
    today = date.today()
    start_date = today - timedelta(days_back)
    url = OAI_DC_BASE_URL + '&metadataPrefix=oai_dc&from='
    # note: these granularity literals always compare equal, so the first branch is always taken
    if 'YYYY-MM-DDThh:mm:ssZ' == 'YYYY-MM-DDThh:mm:ssZ':
        url += str(start_date) + 'T00:00:00Z'
    elif 'YYYY-MM-DDThh:mm:ssZ' == 'YYYY-MM-DD hh:mm:ss':
        url += str(start_date) + ' 00:00:00'
    else:
        url += str(start_date)
    print(url)

    record_encoding = requests.get(url).encoding
    records = get_records(url)

    xml_list = []
    for record in records:
        set_spec = record.xpath('ns0:header/ns0:setSpec/node()', namespaces=NAMESPACES)[0]
        doc_id = record.xpath('ns0:header/ns0:identifier/node()', namespaces=NAMESPACES)[0]
        record_string = etree.tostring(record, encoding=record_encoding)
        xml_list.append(
            RawDocument({
                'doc': record_string,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    # Always harvest a 2 day period starting 2 days back to honor time given
    # to contributors to cancel a public registration
    start_date = start_date or date.today() - timedelta(4)
    end_date = end_date or date.today() - timedelta(2)

    search_url = self.URL.format(start_date.isoformat(), end_date.isoformat())
    records = self.get_records(search_url)

    record_list = []
    for record in records:
        doc_id = record['url'].replace('/', '')
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id,
                'filetype': 'json'
            }))

    return record_list
def harvest(self, start_date=None, end_date=None): """ Figshare should always have a 24 hour delay because they manually go through and check for test projects. Most of them are removed within 24 hours. So, we will shift everything back a day with harvesting to ensure nothing is harvested on the day of. """ start_date = start_date - timedelta(1) if start_date else date.today() - timedelta(1 + settings.DAYS_BACK) end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1) search_url = '{0}{1}&to_date={2}'.format( self.URL, start_date.isoformat(), end_date.isoformat() ) records = self.get_records(search_url) record_list = [] for record in records: doc_id = record['article_id'] record_list.append( RawDocument( { 'doc': json.dumps(record), 'source': self.short_name, 'docID': six.text_type(doc_id), 'filetype': 'json' } ) ) return record_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        # This ID is unique per data package, but won't unify multiple packages for the same project
        doc_id = record.xpath("str[@name='id']")[0].text
        format_type = record.xpath("str[@name='formatType']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        if format_type.lower() != 'metadata':
            logger.info(
                'Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
        else:
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
    end_date = end_date or datetime.date.today()

    shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())

    files = list(set(chain.from_iterable([
        fetch_file_names(self.BASE_COMMIT_URL, sha)
        for sha in shas])))

    files = filter(lambda filename: filename.endswith('.xml'), files)

    xml_records = [
        fetch_xml(self.BASE_DATA_URL, filename)
        for filename in files
    ]

    return [
        RawDocument({
            'filetype': 'xml',
            'source': self.short_name,
            'doc': etree.tostring(record),
            'docID': record.xpath('//article-id[@*]')[0].text,
        }) for record in xml_records
    ]
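# fetch_commits is defined elsewhere in this harvester's module. Below is a minimal
# sketch of what it might look like, under the assumption that BASE_URL points at a
# GitHub "list commits" endpoint (https://api.github.com/repos/{owner}/{repo}/commits);
# the helper name and parameters mirror the call above, but the body is illustrative.
import requests


def fetch_commits(base_url, start_date, end_date):
    """Return the SHAs of commits made between start_date and end_date (ISO 8601)."""
    shas = []
    page = 1
    while True:
        resp = requests.get(base_url, params={
            'since': start_date,
            'until': end_date,
            'page': page,
            'per_page': 100,
        })
        resp.raise_for_status()
        commits = resp.json()
        if not commits:
            break
        shas.extend(commit['sha'] for commit in commits)
        page += 1
    return shas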
def harvest(self, start_date=None, end_date=None):
    ''' First, get a list of all recently updated study URLs, then fetch
    the XML one by one and save it into a list of docs along with other
    information '''
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    # grab each of those urls for full content
    xml_list = []
    xml_base_url = self.canonical_base_url + '&view=xml'
    for dataset_id in self.query_by_date(start_date, end_date):
        try:
            item_url = str(xml_base_url).format(dataset_id)
            content = requests.get(item_url, throttle=2)
        except exceptions.ConnectionError as e:
            logger.info('Connection error: {}, wait a bit...'.format(e))
            time.sleep(30)
            content = requests.get(item_url)
        doc = etree.XML(content.content)

        record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(dataset_id),
                'filetype': 'xml',
            }))

    return xml_list
def consume(days_back=1):
    changes_url = 'http://resync.library.cornell.edu/arxiv-all/changelist.xml'
    changelist = requests.get(changes_url)
    record_encoding = changelist.encoding
    changeXML = etree.XML(changelist.content)

    urls_for_info = changeXML.xpath('//urlset:loc/node()', namespaces=NAMESPACES)
    export_base = 'http://export.arxiv.org/api/query?search_query='

    xml_list = []
    print len(urls_for_info)
    for url in urls_for_info:
        try:
            # matches everything after a slash then 4 numbers, a dot, 4 more numbers
            arxiv_id = re.search('(?<=/)\d{4}(\.)?\d{4}', url).group(0)
        except AttributeError:
            print 'Warning: malformed arxiv ID, skipping entry for {}'.format(url)
            continue
        export_url = export_base + arxiv_id
        record_request = requests.get(export_url)
        record_encoding = record_request.encoding
        record = etree.XML(record_request.content)
        xml_list.append(RawDocument({
            'doc': etree.tostring(record),
            'source': NAME,
            'docID': copy_to_unicode(arxiv_id),
            'filetype': 'xml'
        }))
        time.sleep(2)

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    records_url = self.base_url + self.RECORDS_URL
    request_url = records_url + self.META_PREFIX_DATE.format(start_date, end_date)

    records = self.get_records(request_url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath(
            'ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(
            RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return rawdoc_list
def consume(days_back=1, end_date=None, **kwargs):
    """A function for querying the SciTech Connect database for raw XML.
    The XML is chunked into smaller pieces, each representing data about
    an article/report. If there are multiple pages of results, this
    function iterates through all the pages."""
    TODAY = datetime.date.today()
    start_date = (TODAY - datetime.timedelta(days_back)).strftime('%m/%d/%Y')
    base_url = 'http://www.osti.gov/scitech/scitechxml'
    parameters = kwargs
    parameters['EntryDateFrom'] = start_date
    parameters['EntryDateTo'] = end_date
    parameters['page'] = 0
    morepages = 'true'
    xml_list = []
    elements_url = 'http://purl.org/dc/elements/1.1/'

    while morepages == 'true':
        xml = requests.get(base_url, params=parameters)
        record_encoding = xml.encoding
        xml = xml.text
        xml_root = etree.XML(xml.encode('utf-8'))
        for record in xml_root.find('records'):
            doc_id = record.find(str(etree.QName(elements_url, 'ostiId'))).text
            xml_list.append(RawDocument({
                'doc': etree.tostring(record, encoding=record_encoding),
                'docID': copy_to_unicode(doc_id),
                'source': NAME,
                'filetype': 'xml'
            }))
        parameters['page'] += 1
        morepages = xml_root.find('records').attrib['morepages']

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    if self.timezone_granularity:
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

    url = furl(self.base_url)
    url.args['verb'] = 'ListRecords'
    url.args['metadataPrefix'] = 'oai_dc'
    url.args['from'] = start_date
    url.args['until'] = end_date

    records = self.get_records(url.url, start_date, end_date)

    rawdoc_list = []
    for record in records:
        doc_id = record.xpath(
            'ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text
        record = etree.tostring(record, encoding=self.record_encoding)
        rawdoc_list.append(
            RawDocument({
                'doc': record,
                'source': util.copy_to_unicode(self.short_name),
                'docID': util.copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return rawdoc_list
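# get_records is provided by the OAI harvester base class. A minimal sketch of the
# OAI-PMH resumptionToken paging it presumably performs is below; it assumes only the
# standard protocol namespace (http://www.openarchives.org/OAI/2.0/), and the helper
# name is illustrative.
import requests
from lxml import etree

OAI_NS = {'ns0': 'http://www.openarchives.org/OAI/2.0/'}


def list_records(url):
    """Yield every <record> element, following resumptionTokens until exhausted."""
    while url:
        doc = etree.XML(requests.get(url).content)
        for record in doc.xpath('//ns0:record', namespaces=OAI_NS):
            yield record
        tokens = doc.xpath('//ns0:resumptionToken/text()', namespaces=OAI_NS)
        if tokens and tokens[0]:
            # A resumed request carries only the verb and the token.
            base = url.split('?')[0]
            url = '{}?verb=ListRecords&resumptionToken={}'.format(base, tokens[0])
        else:
            url = None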
def consume(days_back=0):
    start_date = TODAY - timedelta(days_back)
    oai_dc_request = OAI_DC_BASE_URL + \
        '&metadataPrefix=oai_dc&from={}'.format(str(start_date))
    record_encoding = requests.get(oai_dc_request).encoding

    # just for testing
    print 'oai_dc request: ' + oai_dc_request

    oai_records = get_records(oai_dc_request)
    records = oai_records
    print '{} records collected...'.format(len(records))

    xml_list = []
    for record in records:
        # skip records that have no contributors
        contributors = record.xpath('//dc:creator/node()', namespaces=NAMESPACES)
        if not contributors:
            continue
        doc_id = record.xpath('ns0:header/ns0:identifier/node()', namespaces=NAMESPACES)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': NAME,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    total = self.get_total(start_date, end_date)
    logger.info('{} documents to be harvested'.format(total))

    doc_list = []
    for i in xrange(0, total, 1000):
        uris = self.get_uris(start_date, end_date, 1000, i)
        records = self.get_records(uris, mapping.DOCUMENT_MAPPING)
        logger.info('Harvested {} documents'.format(i + len(records)))

        for record in records:
            if 'doi' in record:
                doc_id = record['doi']
            else:
                doc_id = record['uri']
            doc_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': doc_id,
                    'filetype': 'json'
                }))

    return doc_list
def harvest(self, days_back=1):
    return [
        RawDocument({
            'doc': str(TEST_XML_DOC),
            'source': 'test',
            'filetype': 'XML',
            'docID': "1"
        }) for _ in xrange(days_back)
    ]
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    return [
        RawDocument({
            'doc': json.dumps(record),
            'source': record['source'],
            'docID': record['docID'],
            'filetype': 'json'
        }) for record in self.get_records(start_date, end_date)
    ]
def harvest(self, days_back=1):
    return [
        RawDocument({
            'doc': TEST_XML_DOC,
            'source': 'test',
            'filetype': 'XML',
            'docID': "1",
            'timestamps': {
                'harvestFinished': '2015-03-14T17:05:48+00:00',
                'harvestStarted': '2015-03-14T17:05:48+00:00',
                'harvestTaskCreated': '2015-03-16T17:05:48+00:00'
            }
        }) for _ in xrange(days_back)
    ]
def consume(days_back=1):
    start_date = date.today() - timedelta(days_back)
    base_url = OAI_DC_BASE + '?verb=ListRecords&metadataPrefix=oai_dc&from='
    url = base_url + str(start_date) + 'T00:00:00Z'

    num_approved_records = 0
    num_rejected_records = 0
    approved_sets = []
    rejected_sets = []

    records = get_records(url)

    xml_list = []
    for record in records:
        set_spec = record.xpath('ns0:header/ns0:setSpec/node()', namespaces=NAMESPACES)[0]
        doc_id = record.xpath('ns0:header/ns0:identifier/node()', namespaces=NAMESPACES)[0]
        record_string = etree.tostring(record, encoding=record_encoding)
        if set_spec.replace('publication:', '') in series_name_list:
            approved_sets.append(set_spec)
            num_approved_records += 1
        else:
            rejected_sets.append(set_spec)
            num_rejected_records += 1
        xml_list.append(
            RawDocument({
                'doc': record_string,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    print "There were {} approved sets".format(num_approved_records)
    print "The records were from these approved sets: {}".format(set(approved_sets))
    print "There were {} rejected sets".format(num_rejected_records)
    print "The records were from these rejected sets: {}".format(set(rejected_sets))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    # TODO - stepic has no means of querying by date, we should add handling for the
    # start and end date once it does.
    search_url = self.URL
    records = self.get_records(search_url)

    record_list = []
    for record in records:
        doc_id = record['id']
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': ('stepic_doc' + str(doc_id)),
                'filetype': 'json'
            }))

    return record_list
def consume(days_back=1):
    doc = get_response(1, days_back)
    rows = doc.xpath("//result/@numFound")[0]
    doc = get_response(rows, days_back)
    records = doc.xpath('//doc')

    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=record_encoding)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None): """Returns a list of Rawdocuments (metadata) Searching by time is not supported by LWBIN CKAN API. all datasets have to be scanned each time. """ base_url = 'http://130.179.67.140/api/3/action/current_package_list_with_resources' records = requests.get(base_url).json()['result'] total = len(records) # Total number of documents logger.info('{} documents to be harvested'.format(total)) return [ RawDocument({ 'doc': json.dumps(record), 'source': self.short_name, 'docID': record['id'], 'filetype': 'json' }) for record in records ]
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    if not PLOS_API_KEY:
        return []

    return [
        RawDocument({
            'filetype': 'xml',
            'source': self.short_name,
            'doc': etree.tostring(row),
            'docID': row.xpath("str[@name='id']")[0].text,
        })
        for row in self.fetch_rows(start_date.isoformat(), end_date.isoformat())
        if row.xpath("arr[@name='abstract']") or row.xpath("str[@name='author_display']")
    ]
def harvest(self, start_date=None, end_date=None):
    api_url = self.url + 'api/collections/?format=json'
    record_list = []
    while api_url:
        records = requests.get(api_url).json()

        for record in records['results']:
            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': str(record['id']),
                    'filetype': 'json'
                }))

        api_url = records['next']

    return record_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    records = self.get_records(start_date, end_date)

    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        record = ElementTree.tostring(record, encoding=self.record_encoding)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    search_url = self.URL.format(start_date.isoformat(), end_date.isoformat())
    records = self.get_records(search_url)

    record_list = []
    for record in records:
        doc_id = record['url'].replace('/', '')
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id.decode('utf-8'),
                'filetype': 'json'
            }))

    return record_list
def harvest(self, start_date=None, end_date=None):
    # This API does not support date ranges
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    # days_back = the number of days between start_date and now, defaulting to settings.DAYS_BACK
    days_back = (date.today() - start_date).days

    search_url = '{0}mod_x_days={1}'.format(self.URL, days_back)

    record_list = []
    for record in self.get_records(search_url):
        doc_id = record['id']
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': six.text_type(doc_id),
                'filetype': 'json'
            }))

    return record_list
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)

    # Biomed central can only have a start date
    end_date = date.today()
    date_number = end_date - start_date

    search_url = self.URL.format(date_number.days)
    records = self.get_records(search_url)

    record_list = []
    for record in records:
        doc_id = record['arxId']
        record_list.append(
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id,
                'filetype': 'json'
            }))

    return record_list