def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()

    # The OAI-PMH endpoint expects full UTC timestamps.
    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'
    url = base_url.format(start_date, end_date)

    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

    xml_list = []
    for record in records:
        # Use a relative XPath so each record yields its own identifier; an absolute
        # ('//...') path would always match the first header in the whole document.
        doc_id = record.xpath('OAI-PMH:header/OAI-PMH:identifier/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def rename(docs, target=None, **kwargs): assert target, "To run this migration you need a target." for doc in docs: new_doc = copy.deepcopy(doc.raw.attributes) new_doc['source'] = target raw = RawDocument(new_doc, validate=False) assert doc.raw.attributes[ 'source'] != target, "Can't rename {} to {}, names are the same.".format( doc.raw['source'], target) if not kwargs.get('dry'): tasks.process_raw(raw) tasks.process_normalized(tasks.normalize(raw, raw['source']), raw) logger.info('Processed document from {} with id {}'.format( doc.raw.attributes['source'], raw['docID'])) es_processor = get_processor('elasticsearch') es_processor.manager.es.delete( index=settings.ELASTIC_INDEX, doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404]) es_processor.manager.es.delete( index='share_v1', doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404]) logger.info('Renamed document from {} to {} with id {}'.format( doc.raw.attributes['source'], target, raw['docID']))
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'

    # Request a single row first, just to learn how many records are available.
    url = base_url.format('1', format_date_with_slashes(start_date), format_date_with_slashes(end_date))
    initial_data = requests.get(url)
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)
    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    # Then fetch everything in one request, using the same slash-formatted dates as above.
    url = base_url.format(num_results, format_date_with_slashes(start_date), format_date_with_slashes(end_date))
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('records/record')

    xml_list = []
    for record in records:
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def raw_doc():
    return RawDocument({
        'doc': 'bar',
        'docID': u'foo',
        'source': u'test',
        'filetype': u'xml',
    })
def rename(docs, target=None, **kwargs): assert target, "To run this migration you need a target." for doc in docs: raw = RawDocument({ 'doc': doc.doc, 'docID': doc.docID, 'source': target, 'filetype': doc.filetype, 'timestamps': doc.timestamps, 'versions': doc.versions }) assert doc.source != target, "Can't rename {} to {}, names are the same.".format( doc.source, target) if not kwargs.get('dry'): tasks.process_raw(raw) tasks.process_normalized(tasks.normalize(raw, raw['source']), raw) logger.info('Processed document from {} with id {}'.format( doc.source, raw['docID'])) es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.source, id=raw['docID'], ignore=[404]) es.delete(index='share_v1', doc_type=doc.source, id=raw['docID'], ignore=[404]) logger.info('Deleted document from {} with id {}'.format( doc.source, raw['docID']))
def harvest(self, days_back=1):
    return [
        RawDocument({
            'doc': str(TEST_XML_DOC),
            'source': 'test',
            'filetype': 'XML',
            'docID': "1"
        }) for _ in xrange(days_back)
    ]
def raw_docs():
    return [
        RawDocument({
            'doc': six.binary_type(x),
            'docID': six.text_type(x),
            'source': u'test',
            'filetype': u'xml',
        }) for x in xrange(11)
    ]
def raw_docs():
    return [
        RawDocument({
            'doc': str(x).encode('utf-8'),
            'docID': six.text_type(x),
            'source': u'test',
            'filetype': u'xml',
        }) for x in xrange(11)
    ]
def raw_docs():
    return [
        RawDocument({
            'doc': str(x),
            'docID': unicode(x),
            'source': u'test',
            'filetype': u'xml',
        }) for x in xrange(11)
    ]
def renormalize(doc, **kwargs):
    raw = RawDocument({
        'doc': doc.doc,
        'docID': doc.docID,
        'source': doc.source,
        'filetype': doc.filetype,
        'timestamps': doc.timestamps,
        'versions': doc.versions
    })

    if not kwargs.get('dry'):
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
def to_raw(self, doc):
    return RawDocument({
        'doc': doc.doc,
        'docID': doc.docID,
        'source': doc.source,
        'filetype': doc.filetype,
        'timestamps': doc.timestamps
    }, validate=False, clean=False)
def harvest(self, start_date=None, end_date=None):
    start_date = date(2015, 3, 14)
    end_date = date(2015, 3, 16)

    # request_url is assumed to be defined elsewhere (e.g. a module-level test constant).
    records = self.get_records(request_url, start_date, end_date)

    return [
        RawDocument({
            'doc': TEST_OAI_DOC,
            'source': 'test',
            'filetype': 'XML',
            'docID': "1"
        }) for record in records
    ]
def harvest(self, start_date=None, end_date=None): """A function for querying the SciTech Connect database for raw XML. The XML is chunked into smaller pieces, each representing data about an article/report. If there are multiple pages of results, this function iterates through all the pages.""" return [ RawDocument({ 'source': self.short_name, 'filetype': self.file_format, 'doc': etree.tostring(record), 'docID': record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0].decode('utf-8'), }) for record in self._fetch_records(start_date, end_date) ]
def harvest(self, days_back=1):
    return [
        RawDocument({
            'doc': str(json.dumps({
                'title': ['Test', 'subtitle'],
                'subtitle': 'This is a test',
                'issued': {
                    'date-parts': [['2015', '2', '2']]
                },
                'DOI': '10.10123/232ff',
                'URL': 'http://example.com',
                'author': [
                    {'family': 'Testerson', 'given': 'Testy'},
                    {'family': 'Testerson Jr', 'given': 'Test'}
                ],
                'subject': ['Testing'],
                'container-title': ['JSON tests'],
                'reference-count': '7',
                'update-policy': 'No',
                'deposited': {
                    'timestamp': 'right now'
                },
                'trash': ''
            })),
            'source': 'test',
            'filetype': 'json',
            'docID': '1'
        })
    ]
def harvest(self, mock_requests, start_date=None, end_date=None):
    request_url = 'http://validOAI.edu/?sonofaplumber'

    requests.HarvesterResponse(
        ok=True,
        method='get',
        url=request_url.lower(),
        content=TEST_OAI_DOC,
        content_type="application/XML"
    ).save()

    start_date = date(2015, 3, 14)
    end_date = date(2015, 3, 16)

    records = self.get_records(request_url, start_date, end_date)

    return [
        RawDocument({
            'doc': TEST_OAI_DOC,
            'source': 'test',
            'filetype': 'XML',
            'docID': "1"
        }) for record in records
    ]
def harvest(self, start_date=None, end_date=None):
    url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

    data = requests.get(url, force=True)
    doc = etree.XML(data.content)
    records = doc.xpath('channel/item')

    xml_list = []
    for record in records:
        doc_id = parse_id_from_url(record.xpath('link/node()'))
        record = etree.tostring(record)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def harvest(self, start_date=None, end_date=None):
    start_date = date(2015, 3, 14)
    end_date = date(2015, 3, 16)

    request_url = 'http://validAI.edu/?from={}&to={}'.format(start_date, end_date)

    httpretty.register_uri(
        httpretty.GET,
        request_url,
        body=TEST_OAI_DOC,
        content_type="application/XML"
    )

    records = self.get_records(request_url, start_date, end_date)

    return [
        RawDocument({
            'doc': str(TEST_OAI_DOC),
            'source': 'test',
            'filetype': 'XML',
            'docID': "1"
        }) for record in records
    ]