Example #1
0
    def harvest(self, start_date=None, end_date=None):
        """Harvest DIF records from the Earth System Grid OAI-PMH endpoint.

        :param start_date: lower bound sent as ``from``; defaults to
            ``settings.DAYS_BACK`` days before today.
        :param end_date: upper bound sent as ``until``; defaults to today.
        :return: list of ``RawDocument``, one per ``OAI-PMH:record`` element
            found in the response.
        :raises IndexError: if a record has no header identifier.
        """
        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        # Both bounds are sent as midnight UTC timestamps.
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            # The path must be relative to the record: a leading '//' is
            # evaluated from the document root, which would hand every record
            # the identifier of the first record in the response.
            doc_id = record.xpath('OAI-PMH:header/OAI-PMH:identifier/node()',
                                  namespaces=self.namespaces)[0]
            xml_list.append(
                RawDocument({
                    'doc': etree.tostring(record),
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #2
0
def rename(docs, target=None, **kwargs):
    """Re-process every document in ``docs`` under the source name ``target``.

    For each document a copy of its raw attributes is made with ``source``
    replaced by ``target``. Unless the ``dry`` keyword is truthy, the copy is
    processed and normalized, and the entry stored under the old source name
    is deleted from both Elasticsearch indices (404 responses are ignored).

    :param docs: iterable of documents exposing ``raw.attributes``.
    :param target: new source name; required.
    :param kwargs: only ``dry`` is read here.
    :raises AssertionError: if ``target`` is falsy or equals a document's
        current source.
    """
    assert target, "To run this migration you need a target."

    dry_run = kwargs.get('dry')

    for doc in docs:
        attributes = copy.deepcopy(doc.raw.attributes)
        attributes['source'] = target
        raw = RawDocument(attributes, validate=False)

        old_source = doc.raw.attributes['source']
        assert old_source != target, "Can't rename {} to {}, names are the same.".format(
            doc.raw['source'], target)

        if not dry_run:
            tasks.process_raw(raw)
            normalized = tasks.normalize(raw, raw['source'])
            tasks.process_normalized(normalized, raw)
            logger.info('Processed document from {} with id {}'.format(
                old_source, raw['docID']))

            # Remove the copy stored under the old source name from the
            # configured index and from the 'share_v1' index.
            es_processor = get_processor('elasticsearch')
            for index_name in (settings.ELASTIC_INDEX, 'share_v1'):
                es_processor.manager.es.delete(
                    index=index_name,
                    doc_type=old_source,
                    id=raw['docID'],
                    ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(
            old_source, target, raw['docID']))
Example #3
0
    def harvest(self, start_date=None, end_date=None):
        """Harvest records from the OSTI pages XML endpoint.

        Two requests are made: the first asks for a single row only to read
        the ``count`` attribute of ``records``; the second asks for that many
        rows.

        :param start_date: lower bound sent as ``EntryDateFrom``; defaults to
            ``settings.DAYS_BACK`` days before today.
        :param end_date: upper bound sent as ``EntryDateTo``; defaults to
            today.
        :return: list of ``RawDocument``, one per ``records/record`` element.
        :raises IndexError: if the count attribute or a record's
            ``dc:ostiId`` is missing.
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        # Format the dates once so both requests send the same values; the
        # second request previously interpolated the raw date objects.
        date_from = format_date_with_slashes(start_date)
        date_to = format_date_with_slashes(end_date)

        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', date_from, date_to)
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, date_from, date_to)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            # Serialize with the encoding reported by the first response.
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #4
0
def raw_doc():
    """Build a minimal RawDocument populated with fixed placeholder values."""
    attributes = {
        'doc': 'bar',
        'docID': u'foo',
        'source': u'test',
        'filetype': u'xml',
    }
    return RawDocument(attributes)
Example #5
0
def rename(docs, target=None, **kwargs):
    """Re-process every document in ``docs`` under the source name ``target``.

    A new ``RawDocument`` is built from each document's fields with
    ``source`` replaced by ``target``. Unless the ``dry`` keyword is truthy,
    it is processed and normalized, and the entry stored under the old source
    name is deleted from both Elasticsearch indices (404 responses are
    ignored).

    :param docs: iterable of documents exposing ``doc``, ``docID``,
        ``source``, ``filetype``, ``timestamps`` and ``versions``.
    :param target: new source name; required.
    :param kwargs: only ``dry`` is read here.
    :raises AssertionError: if ``target`` is falsy or equals a document's
        current source.
    """
    assert target, "To run this migration you need a target."
    for doc in docs:
        raw = RawDocument({
            'doc': doc.doc,
            'docID': doc.docID,
            'source': target,
            'filetype': doc.filetype,
            'timestamps': doc.timestamps,
            'versions': doc.versions
        })

        assert doc.source != target, "Can't rename {} to {}, names are the same.".format(
            doc.source, target)

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(
                doc.source, raw['docID']))

            # Remove the copy stored under the old source name from the
            # configured index and from the 'share_v1' index.
            es.delete(index=settings.ELASTIC_INDEX,
                      doc_type=doc.source,
                      id=raw['docID'],
                      ignore=[404])
            es.delete(index='share_v1',
                      doc_type=doc.source,
                      id=raw['docID'],
                      ignore=[404])

        # This line runs in dry mode too, where nothing is deleted, so it
        # reports the rename rather than a deletion.
        logger.info('Renamed document from {} to {} with id {}'.format(
            doc.source, target, raw['docID']))
Example #6
0
 def harvest(self, days_back=1):
     """Build ``days_back`` identical RawDocuments wrapping TEST_XML_DOC."""
     documents = []
     for _ in xrange(days_back):
         documents.append(RawDocument({
             'doc': str(TEST_XML_DOC),
             'source': 'test',
             'filetype': 'XML',
             'docID': "1"
         }))
     return documents
Example #7
0
def raw_docs():
    """Build eleven RawDocuments numbered 0 through 10.

    Each document's body is ``six.binary_type`` of its number and its id is
    ``six.text_type`` of the same number.
    """
    documents = []
    for number in xrange(11):
        attributes = {
            'doc': six.binary_type(number),
            'docID': six.text_type(number),
            'source': u'test',
            'filetype': u'xml',
        }
        documents.append(RawDocument(attributes))
    return documents
Example #8
0
def raw_docs():
    """Build eleven RawDocuments numbered 0 through 10.

    Each document's body is the UTF-8 encoded decimal string of its number
    and its id is ``six.text_type`` of the same number.
    """
    def _build(number):
        # One document per number; body first, then id, as in the literal.
        return RawDocument({
            'doc': str(number).encode('utf-8'),
            'docID': six.text_type(number),
            'source': u'test',
            'filetype': u'xml',
        })

    return [_build(number) for number in xrange(11)]
Example #9
0
def raw_docs():
    """Build eleven RawDocuments numbered 0 through 10.

    Each document's body is ``str`` of its number and its id is ``unicode``
    of the same number.
    """
    collected = []
    for index in xrange(11):
        body = str(index)
        identifier = unicode(index)
        collected.append(RawDocument({
            'doc': body,
            'docID': identifier,
            'source': u'test',
            'filetype': u'xml',
        }))
    return collected
Example #10
0
def renormalize(doc, **kwargs):
    """Rebuild a RawDocument from ``doc`` and run normalization on it again.

    The RawDocument is always constructed; normalization and processing are
    skipped when the ``dry`` keyword is truthy.

    :param doc: object exposing ``doc``, ``docID``, ``source``, ``filetype``,
        ``timestamps`` and ``versions``.
    :param kwargs: only ``dry`` is read here.
    """
    copied_fields = ('doc', 'docID', 'source', 'filetype', 'timestamps',
                     'versions')
    raw = RawDocument({name: getattr(doc, name) for name in copied_fields})

    if kwargs.get('dry'):
        return

    normalized = tasks.normalize(raw, raw['source'])
    tasks.process_normalized(normalized, raw)
Example #11
0
 def to_raw(self, doc):
     """Wrap the fields of ``doc`` in a RawDocument.

     The RawDocument is created with ``validate=False`` and ``clean=False``.

     :param doc: object exposing ``doc``, ``docID``, ``source``,
         ``filetype`` and ``timestamps``.
     :return: the new ``RawDocument``.
     """
     attributes = {
         'doc': doc.doc,
         'docID': doc.docID,
         'source': doc.source,
         'filetype': doc.filetype,
         'timestamps': doc.timestamps
     }
     return RawDocument(attributes, validate=False, clean=False)
Example #12
0
    def harvest(self, start_date=None, end_date=None):
        """Return one canned RawDocument per record from ``get_records``.

        The ``start_date`` and ``end_date`` arguments are overwritten with a
        fixed window (2015-03-14 to 2015-03-16). The fetched records are only
        counted: every returned document carries TEST_OAI_DOC.
        """
        start_date, end_date = date(2015, 3, 14), date(2015, 3, 16)

        documents = []
        for _ in self.get_records(request_url, start_date, end_date):
            documents.append(RawDocument({
                'doc': TEST_OAI_DOC,
                'source': 'test',
                'filetype': 'XML',
                'docID': "1"
            }))
        return documents
Example #13
0
    def harvest(self, start_date=None, end_date=None):
        """Query the SciTech Connect database and return its raw XML records.

        Retrieval is delegated to ``self._fetch_records``; each record it
        yields is serialized on its own, so every RawDocument holds the data
        for a single article/report.

        :return: list of ``RawDocument`` whose ``docID`` is the record's
            first ``dc:ostiId`` node decoded as UTF-8.
        """
        documents = []
        for record in self._fetch_records(start_date, end_date):
            source, file_format = self.short_name, self.file_format
            serialized = etree.tostring(record)
            osti_ids = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)
            documents.append(RawDocument({
                'source': source,
                'filetype': file_format,
                'doc': serialized,
                'docID': osti_ids[0].decode('utf-8'),
            }))
        return documents
Example #14
0
 def harvest(self, days_back=1):
     """Return a single canned JSON RawDocument; ``days_back`` is not used."""
     payload = {
         'title': ['Test', 'subtitle'],
         'subtitle': 'This is a  test',
         'issued': {
             'date-parts': [['2015', '2', '2']]
         },
         'DOI': '10.10123/232ff',
         'URL': 'http://example.com',
         'author': [
             {'family': 'Testerson', 'given': 'Testy'},
             {'family': 'Testerson Jr', 'given': 'Test'},
         ],
         'subject': ['Testing'],
         'container-title': ['JSON tests'],
         'reference-count': '7',
         'update-policy': 'No',
         'deposited': {
             'timestamp': 'right now'
         },
         'trash': ''
     }
     document = RawDocument({
         'doc': str(json.dumps(payload)),
         'source': 'test',
         'filetype': 'json',
         'docID': '1'
     })
     return [document]
Example #15
0
    def harvest(self, mock_requests, start_date=None, end_date=None):
        """Save a canned response, then return one RawDocument per record.

        A ``HarvesterResponse`` holding TEST_OAI_DOC is saved under the
        lower-cased request URL before ``get_records`` is called. The
        ``start_date`` and ``end_date`` arguments are overwritten with a fixed
        window (2015-03-14 to 2015-03-16), and every returned document carries
        TEST_OAI_DOC.
        """
        request_url = 'http://validOAI.edu/?sonofaplumber'
        canned_response = requests.HarvesterResponse(
            ok=True,
            method='get',
            url=request_url.lower(),
            content=TEST_OAI_DOC,
            content_type="application/XML"
        )
        canned_response.save()

        start_date, end_date = date(2015, 3, 14), date(2015, 3, 16)

        documents = []
        for _ in self.get_records(request_url, start_date, end_date):
            documents.append(RawDocument({
                'doc': TEST_OAI_DOC,
                'source': 'test',
                'filetype': 'XML',
                'docID': "1"
            }))
        return documents
Example #16
0
    def harvest(self, start_date=None, end_date=None):
        """Harvest every item of the SSRN RSS feed as a RawDocument.

        ``start_date`` and ``end_date`` are accepted but not used: the feed
        URL is fixed and is requested with ``force=True``.

        :return: list of ``RawDocument``, one per ``channel/item`` element.
        """
        feed_url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'
        response = requests.get(feed_url, force=True)
        items = etree.XML(response.content).xpath('channel/item')

        documents = []
        for item in items:
            # The id is derived from the item's link nodes before the item
            # is serialized.
            item_id = parse_id_from_url(item.xpath('link/node()'))
            serialized = etree.tostring(item)
            documents.append(
                RawDocument({
                    'doc': serialized,
                    'source': self.short_name,
                    'docID': copy_to_unicode(item_id),
                    'filetype': 'xml'
                }))

        return documents
    def harvest(self, start_date=None, end_date=None):
        """Register a canned HTTP response, then return one RawDocument per record.

        The ``start_date`` and ``end_date`` arguments are overwritten with a
        fixed window (2015-03-14 to 2015-03-16). TEST_OAI_DOC is registered
        with httpretty as the GET response for the request URL before
        ``get_records`` is called, and every returned document carries
        ``str(TEST_OAI_DOC)``.
        """
        # Plain decimal literals: a leading zero (``03``) is an octal literal
        # on Python 2 and a SyntaxError on Python 3.
        start_date = date(2015, 3, 14)
        end_date = date(2015, 3, 16)

        request_url = 'http://validAI.edu/?from={}&to={}'.format(
            start_date, end_date)

        httpretty.register_uri(httpretty.GET,
                               request_url,
                               body=TEST_OAI_DOC,
                               content_type="application/XML")

        records = self.get_records(request_url, start_date, end_date)

        return [
            RawDocument({
                'doc': str(TEST_OAI_DOC),
                'source': 'test',
                'filetype': 'XML',
                'docID': "1"
            }) for record in records
        ]