コード例 #1
0
    def test_process_raw(self):
        """Process a raw document and check that raw.json reaches the archive.

        Passes when process_raw returns a truthy value and a file named
        raw.json exists in some directory under archive/TEST/<doc_id>.
        """
        payload = {
            'doc': json.dumps({'Hello':  'world'}),
            'source': "TEST",
            'doc_id': 37,
            'filetype': "json"
        }
        raw_file = RawDocument(payload)

        assert process_docs.process_raw(raw_file, 'test-version')

        archive_root = 'archive/TEST/{0}'.format(raw_file.get('doc_id'))
        # Any directory below the document's archive root may hold the file.
        found = any(
            os.path.isfile(folder + '/raw.json')
            for folder, _subdirs, _files in os.walk(archive_root)
        )
        assert found
コード例 #2
0
 def test_process_illegal(self):
     """Expect MissingAttributeError for a RawDocument payload without 'doc_id'.

     The payload carries 'doc', 'source' and 'filetype' only.
     """
     incomplete_payload = {
         'doc': json.dumps({'Hello': 'world'}),
         'source': 'TEST',
         'filetype': 'json'
     }
     with self.assertRaises(MissingAttributeError):
         RawDocument(incomplete_payload)
コード例 #3
0
    def test_process_legal(self):
        """Exercise raw processing followed by normalized processing.

        First checks that the value returned by process_raw (as a string)
        equals the name of the archive directory that holds raw.json. Then
        passes a NormalizedDocument to process and checks that a
        normalized.json exists under the same document directory.
        """
        raw_doc = RawDocument({
            'doc': json.dumps({'Hello': 'world'}),
            'source': 'TEST',
            'doc_id': 37,
            'filetype': 'json'
        })
        ts = str(process_docs.process_raw(raw_doc, 'test-version'))

        doc_id = raw_doc.get('doc_id')
        archive_root = 'archive/TEST/{0}'.format(doc_id)

        # If several directories contain raw.json, the last one walked wins.
        raw_dirs = [
            folder
            for folder, _subdirs, _files in os.walk(archive_root)
            if os.path.isfile(folder + '/raw.json')
        ]
        timestamp = raw_dirs[-1].split('/')[-1] if raw_dirs else None
        assert timestamp == ts

        contributors = [
            {
                'full_name': 'Me, Myself',
                'email': '*****@*****.**'
            },
            {
                'full_name': 'And I',
                'email': '*****@*****.**'
            }
        ]
        identifiers = {
            'service_id': doc_id,
            'doi': 'Not available',
            'url': 'fake.stuff.org/{}'.format(doc_id),
        }
        doc = NormalizedDocument({
            'title': "TEST PROJECT",
            'contributors': contributors,
            'properties': {},
            'meta': {},
            'id': identifiers,
            'source': raw_doc.get('source'),
            'timestamp': str(timestamp),
            'tags': ['1', '2', '3'],
            'date_created': str(timestamp),
            'description': 'science stuff',
        })

        assert process_docs.process(doc, timestamp)

        found = any(
            os.path.isfile(folder + '/normalized.json')
            for folder, _subdirs, _files in os.walk(archive_root)
        )
        assert found
コード例 #4
0
ファイル: consumer.py プロジェクト: csheldonhess/scrapi
def consume():
    """Wrap a fixed set of sample records as RawDocument objects.

    Returns:
        A list with one RawDocument per sample record, in listing order.
        Each document uses the record dict as 'doc', the record's 'id' as
        'doc_id', NAME as 'source' and 'json' as 'filetype'.
    """
    records = [
        {'author': 'Darth Maul', 'title': 'Facepaint Tips Vol 2', 'id': 1, 'abstract': 'A useful guide to facepaint.'},
        {'author': 'Cody Rhodes', 'title': 'Stardust: The Backstory', 'id': 2, 'abstract': 'Look up to the cosmos! It is the neverending void!'},
        {'author': 'Shawn Michaels', 'title': 'Ducks', 'id': 3, 'abstract': 'All about ducks.'},
    ]

    return [
        RawDocument({
            'doc': entry,
            'source': NAME,
            'doc_id': entry['id'],
            'filetype': 'json'
        })
        for entry in records
    ]
コード例 #5
0
ファイル: celerytasks.py プロジェクト: csheldonhess/scrapi
def check_archive(directory='', reprocess=False):
    """Request normalization for raw files in the archive.

    Loads every manifest under worker_manager/manifests/, keyed by its
    'directory' entry, then walks archive/<directory>. Each file whose name
    contains 'raw' is normalized when its directory has no normalized.json,
    or unconditionally when ``reprocess`` is true.

    Args:
        directory: sub-path below archive/ to walk; '' walks all of it.
        reprocess: when true, normalize even if normalized.json exists.
    """
    manifests = {}
    for manifest_name in os.listdir('worker_manager/manifests/'):
        manifest = _load_config('worker_manager/manifests/' + manifest_name)
        manifests[manifest['directory']] = manifest

    for dirname, _subdirs, filenames in os.walk('archive/' + directory):
        for filename in filenames:
            if 'raw' not in filename:
                continue
            already_normalized = os.path.isfile(dirname + '/normalized.json')
            if already_normalized and not reprocess:
                continue

            # Path components: [0] archive root, [1] service, [2] document
            # id, last component is the timestamp directory.
            parts = dirname.split('/')
            timestamp = datetime.datetime.strptime(
                parts[-1], '%Y-%m-%d %H:%M:%S.%f')
            service = parts[1]
            doc_id = parts[2]

            with open(os.path.join(dirname, filename), 'r') as f:
                service_manifest = manifests[service]
                module_path = 'worker_manager.consumers.{0}'.format(
                    service_manifest['directory'])
                logger.info(module_path)
                consumer_module = importlib.import_module(module_path)
                registry = consumer_module.registry

                raw_text = f.read()
                raw_file = RawDocument({
                    'doc': raw_text,
                    'doc_id': doc_id,
                    'source': service,
                    'filetype': service_manifest['file-format'],
                })
                try:
                    _normalize(raw_file, timestamp, registry,
                               service_manifest)
                except MissingAttributeError as e:
                    # Log and move on so one bad document does not stop
                    # the rest of the walk.
                    logger.exception(e)
コード例 #6
0
def consume(days_back=1):
    """Fetch recent PubMed Central OAI records and wrap them as RawDocuments.

    Builds a ListRecords URL with the 'pmc' metadata prefix and a ``from``
    date of TODAY minus ``days_back`` days, prints it, and converts every
    record returned by get_records into a RawDocument.

    Args:
        days_back: number of days before TODAY to use as the ``from`` date.

    Returns:
        A list of RawDocument objects with 'filetype' set to 'xml'.
    """
    base_url = "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=ListRecords"
    start_date = TODAY - timedelta(days_back)
    url = base_url + "&metadataPrefix=pmc&from={}".format(str(start_date))
    print(url)

    records = get_records(url)
    results_list = []
    for record in records:
        # The leading '.' keeps the search inside this record. An absolute
        # '//' path is evaluated from the document root, which would give
        # every record in a shared tree the same (first) identifier.
        doc_id = record.xpath(".//ns0:identifier/node()",
                              namespaces=NAMESPACES)[0]
        record = etree.tostring(record)
        # Prepend an XML declaration to the serialized record.
        record = '<?xml version="1.0" encoding="UTF-8"?>\n' + record
        results_list.append(
            RawDocument({
                'doc': record,
                'source': NAME,
                'doc_id': doc_id,
                'filetype': 'xml'
            }))
    return results_list  # a list of raw documents
コード例 #7
0
def consume(days_back=3):
    """Fetch recent VTechWorks OAI records and wrap them as RawDocuments.

    Builds a ListRecords URL with the 'oai_dc' metadata prefix and a
    ``from`` value of TODAY minus ``days_back`` days at 00:00:00, then
    converts every record returned by get_records into a RawDocument.

    Args:
        days_back: number of days before TODAY to use as the ``from`` date.

    Returns:
        A list of RawDocument objects with 'filetype' set to 'xml'.
    """
    base_url = 'http://vtechworks.lib.vt.edu/oai/request?verb=ListRecords&metadataPrefix=oai_dc&from='
    # A previous str(date.today() - ...) assignment was overwritten by this
    # line before use, so only this computation is kept.
    start_date = TODAY - timedelta(days_back)
    # YYYY-MM-DD hh:mm:ss
    url = base_url + str(start_date) + ' 00:00:00'

    records = get_records(url)

    xml_list = []
    for record in records:
        doc_id = record.xpath('ns0:header/ns0:identifier', namespaces=NAMESPACES)[0].text
        record = etree.tostring(record)
        # Prepend an XML declaration to the serialized record.
        record = '<?xml version="1.0" encoding="UTF-8"?>\n' + record
        xml_list.append(RawDocument({
            'doc': record,
            'source': NAME,
            'doc_id': doc_id,
            'filetype': 'xml'
        }))

    return xml_list
コード例 #8
0
def consume(days_back=10):
    """Fetch recent Columbia Academic Commons OAI records as RawDocuments.

    Requests ListRecords with the 'oai_dc' metadata prefix for the window
    from ``days_back`` days ago (midnight, 'Z' suffix) until today
    (midnight, 'Z' suffix), prints the URL, and wraps each record returned
    by get_records.

    Args:
        days_back: size of the harvest window in days.

    Returns:
        A list of RawDocument objects with 'filetype' set to 'xml'.
    """
    window_start = str(date.today() - timedelta(days_back)) + 'T00:00:00Z'
    window_end = str(date.today()) + 'T00:00:00Z'
    url_template = 'http://academiccommons.columbia.edu/catalog/oai?verb=ListRecords&from={}&until={}'
    url = url_template.format(window_start, window_end) + '&metadataPrefix=oai_dc'
    print(url)

    xml_list = []
    for element in get_records(url):
        identifier = element.xpath('ns0:header/ns0:identifier',
                                   namespaces=NAMESPACES)[0].text
        # Serialize the record and put an XML declaration in front of it.
        serialized = '<?xml version="1.0" encoding="UTF-8"?>\n' + etree.tostring(element)
        raw_document = RawDocument({
            'doc': serialized,
            'source': NAME,
            'doc_id': identifier,
            'filetype': 'xml'
        })
        xml_list.append(raw_document)

    return xml_list