def test_process_raw(self):
    """Processing a raw document must archive a raw.json file for it."""
    raw_file = RawDocument({
        'doc': json.dumps({'Hello': 'world'}),
        'source': "TEST",
        'doc_id': 37,
        'filetype': "json",
    })
    assert process_docs.process_raw(raw_file, 'test-version')

    # Walk the per-document archive tree and look for the raw.json artifact.
    archive_root = 'archive/TEST/{0}'.format(raw_file.get('doc_id'))
    found = any(
        os.path.isfile(dirname + '/raw.json')
        for dirname, _, _ in os.walk(archive_root)
    )
    assert found
def test_process_illegal(self):
    """Building a RawDocument without a doc_id must raise MissingAttributeError."""
    incomplete_attrs = {
        'doc': json.dumps({'Hello': 'world'}),
        'source': 'TEST',
        'filetype': 'json',
    }
    with self.assertRaises(MissingAttributeError):
        RawDocument(incomplete_attrs)
def test_process_legal(self):
    """A fully-specified document processes end to end: raw.json is archived
    under a timestamp directory, and a NormalizedDocument built from it
    produces a normalized.json sibling."""
    raw_doc = RawDocument({
        'doc': json.dumps({'Hello': 'world'}),
        'source': 'TEST',
        'doc_id': 37,
        'filetype': 'json',
    })
    ts = str(process_docs.process_raw(raw_doc, 'test-version'))

    # The raw file is archived in a leaf directory named by its timestamp;
    # recover that name and check it matches what process_raw reported.
    archive_root = 'archive/TEST/{0}'.format(raw_doc.get('doc_id'))
    timestamp = None
    for dirname, _, _ in os.walk(archive_root):
        if os.path.isfile(dirname + '/raw.json'):
            timestamp = dirname.split('/')[-1]
    assert timestamp == ts

    contributors = [
        {'full_name': 'Me, Myself', 'email': '*****@*****.**'},
        {'full_name': 'And I', 'email': '*****@*****.**'},
    ]
    doc = NormalizedDocument({
        'title': "TEST PROJECT",
        'contributors': contributors,
        'properties': {},
        'meta': {},
        'id': {
            'service_id': raw_doc.get('doc_id'),
            'doi': 'Not available',
            'url': 'fake.stuff.org/{}'.format(raw_doc.get('doc_id')),
        },
        'source': raw_doc.get('source'),
        'timestamp': str(timestamp),
        'tags': ['1', '2', '3'],
        'date_created': str(timestamp),
        'description': 'science stuff',
    })
    assert process_docs.process(doc, timestamp)

    found = any(
        os.path.isfile(dirname + '/normalized.json')
        for dirname, _, _ in os.walk(archive_root)
    )
    assert found
def consume():
    """Return a list of RawDocuments wrapping three canned test records.

    NOTE(review): 'doc' is handed over as a dict even though 'filetype'
    says 'json'; other consumers pass serialized strings — confirm that
    RawDocument/downstream processing accepts both forms.
    """
    records = [
        {'author': 'Darth Maul', 'title': 'Facepaint Tips Vol 2',
         'id': 1, 'abstract': 'A useful guide to facepaint.'},
        {'author': 'Cody Rhodes', 'title': 'Stardust: The Backstory',
         'id': 2, 'abstract': 'Look up to the cosmos! It is the neverending void!'},
        {'author': 'Shawn Michaels', 'title': 'Ducks',
         'id': 3, 'abstract': 'All about ducks.'},
    ]
    return [
        RawDocument({
            'doc': entry,
            'source': NAME,
            'doc_id': entry['id'],
            'filetype': 'json',
        })
        for entry in records
    ]
def check_archive(directory='', reprocess=False):
    """ Normalize every non-normalized document in the archive.

    Does a directory walk over the entire archive/ directory, and
    requests a normalized document for every raw file with no
    normalized neighbor.  Pass reprocess=True to re-normalize even
    documents that already have a normalized.json.
    """
    # Map each consumer's directory name to its parsed manifest so the
    # matching consumer module and file format can be looked up per service.
    manifests = {}
    for filename in os.listdir('worker_manager/manifests/'):
        manifest = _load_config('worker_manager/manifests/' + filename)
        manifests[manifest['directory']] = manifest
    for dirname, dirnames, filenames in os.walk('archive/' + directory):
        for filename in filenames:
            # Normalize when a raw file has no normalized.json sibling,
            # or unconditionally when reprocess is requested.
            if 'raw' in filename and (
                    not (os.path.isfile(dirname + '/normalized.json'))
                    or reprocess):
                # Archive layout assumed: archive/<service>/<doc_id>/<timestamp>/
                # — the leaf directory name is a %Y-%m-%d %H:%M:%S.%f timestamp
                # and path components 1 and 2 are service and doc_id.
                # NOTE(review): these fixed split indexes break if `directory`
                # adds extra path depth — confirm callers.
                timestamp = datetime.datetime.strptime(
                    dirname.split('/')[-1], '%Y-%m-%d %H:%M:%S.%f')
                service = dirname.split('/')[1]
                doc_id = dirname.split('/')[2]
                with open(os.path.join(dirname, filename), 'r') as f:
                    logger.info("worker_manager.consumers.{0}".format(
                        manifests[service]['directory']))
                    # Import the consumer module that produced this document
                    # so its normalizer registry can be used.
                    consumer_module = importlib.import_module(
                        'worker_manager.consumers.{0}'.format(
                            manifests[service]['directory']))
                    registry = consumer_module.registry
                    raw_file = RawDocument({
                        'doc': f.read(),
                        'doc_id': doc_id,
                        'source': service,
                        'filetype': manifests[service]['file-format'],
                    })
                    try:
                        _normalize(raw_file, timestamp, registry,
                                   manifests[service])
                    except MissingAttributeError as e:
                        # Log and continue: one malformed document must not
                        # stop the whole archive sweep.
                        logger.exception(e)
def consume(days_back=1):
    """Harvest PubMed Central records updated in the last *days_back* days.

    Queries the PMC OAI-PMH ListRecords endpoint starting *days_back*
    days before TODAY and wraps each harvested record in a RawDocument.

    :param days_back: number of days before TODAY for the 'from' filter
    :return: list of RawDocument instances with filetype 'xml'
    """
    base_url = "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=ListRecords"
    start_date = TODAY - timedelta(days_back)
    url = base_url + "&metadataPrefix=pmc&from={}".format(str(start_date))
    print(url)
    records = get_records(url)
    results_list = []
    for record in records:
        # BUG FIX: the previous '//ns0:identifier/node()' XPath was absolute,
        # so it searched from the document root — if records share one parsed
        # tree, every record would get the first identifier in the response.
        # './/' scopes the search to this record element.
        doc_id = record.xpath(".//ns0:identifier/node()",
                              namespaces=NAMESPACES)[0]
        record = etree.tostring(record)
        # NOTE(review): on Python 3, lxml's etree.tostring returns bytes and
        # this str concatenation would fail — confirm interpreter/serializer.
        record = '<?xml version="1.0" encoding="UTF-8"?>\n' + record
        results_list.append(
            RawDocument({
                'doc': record,
                'source': NAME,
                'doc_id': doc_id,
                'filetype': 'xml'
            }))
    return results_list  # a list of raw documents
def consume(days_back=3):
    """Harvest VTechWorks OAI-PMH records from the last *days_back* days.

    :param days_back: number of days before TODAY for the 'from' filter
    :return: list of RawDocument instances with filetype 'xml'
    """
    base_url = 'http://vtechworks.lib.vt.edu/oai/request?verb=ListRecords&metadataPrefix=oai_dc&from='
    # FIX: removed a dead assignment — start_date was first computed from
    # date.today() and then immediately overwritten by the line below.
    start_date = TODAY - timedelta(days_back)  # YYYY-MM-DD hh:mm:ss
    url = base_url + str(start_date) + ' 00:00:00'
    records = get_records(url)
    xml_list = []
    for record in records:
        doc_id = record.xpath('ns0:header/ns0:identifier',
                              namespaces=NAMESPACES)[0].text
        record = etree.tostring(record)
        # NOTE(review): etree.tostring returns bytes on Python 3 — confirm
        # this concatenation is running under an interpreter where it works.
        record = '<?xml version="1.0" encoding="UTF-8"?>\n' + record
        xml_list.append(RawDocument({
            'doc': record,
            'source': NAME,
            'doc_id': doc_id,
            'filetype': 'xml'
        }))
    return xml_list
def consume(days_back=10):
    """Harvest Columbia Academic Commons records from the last *days_back* days.

    Builds an OAI-PMH ListRecords query bounded by [today - days_back, today]
    and wraps every harvested record in a RawDocument.
    """
    base_url = 'http://academiccommons.columbia.edu/catalog/oai?verb=ListRecords&from={}&until={}'
    window_start = str(date.today() - timedelta(days_back)) + 'T00:00:00Z'
    window_end = str(date.today()) + 'T00:00:00Z'
    url = base_url.format(window_start, window_end) + '&metadataPrefix=oai_dc'
    print(url)

    harvested = []
    for record in get_records(url):
        identifier = record.xpath('ns0:header/ns0:identifier',
                                  namespaces=NAMESPACES)[0].text
        serialized = etree.tostring(record)
        payload = '<?xml version="1.0" encoding="UTF-8"?>\n' + serialized
        harvested.append(
            RawDocument({
                'doc': payload,
                'source': NAME,
                'doc_id': identifier,
                'filetype': 'xml'
            }))
    return harvested