def process():
    """Process and index the JSON document carried by the current request.

    The ``doc`` (a JSON-encoded string) and ``timestamp`` fields are read
    from the form data when the request method is ``POST`` and from the
    query arguments for any other method. The timestamp is stored on the
    decoded document under ``'timestamp'``, the document is handed to
    ``process_docs.process`` and then pushed to the ``'scrapi'`` index via
    ``search.update`` using its ``'source'`` and ``'id'`` entries.

    Returns:
        Whatever ``process_docs.process`` returned for the document.
    """
    # NOTE(review): ``request`` is presumably Flask's request proxy -- confirm.
    params = request.form if request.method == 'POST' else request.args

    doc = json.loads(params.get('doc'))
    timestamp = params.get('timestamp')
    doc['timestamp'] = timestamp

    result = process_docs.process(doc, timestamp)
    search.update('scrapi', doc, doc['source'], doc['id'])
    return result
def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it, and index the processed document.

    Args:
        result: Raw document; it is passed to the source's normalizer and
            must support ``.get("doc_id")``.
        timestamp: Object exposing ``isoformat()`` (presumably a
            ``datetime`` -- confirm against the caller).
        registry: Mapping from a source directory name to a mapping that
            holds that source's ``'normalize'`` callable.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        The value returned by ``process_docs.process``. When that value is
        ``None`` the document is neither annotated nor indexed.
    """
    iso_timestamp = timestamp.isoformat()
    normalized = registry[manifest['directory']]['normalize'](result, timestamp)
    logger.info('Document {0} normalized successfully'.format(result.get("doc_id")))

    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # The location must be a plain string. A trailing comma after the
        # format() call previously turned it into a one-element tuple.
        doc.attributes['location'] = (
            "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
                source=manifest['directory'],
                doc_id=doc.get('id').get('service_id'),
                timestamp=doc.get('timestamp'),
            )
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'], result.get("doc_id"))
    return doc
def test_process_legal(self):
    """Run a raw and a matching normalized document through ``process_docs``.

    Checks that after ``process_raw`` a ``raw.json`` exists in a directory
    whose name equals the stringified return value, and that after
    ``process`` a ``normalized.json`` exists somewhere under the same
    ``archive/TEST/<doc_id>`` tree.
    """
    raw_doc = RawDocument({
        'doc': json.dumps({'Hello': 'world'}),
        'source': 'TEST',
        'doc_id': 37,
        'filetype': 'json',
    })
    ts = str(process_docs.process_raw(raw_doc, 'test-version'))

    doc_id = raw_doc.get('doc_id')
    archive_root = 'archive/TEST/{0}'.format(doc_id)

    # The last directory (in walk order) that holds a raw.json names the timestamp.
    raw_dirs = [
        path for path, _subdirs, _files in os.walk(archive_root)
        if os.path.isfile(path + '/raw.json')
    ]
    timestamp = raw_dirs[-1].split('/')[-1] if raw_dirs else None
    assert timestamp == ts

    contributors = [
        {'full_name': 'Me, Myself', 'email': '*****@*****.**'},
        {'full_name': 'And I', 'email': '*****@*****.**'},
    ]
    identifiers = {
        'service_id': doc_id,
        'doi': 'Not available',
        'url': 'fake.stuff.org/{}'.format(doc_id),
    }
    doc = NormalizedDocument({
        'title': "TEST PROJECT",
        'contributors': contributors,
        'properties': {},
        'meta': {},
        'id': identifiers,
        'source': raw_doc.get('source'),
        'timestamp': str(timestamp),
        'tags': ['1', '2', '3'],
        'date_created': str(timestamp),
        'description': 'science stuff',
    })
    assert process_docs.process(doc, timestamp)

    # Any directory in the tree containing normalized.json is enough.
    found = any(
        os.path.isfile(path + '/normalized.json')
        for path, _subdirs, _files in os.walk(archive_root)
    )
    assert found
def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it, and index the processed document.

    Args:
        result: Raw document; it is passed to the source's normalizer and
            must support ``.get("doc_id")``.
        timestamp: Object exposing ``isoformat()`` (presumably a
            ``datetime`` -- confirm against the caller).
        registry: Mapping from a source directory name to a mapping that
            holds that source's ``'normalize'`` callable.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        The value returned by ``process_docs.process``. When that value is
        ``None`` the document is neither annotated nor indexed.
    """
    iso_timestamp = timestamp.isoformat()
    normalized = registry[manifest['directory']]['normalize'](result, timestamp)
    logger.info('Document {0} normalized successfully'.format(
        result.get("doc_id")))

    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # The location must be a plain string. A trailing comma after the
        # format() call previously turned it into a one-element tuple.
        doc.attributes['location'] = (
            "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
                source=manifest['directory'],
                doc_id=doc.get('id').get('service_id'),
                timestamp=doc.get('timestamp'),
            )
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(
            result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'],
                      result.get("doc_id"))
    return doc