def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it, and push it to the search index.

    Args:
        result: Raw document; must support ``.get("doc_id")`` and is handed
            unchanged to the registered normalizer.
        timestamp: Object exposing ``isoformat()``; forwarded to the
            normalizer and to ``process_docs.process``.
        registry: Mapping of ``manifest['directory']`` to a mapping that
            holds the callable under the ``'normalize'`` key.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        Whatever ``process_docs.process`` returned. When that is ``None``
        nothing is annotated or sent to search.
    """
    iso_timestamp = timestamp.isoformat()
    normalize = registry[manifest['directory']]['normalize']
    normalized = normalize(result, timestamp)
    logger.info('Document {0} normalized successfully'.format(result.get("doc_id")))

    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # Must be a plain string: a trailing comma after the format() call
        # would turn the assigned value into a one-element tuple.
        doc.attributes['location'] = "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
            source=manifest['directory'],
            doc_id=doc.get('id').get('service_id'),
            timestamp=doc.get('timestamp')
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'], result.get("doc_id"))
    return doc
def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it, and push it to the search index.

    Args:
        result: Raw document; must support ``.get("doc_id")`` and is handed
            unchanged to the registered normalizer.
        timestamp: Object exposing ``isoformat()``; forwarded to the
            normalizer and to ``process_docs.process``.
        registry: Mapping of ``manifest['directory']`` to a mapping that
            holds the callable under the ``'normalize'`` key.
        manifest: Mapping providing at least ``'directory'`` and ``'name'``.

    Returns:
        Whatever ``process_docs.process`` returned. When that is ``None``
        nothing is annotated or sent to search.
    """
    iso_timestamp = timestamp.isoformat()
    normalize = registry[manifest['directory']]['normalize']
    normalized = normalize(result, timestamp)
    logger.info('Document {0} normalized successfully'.format(
        result.get("doc_id")))

    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # Must be a plain string: a trailing comma after the format() call
        # would turn the assigned value into a one-element tuple.
        doc.attributes['location'] = "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
            source=manifest['directory'],
            doc_id=doc.get('id').get('service_id'),
            timestamp=doc.get('timestamp')
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(
            result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'], result.get("doc_id"))
    return doc
def setUp(self):
    """Prepare the fixture used by the tests.

    Calls ``search.delete_all('test')`` and then stores one fixed sample
    document through ``search.update`` with type ``'article'`` and id 38.
    """
    search.delete_all('test')

    index_name = "test"
    article_id = 38
    details = {
        'description': 'science stuff',
        'email': 'email stuff'
    }
    article = {
        'title': "TEST PROJECT",
        'contributors': ['Me, Myself', 'And I'],
        'properties': details,
        'meta': {},
        'id': article_id,
        'source': index_name,
        # Timestamp is taken at fixture-creation time.
        'iso_timestamp': datetime.datetime.now().isoformat()
    }
    search.update(index_name, article, 'article', article_id)
def migrate():
    """Re-send every archived ``normalized.json`` to search under 'scrapi'.

    First calls ``search.delete_all('scrapi')``; an
    ``ElasticHttpNotFoundError`` raised by that call is ignored. Then walks
    ``archive/`` and, for every directory that contains a
    ``normalized.json`` file, parses it as JSON and passes it to
    ``search.update`` together with the second and third '/'-separated
    components of the directory path. Files that fail to parse and updates
    that raise ``ElasticHttpError`` are logged and skipped.
    """
    try:
        search.delete_all('scrapi')
    except ElasticHttpNotFoundError:
        pass

    for current_dir, _subdirs, _files in os.walk('archive/'):
        json_path = current_dir + '/normalized.json'
        if not os.path.isfile(json_path):
            continue

        with open(json_path) as handle:
            try:
                document = json.load(handle)
            except ValueError as err:
                logger.exception(err)
                continue

            # presumably laid out as archive/<source>/<doc_id>/... — verify
            # against the code that writes the archive.
            parts = current_dir.split('/')
            try:
                search.update('scrapi', document, parts[1], parts[2])
            except ElasticHttpError as err:
                logger.exception(err)