def ingest(file_path):
    """Run the ingestor machinery against a single file.

    :param file_path: The file path.
    :type file_path: str
    :return: Tuple, the ingestor object, its data and detached ingestors data.
    :rtype: tuple
    """
    # A throw-away Manager with an empty context is enough for one-off runs.
    return Manager({}).ingest(file_path)
class TestCase(unittest.TestCase):
    """Base class for ingestor tests: wires a Manager to fake services."""

    def setUp(self):
        # Force tests to use fake configuration
        ingestors_settings.TESTING = True
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = "file"
        service_settings.ARCHIVE_PATH = mkdtemp()
        ftmstore_settings.DATABASE_URI = "sqlite://"
        redis_conn = get_fakeredis()
        test_job = Job.create(redis_conn, "test")
        ingest_stage = Stage(test_job, OP_INGEST)
        ftm_dataset = get_dataset(test_job.dataset.name, OP_INGEST)
        # Start from an empty ingest cache for every test.
        Tags("ingest_cache").delete()
        self.manager = Manager(ftm_dataset, ingest_stage, {})
        self.manager.entities = []
        # Patch emit/queue so emitted entities are captured in-process.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity"""
        # Reset captured entities and the dataset before each fixture run.
        self.manager.entities = []
        self.manager.dataset.delete()
        fixtures_dir = ensure_path(__file__).parent.joinpath("fixtures")
        path = fixtures_dir.joinpath(fixture_path)
        entity = self.manager.make_entity("Document")
        if not path.exists():
            # Fail loudly instead of producing confusing downstream errors.
            raise RuntimeError(path)
        if not path.is_file():
            # Directory fixture: identity is derived from its relative path.
            entity.make_id(fixture_path)
            return path, entity
        checksum = self.manager.store(path)
        entity.make_id(path.name, checksum)
        entity.set("contentHash", checksum)
        entity.set("fileSize", path.stat().st_size)
        entity.set("fileName", path.name)
        return path, entity

    def get_emitted(self, schema=None):
        """Return all emitted entities, optionally filtered by schema."""
        emitted = list(self.manager.dataset.iterate())
        if schema is None:
            return emitted
        return [ent for ent in emitted if ent.schema.is_a(schema)]

    def get_emitted_by_id(self, id):
        """Look up a single emitted entity by its identifier."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert the entity finished processing without errors."""
        self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
class TestCase(unittest.TestCase):
    """Base class for ingestor tests backed by fake redis/balkhash services."""

    def setUp(self):
        # Force tests to use fake configuration
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = 'file'
        service_settings.ARCHIVE_PATH = mkdtemp()
        balkhash_settings.BACKEND = 'LEVELDB'
        balkhash_settings.LEVELDB_PATH = mkdtemp()
        conn = get_fakeredis()
        job = Job.create(conn, 'test')
        stage = Stage(job, Stage.INGEST)
        self.manager = Manager(stage, {})
        self.manager.entities = []
        # Patch emit/queue so emitted entities are captured in-process.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity"""
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        cur_path = ensure_path(__file__).parent
        cur_path = cur_path.joinpath('fixtures')
        path = cur_path.joinpath(fixture_path)
        entity = self.manager.make_entity('Document')
        # Fail early with a clear error when the fixture is missing, instead
        # of an obscure failure later in store()/make_id().
        if not path.exists():
            raise RuntimeError(path)
        if path.is_file():
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set('contentHash', checksum)
            entity.set('fileSize', path.stat().st_size)
            entity.set('fileName', path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity

    def get_emitted(self, schema=None):
        """Return all emitted entities, optionally filtered by schema."""
        entities = list(self.manager.dataset.iterate())
        if schema is not None:
            entities = [e for e in entities if e.schema.is_a(schema)]
        return entities

    def get_emitted_by_id(self, id):
        """Look up a single emitted entity by its identifier."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert the entity finished processing without errors."""
        self.assertEqual(entity.first('processingStatus'),
                         self.manager.STATUS_SUCCESS)
def setUp(self):
    # Force tests to use fake configuration
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    balkhash_settings.BACKEND = 'LEVELDB'
    balkhash_settings.LEVELDB_PATH = mkdtemp()
    # Queue and manager run against an in-memory fake redis.
    self.queue = ServiceQueue(get_fakeredis(), ServiceQueue.OP_INGEST, 'test')
    self.manager = Manager(self.queue, {})
    self.manager.entities = []
    # Patch emit/queue so emitted entities are captured in-process.
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def setUp(self):
    # Force tests to use fake configuration
    ingestors_settings.TESTING = True
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    ftmstore_settings.DATABASE_URI = 'sqlite://'
    redis_conn = get_fakeredis()
    test_job = Job.create(redis_conn, 'test')
    ingest_stage = Stage(test_job, OP_INGEST)
    ftm_dataset = get_dataset(test_job.dataset.name, OP_INGEST)
    self.manager = Manager(ftm_dataset, ingest_stage, {})
    self.manager.entities = []
    # Patch emit/queue so emitted entities are captured in-process.
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def handle(self, task):
    """Ingest the entity described by a queued task.

    Builds a Manager from the task's stage and context, runs the ingest,
    and dispatches any emitted entities to the next stage.
    """
    manager = Manager(task.stage, task.context)
    entity = model.get_proxy(task.payload)
    log.debug("Ingest: %r", entity)
    try:
        manager.ingest_entity(entity)
    finally:
        # Always flush/close the manager, even when ingest raises,
        # matching the try/finally used by the _ingest variant.
        manager.close()
    self.dispatch_next(task, manager.emitted)
def _analyze(self, task):
    """Run the analyzer over the partial fragments of the given entities.

    Fragments arrive grouped by entity id; a fresh Analyzer is created
    whenever the id changes, and the previous one is flushed first.
    """
    entity_ids = task.payload.get('entity_ids')
    dataset = Manager.get_dataset(task.stage, task.context)
    current = None
    for fragment in dataset.partials(entity_id=entity_ids):
        if current is None or current.entity.id != fragment.id:
            # A new entity begins: flush results for the previous one.
            if current is not None:
                current.flush()
            current = Analyzer(dataset, fragment)
        current.feed(fragment)
    # Flush the trailing analyzer, if any fragments were seen at all.
    if current is not None:
        current.flush()
    return entity_ids
def _ingest(self, dataset, task):
    """Ingest the entity in the task payload and return what was emitted."""
    entity = model.get_proxy(task.payload)
    log.debug("Ingest: %r", entity)
    manager = Manager(dataset, task.stage, task.context)
    try:
        manager.ingest_entity(entity)
    finally:
        # Ensure the manager is flushed/closed even when ingest raises.
        manager.close()
    return manager.emitted
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a file or crawl a directory for ingest.

    :param db: Storage backend handed to the Manager.
    :param conn: Redis connection used to create the job.
    :param dataset: Name of the dataset the job belongs to.
    :param path: File or directory to ingest.
    :param languages: Optional list of language hints (default: empty).
    """
    # Avoid a mutable default argument; preserve the old [] behaviour.
    context = {'languages': languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    manager = Manager(
        ServiceQueue(get_redis(), ServiceQueue.OP_INGEST, dataset),
        {'languages': languages},
    )
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            # Single file: store it and queue a Document entity.
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            # Directory: recurse and queue every child.
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a file or crawl a directory for ingest.

    :param db: Storage backend handed to the Manager.
    :param conn: Redis connection used to create the job.
    :param dataset: Name of the dataset the job belongs to.
    :param path: File or directory to ingest.
    :param languages: Optional list of language hints (default: empty).
    """
    # Avoid a mutable default argument; preserve the old [] behaviour.
    context = {"languages": languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity("Document")
            checksum = manager.store(path)
            entity.set("contentHash", checksum)
            entity.make_id(checksum)
            entity.set("fileName", path.name)
            log.info("Queue: %r", entity.to_dict())
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def handle_task(cls, queue, payload, context):
    """Process one queued ingest task, with retry-and-reraise semantics.

    The payload is deserialized into an entity proxy and ingested via a
    Manager. On interrupts/system exits (and RuntimeError) the task is
    re-queued and the exception re-raised; on any other failure it is
    re-queued and the error logged without propagating.
    """
    # Acknowledge the task up front so the queue does not redeliver it
    # while it is being processed; handle_retry re-queues on failure.
    queue.task_done()
    try:
        manager = Manager(queue, context)
        entity = model.get_proxy(payload)
        log.debug("Ingest: %r", entity)
        manager.ingest_entity(entity)
        manager.close()
        cls.handle_done(queue)
    except (KeyboardInterrupt, SystemExit, RuntimeError):
        # Shutdown-type errors: retry later, then propagate so the
        # worker loop can terminate.
        cls.handle_retry(queue, payload, context)
        cls.handle_done(queue)
        raise
    except Exception:
        # Any other failure: retry later and log, but keep the worker alive.
        cls.handle_retry(queue, payload, context)
        cls.handle_done(queue)
        log.exception("Processing failed.")