class TestCase(unittest.TestCase):
    """Base class for ingestor tests that runs against fake configuration.

    ``setUp`` overrides the service settings, builds a ``Manager`` for a
    throwaway ``"test"`` job and replaces its ``emit_entity`` and
    ``queue_entity`` methods with the ``emit_entity`` / ``queue_entity``
    functions defined outside this class.
    """

    def setUp(self):
        """Reset settings and build a fresh manager and archive for a test."""
        # Local import: only the scratch-directory cleanup below needs it.
        import shutil

        # Force tests to use fake configuration
        ingestors_settings.TESTING = True
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = "file"
        archive_path = mkdtemp()
        # Without this every test left its scratch archive directory behind.
        # Cleanups registered here also run when the rest of setUp fails.
        self.addCleanup(shutil.rmtree, archive_path, ignore_errors=True)
        service_settings.ARCHIVE_PATH = archive_path
        ftmstore_settings.DATABASE_URI = "sqlite://"
        conn = get_fakeredis()
        job = Job.create(conn, "test")
        stage = Stage(job, OP_INGEST)
        dataset = get_dataset(job.dataset.name, OP_INGEST)
        Tags("ingest_cache").delete()
        self.manager = Manager(dataset, stage, {})
        self.manager.entities = []
        # Bind the replacement functions as methods of this manager instance.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity

        ``fixture_path`` is resolved relative to the ``fixtures`` directory
        next to this file. Raises ``RuntimeError`` (carrying the resolved
        path) when nothing exists at that location.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        cur_path = ensure_path(__file__).parent
        cur_path = cur_path.joinpath("fixtures")
        path = cur_path.joinpath(fixture_path)
        entity = self.manager.make_entity("Document")
        if not path.exists():
            raise RuntimeError(path)
        if path.is_file():
            # Files are stored first; the returned checksum is part of the ID.
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set("contentHash", checksum)
            entity.set("fileSize", path.stat().st_size)
            entity.set("fileName", path.name)
        else:
            # Anything that is not a regular file is identified by the
            # relative fixture path alone.
            entity.make_id(fixture_path)
        return path, entity

    def get_emitted(self, schema=None):
        """Return the entities in the manager's dataset as a list.

        When ``schema`` is given, only entities for which
        ``entity.schema.is_a(schema)`` is true are kept.
        """
        entities = list(self.manager.dataset.iterate())
        if schema is not None:
            entities = [e for e in entities if e.schema.is_a(schema)]
        return entities

    def get_emitted_by_id(self, id):
        """Return whatever the manager's dataset holds under ``id``."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert the entity's first ``processingStatus`` is the success status."""
        self.assertEqual(
            entity.first("processingStatus"), self.manager.STATUS_SUCCESS
        )
class TestCase(unittest.TestCase):
    """Base class for ingestor tests that runs against fake configuration.

    ``setUp`` overrides the service and balkhash settings, builds a
    ``Manager`` for a throwaway ``'test'`` job and replaces its
    ``emit_entity`` and ``queue_entity`` methods with the ``emit_entity`` /
    ``queue_entity`` functions defined outside this class.
    """

    def setUp(self):
        """Reset settings and build a fresh manager and archive for a test."""
        # Local import: only the scratch-directory cleanup below needs it.
        import shutil

        # Force tests to use fake configuration
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = 'file'
        archive_path = mkdtemp()
        leveldb_path = mkdtemp()
        # Without this every test left two scratch directories behind.
        # Cleanups registered here also run when the rest of setUp fails.
        self.addCleanup(shutil.rmtree, archive_path, ignore_errors=True)
        self.addCleanup(shutil.rmtree, leveldb_path, ignore_errors=True)
        service_settings.ARCHIVE_PATH = archive_path
        balkhash_settings.BACKEND = 'LEVELDB'
        balkhash_settings.LEVELDB_PATH = leveldb_path
        conn = get_fakeredis()
        job = Job.create(conn, 'test')
        stage = Stage(job, Stage.INGEST)
        self.manager = Manager(stage, {})
        self.manager.entities = []
        # Bind the replacement functions as methods of this manager instance.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity

        ``fixture_path`` is resolved relative to the ``fixtures`` directory
        next to this file. The path is not checked for existence: anything
        that is not a regular file gets an ID made from ``fixture_path``.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        cur_path = ensure_path(__file__).parent
        cur_path = cur_path.joinpath('fixtures')
        path = cur_path.joinpath(fixture_path)
        entity = self.manager.make_entity('Document')
        if path.is_file():
            # Files are stored first; the returned checksum is part of the ID.
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set('contentHash', checksum)
            entity.set('fileSize', path.stat().st_size)
            entity.set('fileName', path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity

    def get_emitted(self, schema=None):
        """Return the entities in the manager's dataset as a list.

        When ``schema`` is given, only entities for which
        ``entity.schema.is_a(schema)`` is true are kept.
        """
        entities = list(self.manager.dataset.iterate())
        if schema is not None:
            entities = [e for e in entities if e.schema.is_a(schema)]
        return entities

    def get_emitted_by_id(self, id):
        """Return whatever the manager's dataset holds under ``id``."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert the entity's first ``processingStatus`` is the success status."""
        self.assertEqual(
            entity.first('processingStatus'), self.manager.STATUS_SUCCESS
        )
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a single file, or crawl a directory, for ingest.

    A job is created on ``conn`` for ``dataset`` and its ``OP_INGEST`` stage
    is handed to a ``Manager`` together with ``db``.

    Args:
        db: Passed straight through to ``Manager``.
        conn: Connection used to create the job.
        dataset: Dataset the job is created for.
        path: Location to ingest; normalised with ``ensure_path``. Nothing
            is queued when that yields ``None``.
        languages: Language hints stored in the manager context under
            ``'languages'``. Defaults to a new empty list.
    """
    # The former ``languages=[]`` default was a single list object shared by
    # every call and stored into each context; build a fresh one instead.
    if languages is None:
        languages = []
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity('Document')
                # The stored file's checksum doubles as the entity ID seed.
                checksum = manager.store(path)
                entity.set('contentHash', checksum)
                entity.make_id(checksum)
                entity.set('fileName', path.name)
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even when storing, queueing or crawling raises.
        manager.close()
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest.

    Args:
        path: File or directory to ingest; normalised with ``ensure_path``.
            Nothing is queued when that yields ``None``.
        dataset: Dataset the ``ServiceQueue`` is created for.
        languages: Language hints stored in the manager context under
            ``'languages'`` exactly as given (``None`` by default).
    """
    context = {'languages': languages}
    conn = get_redis()
    queue = ServiceQueue(conn, ServiceQueue.OP_INGEST, dataset)
    manager = Manager(queue, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity('Document')
                # The stored file's checksum doubles as the entity ID seed.
                checksum = manager.store(path)
                entity.set('contentHash', checksum)
                entity.make_id(checksum)
                entity.set('fileName', path.name)
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even when storing, queueing or crawling raises.
        manager.close()
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a single file, or crawl a directory, for ingest.

    A job is created on ``conn`` for ``dataset`` and its ``OP_INGEST`` stage
    is handed to a ``Manager`` together with ``db``.

    Args:
        db: Passed straight through to ``Manager``.
        conn: Connection used to create the job.
        dataset: Dataset the job is created for.
        path: Location to ingest; normalised with ``ensure_path``. Nothing
            is queued when that yields ``None``.
        languages: Language hints stored in the manager context under
            ``"languages"``. Defaults to a new empty list.
    """
    # The former ``languages=[]`` default was a single list object shared by
    # every call and stored into each context; build a fresh one instead.
    if languages is None:
        languages = []
    context = {"languages": languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity("Document")
                # The stored file's checksum doubles as the entity ID seed.
                checksum = manager.store(path)
                entity.set("contentHash", checksum)
                entity.make_id(checksum)
                entity.set("fileName", path.name)
                log.info("Queue: %r", entity.to_dict())
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even when storing, queueing or crawling raises.
        manager.close()