def test_run(self):
    """Drive a CountingWorker through sync, retry, signal handling and run()."""
    conn = get_fakeredis()
    operation = "lala"
    worker = CountingWorker(conn=conn, stages=[operation])

    # A sync with nothing queued must not dispatch any task.
    worker.sync()
    assert worker.test_done == 0, worker.test_done

    job = Job.create(conn, "test")
    stage = job.get_stage(operation)
    task = stage.queue({}, {})
    assert not job.is_done()
    assert worker.test_done == 0, worker.test_done

    # One queued task: a sync should consume it exactly once.
    worker.sync()
    assert worker.test_done == 1, worker.test_done
    assert job.is_done()

    # Retrying re-opens the job; another sync completes it again.
    worker.retry(task)
    assert not job.is_done()
    worker.sync()
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code
    assert worker.test_done == 1, worker.test_done

    # Signal handling propagates into the exit code.
    worker._handle_signal(5, None)
    assert worker.exit_code == 5, worker.exit_code

    # Non-blocking run drains the queue and resets the exit code.
    worker.retry(task)
    worker.run(blocking=False)
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code

    # Same again with no thread pool configured.
    worker.num_threads = None
    worker.retry(task)
    worker.run(blocking=False)
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code
def test_job_queue(self):
    """Move one task through the pending -> running -> finished states."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)

    # A fresh stage has no work at all.
    counts = stage.get_status()
    assert counts['pending'] == 0
    assert counts['finished'] == 0
    assert job.is_done()

    # Queueing one payload makes it pending.
    stage.queue({'test': 'foo'}, {})
    counts = job.get_status()
    assert counts['pending'] == 1
    assert counts['finished'] == 0
    assert counts['running'] == 0
    assert not job.is_done()

    # Fetching the task moves it to running.
    task = Stage.get_task(self.conn, Stage.INGEST, timeout=None)
    assert task.job.dataset.name == job.dataset.name
    assert task.payload['test'] == 'foo'
    counts = job.get_status()
    assert counts['pending'] == 0
    assert counts['running'] == 1
    assert counts['finished'] == 0
    assert not job.is_done()

    # Completing the task finishes the job.
    task.done()
    counts = job.get_status()
    assert counts['pending'] == 0
    assert counts['running'] == 0
    assert counts['finished'] == 1
    assert job.is_done()
def test_job_queue(self):
    """Move one task through the pending -> running -> finished states."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")

    # A fresh stage has no work at all.
    counts = stage.get_status()
    assert counts["pending"] == 0
    assert counts["finished"] == 0
    assert job.is_done()

    # Queueing one payload makes it pending.
    stage.queue({"test": "foo"}, {})
    counts = job.get_status()
    assert counts["pending"] == 1
    assert counts["finished"] == 0
    assert counts["running"] == 0
    assert not job.is_done()

    # Fetching the task moves it to running.
    task = Stage.get_task(self.conn, "ingest", timeout=None)
    assert task.job.dataset.name == job.dataset.name
    assert task.payload["test"] == "foo"
    counts = job.get_status()
    assert counts["pending"] == 0
    assert counts["running"] == 1
    assert counts["finished"] == 0
    assert not job.is_done()

    # Completing the task finishes the job.
    task.done()
    counts = job.get_status()
    assert counts["pending"] == 0
    assert counts["running"] == 0
    assert counts["finished"] == 1
    assert job.is_done()
def test_fake_finished(self):
    """report_finished() bumps the finished count without queueing tasks."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    assert stage.get_status()['finished'] == 0

    # Report 500 synthetic completions directly on the stage.
    stage.report_finished(500)
    assert stage.get_status()['finished'] == 500

    # The dataset-level aggregate reflects the same count.
    counts = job.dataset.get_status()
    assert counts['finished'] == 500, counts
def test_fake_finished(self):
    """report_finished() bumps the finished count without queueing tasks."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    assert stage.get_status()["finished"] == 0

    # Report 500 synthetic completions directly on the stage.
    stage.report_finished(500)
    assert stage.get_status()["finished"] == 500

    # The dataset-level aggregate reflects the same count.
    counts = job.dataset.get_status()
    assert counts["finished"] == 500, counts
def test_active_dataset_status(self):
    """Active-dataset listing tracks pending tasks and empties on cancel."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    stage.queue({'test': 'bar'}, {})

    # Two pending tasks show up under a single active dataset.
    active = Dataset.get_active_dataset_status(self.conn)
    assert len(active['datasets']) == 1
    assert active['total'] == 1
    assert active['datasets']['test_1']['pending'] == 2

    # Cancelling the dataset clears the active listing entirely.
    job.dataset.cancel()
    active = Dataset.get_active_dataset_status(self.conn)
    assert active['datasets'] == {}
    assert active['total'] == 0
def test_active_dataset_status(self):
    """Active-dataset listing tracks pending tasks and empties on cancel."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    stage.queue({"test": "bar"}, {})

    # Two pending tasks show up under a single active dataset.
    active = Dataset.get_active_dataset_status(self.conn)
    assert len(active["datasets"]) == 1
    assert active["total"] == 1
    assert active["datasets"]["test_1"]["pending"] == 2

    # Cancelling the dataset clears the active listing entirely.
    job.dataset.cancel()
    active = Dataset.get_active_dataset_status(self.conn)
    assert active["datasets"] == {}
    assert active["total"] == 0
def test_fetch_multiple_task(self):
    """get_tasks() returns queued tasks in FIFO order, up to the limit."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    stage.queue({'test': 'bar'}, {})
    assert job.get_status()['pending'] == 2

    # A limit larger than the queue yields everything that is pending.
    fetched = list(stage.get_tasks(limit=5))
    assert len(fetched) == 2
    for item in fetched:
        assert isinstance(item, Task)

    # Order of retrieval matches order of queueing.
    assert fetched[0].payload == {'test': 'foo'}
    assert fetched[1].payload == {'test': 'bar'}
    job.dataset.cancel()
def test_fetch_multiple_task(self):
    """get_tasks() returns queued tasks in FIFO order, up to the limit."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    stage.queue({"test": "bar"}, {})
    assert job.get_status()["pending"] == 2

    # A limit larger than the queue yields everything that is pending.
    fetched = list(stage.get_tasks(limit=5))
    assert len(fetched) == 2
    for item in fetched:
        assert isinstance(item, Task)

    # Order of retrieval matches order of queueing.
    assert fetched[0].payload == {"test": "foo"}
    assert fetched[1].payload == {"test": "bar"}
    job.dataset.cancel()
def test_queue_clear(self):
    """Both dataset.cancel() and job.remove() drop pending tasks."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)

    # Cancelling the dataset clears the pending queue.
    stage.queue({'test': 'foo'}, {})
    assert stage.get_status()['pending'] == 1
    job.dataset.cancel()
    assert stage.get_status()['pending'] == 0

    # Removing the job has the same effect.
    stage.queue({'test': 'foo'}, {})
    assert stage.get_status()['pending'] == 1
    job.remove()
    assert stage.get_status()['pending'] == 0
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a file or directory at *path* for ingest into *dataset*.

    :param db: ftm/balkhash store handle passed through to the Manager.
    :param conn: redis connection used to create the job.
    :param dataset: name of the dataset the job belongs to.
    :param path: file or directory to ingest; ignored if it resolves to None.
    :param languages: optional list of language hints for the ingest context.
    """
    # NOTE: default was a mutable `[]`, which is shared across calls in
    # Python; use None and normalize here instead.
    context = {'languages': languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            # Build a Document entity keyed on the stored file's checksum.
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            # Directories are walked recursively by the crawler.
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def setUp(self):
    """Point services at fake/temporary backends and build a test Manager."""
    # Force tests to use fake configuration: no real redis, file archive
    # and leveldb store in throwaway temp directories.
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    balkhash_settings.BACKEND = 'LEVELDB'
    balkhash_settings.LEVELDB_PATH = mkdtemp()

    conn = get_fakeredis()
    stage = Stage(Job.create(conn, 'test'), Stage.INGEST)
    self.manager = Manager(stage, {})

    # Capture emitted/queued entities in-memory instead of dispatching.
    self.manager.entities = []
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def test_queue_clear(self):
    """Both dataset.cancel() and job.remove() drop pending tasks."""
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")

    # Cancelling the dataset clears the pending queue.
    stage.queue({"test": "foo"}, {})
    assert stage.get_status()["pending"] == 1
    job.dataset.cancel()
    assert stage.get_status()["pending"] == 0

    # Removing the job has the same effect.
    stage.queue({"test": "foo"}, {})
    assert stage.get_status()["pending"] == 1
    job.remove()
    assert stage.get_status()["pending"] == 0
def setUp(self):
    """Point services at fake/temporary backends and build a test Manager."""
    # Force tests to use fake configuration: no real redis, file archive
    # in a throwaway temp directory, and an in-memory sqlite store.
    ingestors_settings.TESTING = True
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    ftmstore_settings.DATABASE_URI = 'sqlite://'

    conn = get_fakeredis()
    job = Job.create(conn, 'test')
    stage = Stage(job, OP_INGEST)
    dataset = get_dataset(job.dataset.name, OP_INGEST)
    self.manager = Manager(dataset, stage, {})

    # Capture emitted/queued entities in-memory instead of dispatching.
    self.manager.entities = []
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def _ingest_path(db, conn, dataset, path, languages=None):
    """Queue a file or directory at *path* for ingest into *dataset*.

    :param db: ftm store handle passed through to the Manager.
    :param conn: redis connection used to create the job.
    :param dataset: name of the dataset the job belongs to.
    :param path: file or directory to ingest; ignored if it resolves to None.
    :param languages: optional list of language hints for the ingest context.
    """
    # NOTE: default was a mutable `[]`, which is shared across calls in
    # Python; use None and normalize here instead.
    context = {"languages": languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            # Build a Document entity keyed on the stored file's checksum.
            entity = manager.make_entity("Document")
            checksum = manager.store(path)
            entity.set("contentHash", checksum)
            entity.make_id(checksum)
            entity.set("fileName", path.name)
            log.info("Queue: %r", entity.to_dict())
            manager.queue_entity(entity)
        if path.is_dir():
            # Directories are walked recursively by the crawler.
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    conn = get_redis()
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(Stage.INGEST)
    manager = Manager(stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            # Build a Document entity keyed on the stored file's checksum.
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            # Directories are walked recursively by the crawler.
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def setUp(self):
    """Provide a redis connection, identifiers and a fresh job per test."""
    self.dataset = "my-dataset"
    self.operation = "OP_FOO"
    self.conn = get_redis()
    self.job = Job.create(self.conn, self.dataset)