def queue(cls, stage, state, data):
    crawler = state.get('crawler')
    job = Job(conn, str(crawler), state['run_id'])
    job_stage = job.get_stage(stage)
    queue_length = job_stage.get_status().get('pending')
    if queue_length > MAX_QUEUE_LENGTH:
        msg = "queue for %s:%s too big."
        raise QueueTooBigError(msg % (str(crawler), stage))
    job_stage.queue(payload=data, context=state)
def queue(cls, stage, state, data):
    crawler = state.get("crawler")
    job = Job(conn, str(crawler), state["run_id"])
    job_stage = job.get_stage(stage.namespaced_name)
    job_stage.sync()
    queue_length = job_stage.get_status().get("pending")
    if queue_length > MAX_QUEUE_LENGTH:
        msg = "queue for %s:%s too big."
        raise QueueTooBigError(msg % (str(crawler), stage))
    job_stage.queue(payload=data, context=state)
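# A minimal caller-side sketch (not from the source) of how the
# MAX_QUEUE_LENGTH backpressure above might be handled: catch
# QueueTooBigError and retry after a pause. The `time` import and the
# retry parameters are assumptions for illustration.
import time

def queue_with_backoff(stage, state, data, attempts=3, delay=30):
    for attempt in range(attempts):
        try:
            Queue.queue(stage, state, data)
            return True
        except QueueTooBigError:
            # Pending tasks exceeded MAX_QUEUE_LENGTH; wait for workers
            # to drain the stage before trying again.
            time.sleep(delay)
    return False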
def test_job_queue(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    status = stage.get_status()
    assert status["pending"] == 0
    assert status["finished"] == 0
    assert job.is_done()
    stage.queue({"test": "foo"}, {})
    status = job.get_status()
    assert status["pending"] == 1
    assert status["finished"] == 0
    assert status["running"] == 0
    assert not job.is_done()
    task = Stage.get_task(self.conn, "ingest", timeout=None)
    assert task.job.dataset.name == job.dataset.name
    assert task.payload["test"] == "foo"
    status = job.get_status()
    assert status["pending"] == 0
    assert status["running"] == 1
    assert status["finished"] == 0
    assert not job.is_done()
    task.done()
    status = job.get_status()
    assert status["pending"] == 0
    assert status["running"] == 0
    assert status["finished"] == 1
    assert job.is_done()
def get_session_id():
    role_id = stringify(request.authz.id) or 'anonymous'
    session_id = None
    if hasattr(request, '_session_id'):
        session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return '%s:%s' % (role_id, session_id)
def test_job_queue(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    status = stage.get_status()
    assert status['pending'] == 0
    assert status['finished'] == 0
    assert job.is_done()
    stage.queue({'test': 'foo'}, {})
    status = job.get_status()
    assert status['pending'] == 1
    assert status['finished'] == 0
    assert status['running'] == 0
    assert not job.is_done()
    task = Stage.get_task(self.conn, Stage.INGEST, timeout=None)
    assert task.job.dataset.name == job.dataset.name
    assert task.payload['test'] == 'foo'
    status = job.get_status()
    assert status['pending'] == 0
    assert status['running'] == 1
    assert status['finished'] == 0
    assert not job.is_done()
    task.done()
    status = job.get_status()
    assert status['pending'] == 0
    assert status['running'] == 0
    assert status['finished'] == 1
    assert job.is_done()
def test_run(self):
    conn = get_fakeredis()
    operation = "lala"
    worker = CountingWorker(conn=conn, stages=[operation])
    worker.sync()
    assert worker.test_done == 0, worker.test_done
    job = Job.create(conn, "test")
    stage = job.get_stage(operation)
    task = stage.queue({}, {})
    assert not job.is_done()
    assert worker.test_done == 0, worker.test_done
    worker.sync()
    assert worker.test_done == 1, worker.test_done
    assert job.is_done()
    worker.retry(task)
    assert not job.is_done()
    worker.sync()
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code
    assert worker.test_done == 1, worker.test_done
    worker._handle_signal(5, None)
    assert worker.exit_code == 5, worker.exit_code
    worker.retry(task)
    worker.run(blocking=False)
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code
    worker.num_threads = None
    worker.retry(task)
    worker.run(blocking=False)
    assert job.is_done()
    assert worker.exit_code == 0, worker.exit_code
def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection)
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
def test_fake_finished(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    status = stage.get_status()
    assert status["finished"] == 0
    stage.report_finished(500)
    status = stage.get_status()
    assert status["finished"] == 500
    status = job.dataset.get_status()
    assert status["finished"] == 500, status
def test_fake_finished(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    status = stage.get_status()
    assert status['finished'] == 0
    stage.report_finished(500)
    status = stage.get_status()
    assert status['finished'] == 500
    status = job.dataset.get_status()
    assert status['finished'] == 500, status
def handle(self, status, operation=None, exception=None, task=None, **payload):
    """Report a processing event that may be related to a task."""
    if not WORKER_REPORTING:
        return
    task = task or self.task
    if task is not None:
        payload["task"] = task.serialize()
        stage = task.stage
    else:
        stage = self.stage
    dataset = stage.job.dataset.name
    job_id = stage.job.id
    operation = operation or stage.stage
    now = datetime.utcnow()
    payload.update({
        "dataset": dataset,
        "operation": operation,
        "job": job_id,
        "status": status,
        "updated_at": now,
        "%s_at" % status: now,
        "has_error": False,
    })
    if exception is not None:
        payload.update({
            "status": Status.ERROR,
            "has_error": True,
            "error_name": exception.__class__.__name__,
            "error_msg": stringify(exception),
        })
    job = Job(stage.conn, dataset, job_id)
    stage = job.get_stage(OP_REPORT)
    stage.queue(payload)
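# Hypothetical caller for the reporting handler above (not from the
# source): report around a unit of work, letting handle() attach the
# error details when an exception occurs. The status strings and the
# do_work() helper are illustrative assumptions.
def process_and_report(reporter, task):
    reporter.handle(status="start", task=task)
    try:
        do_work(task)  # placeholder for the real task logic
        reporter.handle(status="end", task=task)
    except Exception as exc:
        # handle() overrides the status with Status.ERROR and records
        # error_name/error_msg before queueing on the OP_REPORT stage.
        reporter.handle(status="error", task=task, exception=exc)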
def test_active_dataset_status(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    stage.queue({'test': 'bar'}, {})
    status = Dataset.get_active_dataset_status(self.conn)
    assert len(status['datasets']) == 1
    assert status['total'] == 1
    assert status['datasets']['test_1']['pending'] == 2
    job.dataset.cancel()
    status = Dataset.get_active_dataset_status(self.conn)
    assert status['datasets'] == {}
    assert status['total'] == 0
def test_active_dataset_status(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    stage.queue({"test": "bar"}, {})
    status = Dataset.get_active_dataset_status(self.conn)
    assert len(status["datasets"]) == 1
    assert status["total"] == 1
    assert status["datasets"]["test_1"]["pending"] == 2
    job.dataset.cancel()
    status = Dataset.get_active_dataset_status(self.conn)
    assert status["datasets"] == {}
    assert status["total"] == 0
def test_fetch_multiple_task(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    stage.queue({'test': 'bar'}, {})
    status = job.get_status()
    assert status['pending'] == 2
    tasks = list(stage.get_tasks(limit=5))
    assert len(tasks) == 2
    for task in tasks:
        assert isinstance(task, Task)
    assert tasks[0].payload == {'test': 'foo'}
    assert tasks[1].payload == {'test': 'bar'}
    job.dataset.cancel()
def test_fetch_multiple_task(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    stage.queue({"test": "bar"}, {})
    status = job.get_status()
    assert status["pending"] == 2
    tasks = list(stage.get_tasks(limit=5))
    assert len(tasks) == 2
    for task in tasks:
        assert isinstance(task, Task)
    assert tasks[0].payload == {"test": "foo"}
    assert tasks[1].payload == {"test": "bar"}
    job.dataset.cancel()
def crawl_directory(collection, path, parent=None, job_id=None): """Crawl the contents of the given path.""" try: content_hash = None if not path.is_dir(): content_hash = archive.archive_file(path) foreign_id = path.name if parent is not None: foreign_id = os.path.join(parent.foreign_id, foreign_id) # if the job_id is not set yet and path.is_dir(), we know it is the # first iteration and we don't create an initial root folder as parent # to be consistent with the behaviour of alephclient if path.is_dir() and job_id is None: document = None job_id = Job.random_id() else: meta = {"file_name": path.name} document = Document.save( collection, parent=parent, foreign_id=foreign_id, content_hash=content_hash, meta=meta, ) db.session.commit() job_id = job_id or Job.random_id() proxy = document.to_proxy() ingest_flush(collection, entity_id=proxy.id) ingest_entity(collection, proxy, job_id=job_id) log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id) if path.is_dir(): for child in path.iterdir(): crawl_directory(collection, child, document, job_id) except OSError: log.exception("Cannot crawl directory: %s", path)
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        "crawler": self.name,
        "run_id": run_id or Job.random_id(),
        "incremental": settings.INCREMENTAL,
        "continue_on_error": settings.CONTINUE_ON_ERROR,
    }
    if incremental is not None:
        state["incremental"] = incremental
    # Cancel previous runs:
    self.cancel()
    init_stage = self.get(self.init_stage)
    Queue.queue(init_stage, state, {})
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        'crawler': self.name,
        'run_id': run_id or Job.random_id(),
        'incremental': settings.INCREMENTAL
    }
    if incremental is not None:
        state['incremental'] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    Queue.queue(self.init_stage, state, {})
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        "crawler": self.name,
        "run_id": run_id or Job.random_id(),
        "incremental": settings.INCREMENTAL,
    }
    if incremental is not None:
        state["incremental"] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    init_stage = self.get(self.init_stage)
    Queue.queue(init_stage, state, {})
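# Hypothetical trigger for run() above (not from the source): start a
# non-incremental run with an explicit run_id, which the abort_run()
# helper further below can later target. The crawler argument is
# assumed to be an instance of the class defining run().
def start_full_run(crawler):
    run_id = Job.random_id()
    crawler.run(incremental=False, run_id=run_id)
    return run_id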
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
        manager.close()
def test_queue_clear(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    status = stage.get_status()
    assert status['pending'] == 1
    job.dataset.cancel()
    status = stage.get_status()
    assert status['pending'] == 0
    stage.queue({'test': 'foo'}, {})
    status = stage.get_status()
    assert status['pending'] == 1
    job.remove()
    status = stage.get_status()
    assert status['pending'] == 0
def setUp(self):
    # Force tests to use fake configuration
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    balkhash_settings.BACKEND = 'LEVELDB'
    balkhash_settings.LEVELDB_PATH = mkdtemp()
    conn = get_fakeredis()
    job = Job.create(conn, 'test')
    stage = Stage(job, Stage.INGEST)
    self.manager = Manager(stage, {})
    self.manager.entities = []
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def test_queue_clear(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    status = stage.get_status()
    assert status["pending"] == 1
    job.dataset.cancel()
    status = stage.get_status()
    assert status["pending"] == 0
    stage.queue({"test": "foo"}, {})
    status = stage.get_status()
    assert status["pending"] == 1
    job.remove()
    status = stage.get_status()
    assert status["pending"] == 0
def load_entities(foreign_id, infile, unsafe=False):
    """Load FtM entities from the specified iJSON file."""
    collection = ensure_collection(foreign_id, foreign_id)

    def read_entities():
        for idx in count(1):
            line = infile.readline()
            if not line:
                return
            if idx % 1000 == 0:
                log.info("[%s] Loaded %s entities from: %s",
                         foreign_id, idx, infile.name)
            yield json.loads(line)

    job_id = Job.random_id()
    log.info("Loading [%s]: %s", job_id, foreign_id)
    bulk_write(collection, read_entities(), job_id=job_id, unsafe=unsafe)
    update_collection(collection)
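# Hedged usage sketch for load_entities above (not from the source):
# stream a line-delimited FtM JSON export into a collection. The file
# path and wrapper name are illustrative assumptions.
def load_from_file(foreign_id, path):
    with open(path, "r") as infile:
        # read_entities() pulls one JSON entity per line until EOF, so
        # arbitrarily large exports stream without loading into memory.
        load_entities(foreign_id, infile, unsafe=False)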
def setUp(self):
    # Force tests to use fake configuration
    ingestors_settings.TESTING = True
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    ftmstore_settings.DATABASE_URI = 'sqlite://'
    conn = get_fakeredis()
    job = Job.create(conn, 'test')
    stage = Stage(job, OP_INGEST)
    dataset = get_dataset(job.dataset.name, OP_INGEST)
    self.manager = Manager(dataset, stage, {})
    self.manager.entities = []
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)  # noqa
    self.archive = init_archive()
    self.manager._archive = self.archive
def _ingest_path(db, conn, dataset, path, languages=[]): context = {"languages": languages} job = Job.create(conn, dataset) stage = job.get_stage(OP_INGEST) manager = Manager(db, stage, context) path = ensure_path(path) if path is not None: if path.is_file(): entity = manager.make_entity("Document") checksum = manager.store(path) entity.set("contentHash", checksum) entity.make_id(checksum) entity.set("fileName", path.name) log.info("Queue: %r", entity.to_dict()) manager.queue_entity(entity) if path.is_dir(): DirectoryIngestor.crawl(manager, path) manager.close()
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    context = {'languages': languages}
    conn = get_redis()
    job = Job.create(conn, dataset)
    stage = job.get_stage(Stage.INGEST)
    manager = Manager(stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
        manager.close()
def crawl_directory(collection, path, parent=None, job_id=None): """Crawl the contents of the given path.""" content_hash = None if not path.is_dir(): content_hash = archive.archive_file(path) foreign_id = path.name if parent is not None: foreign_id = os.path.join(parent.foreign_id, foreign_id) meta = {'file_name': path.name} document = Document.save(collection, parent=parent, foreign_id=foreign_id, content_hash=content_hash, meta=meta) db.session.commit() job_id = job_id or Job.random_id() ingest_entity(collection, document.to_proxy(), job_id=job_id) log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id) if path.is_dir(): for child in path.iterdir(): crawl_directory(collection, child, document, job_id)
def abort_run(cls, crawler, run_id):
    conn.sadd(make_key(crawler, "runs_abort"), run_id)
    conn.expire(make_key(crawler, "runs_abort"), REDIS_LONG)
    conn.setnx(make_key(crawler, "run", run_id, "end"), pack_now())
    job = Job(conn, crawler.queue, run_id)
    job.remove()
def get_stage(collection, stage, job_id=None):
    job_id = job_id or Job.random_id()
    job = Job(kv, collection.foreign_id, job_id)
    return job.get_stage(stage)
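# Illustrative use of the get_stage() helper above (not from the
# source): queue a payload onto a named stage for a collection. The
# OP_INGEST constant appears elsewhere in this section; the wrapper
# name and payload are assumptions.
def queue_ingest(collection, payload):
    stage = get_stage(collection, OP_INGEST)
    # stage.queue(payload, context) mirrors the call pattern used in
    # the tests above.
    stage.queue(payload, {})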