Code Example #1
File: queue.py Project: pombredanne/memorious
 def queue(cls, stage, state, data):
     # conn and MAX_QUEUE_LENGTH are module-level names in the original file.
     crawler = state.get('crawler')
     job = Job(conn, str(crawler), state['run_id'])
     job_stage = job.get_stage(stage)
     queue_length = job_stage.get_status().get('pending')
     if queue_length > MAX_QUEUE_LENGTH:
         msg = "queue for %s:%s too big."
         raise QueueTooBigError(msg % (str(crawler), stage))
     job_stage.queue(payload=data, context=state)
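This memorious variant applies simple backpressure: when a stage's pending count exceeds MAX_QUEUE_LENGTH, it raises instead of enqueueing. A minimal caller sketch, assuming init_stage and state are built as in Examples #17-#19 below and log is a module logger as in the other examples:

try:
    Queue.queue(init_stage, state, {})
except QueueTooBigError as exc:
    # Hypothetical handling: back off until workers drain the backlog.
    log.warning("Deferring scheduling: %s", exc)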
Code Example #2
File: queue.py Project: vishalbelsare/memorious
 def queue(cls, stage, state, data):
     crawler = state.get("crawler")
     job = Job(conn, str(crawler), state["run_id"])
     job_stage = job.get_stage(stage.namespaced_name)
     job_stage.sync()
     queue_length = job_stage.get_status().get("pending")
     if queue_length > MAX_QUEUE_LENGTH:
         msg = "queue for %s:%s too big."
         raise QueueTooBigError(msg % (str(crawler), stage))
     job_stage.queue(payload=data, context=state)
Code Example #3
 def test_job_queue(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage("ingest")
     status = stage.get_status()
     assert status["pending"] == 0
     assert status["finished"] == 0
     assert job.is_done()
     stage.queue({"test": "foo"}, {})
     status = job.get_status()
     assert status["pending"] == 1
     assert status["finished"] == 0
     assert status["running"] == 0
     assert not job.is_done()
     task = Stage.get_task(self.conn, "ingest", timeout=None)
     assert task.job.dataset.name == job.dataset.name
     assert task.payload["test"] == "foo"
     status = job.get_status()
     assert status["pending"] == 0
     assert status["running"] == 1
     assert status["finished"] == 0
     assert not job.is_done()
     task.done()
     status = job.get_status()
     assert status["pending"] == 0
     assert status["running"] == 0
     assert status["finished"] == 1
     assert job.is_done()
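The test walks the full task lifecycle: stage.queue() increments pending, Stage.get_task() moves a task to running, and task.done() moves it to finished. Condensed into a worker-style loop under the same assumptions (process_payload is a hypothetical handler):

job = Job.create(conn, dataset)
stage = job.get_stage("ingest")
stage.queue({"test": "foo"}, {})                      # pending: 1
while not job.is_done():
    task = Stage.get_task(conn, "ingest", timeout=5)  # pending -> running
    if task is None:
        continue
    process_payload(task.payload)                     # hypothetical work
    task.done()                                       # running -> finished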
Code Example #4
File: util.py Project: stofstar/aleph
def get_session_id():
    role_id = stringify(request.authz.id) or 'anonymous'
    session_id = None
    if hasattr(request, '_session_id'):
        session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return '%s:%s' % (role_id, session_id)
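The helper composes a role-scoped session key, falling back to a random job id when the request carries none. A sketch of the fallback chain with hypothetical values:

role_id = "42"                              # stringified authenticated role
session_id = None                           # request has no _session_id
session_id = session_id or Job.random_id()  # opaque random identifier
key = "%s:%s" % (role_id, session_id)       # e.g. "42:<random-id>"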
Code Example #5
 def test_job_queue(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage(Stage.INGEST)
     status = stage.get_status()
     assert status['pending'] == 0
     assert status['finished'] == 0
     assert job.is_done()
     stage.queue({'test': 'foo'}, {})
     status = job.get_status()
     assert status['pending'] == 1
     assert status['finished'] == 0
     assert status['running'] == 0
     assert not job.is_done()
     task = Stage.get_task(self.conn, Stage.INGEST,
                           timeout=None)
     assert task.job.dataset.name == job.dataset.name
     assert task.payload['test'] == 'foo'
     status = job.get_status()
     assert status['pending'] == 0
     assert status['running'] == 1
     assert status['finished'] == 0
     assert not job.is_done()
     task.done()
     status = job.get_status()
     assert status['pending'] == 0
     assert status['running'] == 0
     assert status['finished'] == 1
     assert job.is_done()
Code Example #6
File: test_worker.py Project: alephdata/servicelayer
 def test_run(self):
     conn = get_fakeredis()
     operation = "lala"
     worker = CountingWorker(conn=conn, stages=[operation])
     worker.sync()
     assert worker.test_done == 0, worker.test_done
     job = Job.create(conn, "test")
     stage = job.get_stage(operation)
     task = stage.queue({}, {})
     assert not job.is_done()
     assert worker.test_done == 0, worker.test_done
     worker.sync()
     assert worker.test_done == 1, worker.test_done
     assert job.is_done()
     worker.retry(task)
     assert not job.is_done()
     worker.sync()
     assert job.is_done()
     assert worker.exit_code == 0, worker.exit_code
     assert worker.test_done == 1, worker.test_done
     worker._handle_signal(5, None)
     assert worker.exit_code == 5, worker.exit_code
     worker.retry(task)
     worker.run(blocking=False)
     assert job.is_done()
     assert worker.exit_code == 0, worker.exit_code
     worker.num_threads = None
     worker.retry(task)
     worker.run(blocking=False)
     assert job.is_done()
     assert worker.exit_code == 0, worker.exit_code
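CountingWorker itself is not shown above. A plausible minimal version, assuming servicelayer workers subclass Worker and override a per-task handler (the exact hook name may differ between versions; this is a sketch, not the project's implementation):

class CountingWorker(Worker):
    test_done = 0

    def handle(self, task):
        # Assumed override point: invoked once per dequeued task.
        self.test_done += 1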
Code Example #7
def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection)
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
Code Example #8
File: collections.py Project: djoffrey/aleph
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
Code Example #9
 def test_fake_finished(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage("ingest")
     status = stage.get_status()
     assert status["finished"] == 0
     stage.report_finished(500)
     status = stage.get_status()
     assert status["finished"] == 500
     status = job.dataset.get_status()
     assert status["finished"] == 500, status
Code Example #10
 def test_fake_finished(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage(Stage.INGEST)
     status = stage.get_status()
     assert status['finished'] == 0
     stage.report_finished(500)
     status = stage.get_status()
     assert status['finished'] == 500
     status = job.dataset.get_status()
     assert status['finished'] == 500, status
Code Example #11
File: reporting.py Project: alephdata/servicelayer
    def handle(self,
               status,
               operation=None,
               exception=None,
               task=None,
               **payload):
        """Report a processing event that may be related to a task."""
        if not WORKER_REPORTING:
            return

        task = task or self.task
        if task is not None:
            payload["task"] = task.serialize()
            stage = task.stage
        else:
            stage = self.stage
        dataset = stage.job.dataset.name
        job_id = stage.job.id
        operation = operation or stage.stage

        now = datetime.utcnow()
        payload.update({
            "dataset": dataset,
            "operation": operation,
            "job": job_id,
            "status": status,
            "updated_at": now,
            "%s_at" % status: now,
            "has_error": False,
        })

        if exception is not None:
            payload.update({
                "status": Status.ERROR,
                "has_error": True,
                "error_name": exception.__class__.__name__,
                "error_msg": stringify(exception),
            })

        job = Job(stage.conn, dataset, job_id)
        stage = job.get_stage(OP_REPORT)
        stage.queue(payload)
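Every event the reporter handles is itself queued as a payload on a dedicated OP_REPORT stage of the same job. A hedged usage sketch (the TaskReporter construction and the "start"/"end" status labels are hypothetical; only Status.ERROR appears in the code above):

reporter = TaskReporter(task=task)   # hypothetical wrapper exposing handle()
reporter.handle(status="start")
try:
    process(task)                    # hypothetical processing step
    reporter.handle(status="end")
except Exception as exc:
    reporter.handle(status="end", exception=exc)  # forces Status.ERROR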
Code Example #12
 def test_active_dataset_status(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage(Stage.INGEST)
     stage.queue({'test': 'foo'}, {})
     stage.queue({'test': 'bar'}, {})
     status = Dataset.get_active_dataset_status(self.conn)
     assert len(status['datasets']) == 1
     assert status['total'] == 1
     assert status['datasets']['test_1']['pending'] == 2
     job.dataset.cancel()
     status = Dataset.get_active_dataset_status(self.conn)
     assert status['datasets'] == {}
     assert status['total'] == 0
Code Example #13
 def test_active_dataset_status(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage("ingest")
     stage.queue({"test": "foo"}, {})
     stage.queue({"test": "bar"}, {})
     status = Dataset.get_active_dataset_status(self.conn)
     assert len(status["datasets"]) == 1
     assert status["total"] == 1
     assert status["datasets"]["test_1"]["pending"] == 2
     job.dataset.cancel()
     status = Dataset.get_active_dataset_status(self.conn)
     assert status["datasets"] == {}
     assert status["total"] == 0
Code Example #14
 def test_fetch_multiple_task(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage(Stage.INGEST)
     stage.queue({'test': 'foo'}, {})
     stage.queue({'test': 'bar'}, {})
     status = job.get_status()
     assert status['pending'] == 2
     tasks = list(stage.get_tasks(limit=5))
     assert len(tasks) == 2
     for task in tasks:
         assert isinstance(task, Task)
     assert tasks[0].payload == {'test': 'foo'}
     assert tasks[1].payload == {'test': 'bar'}
     job.dataset.cancel()
Code Example #15
 def test_fetch_multiple_task(self):
     job = Job.create(self.conn, self.dataset)
     stage = job.get_stage("ingest")
     stage.queue({"test": "foo"}, {})
     stage.queue({"test": "bar"}, {})
     status = job.get_status()
     assert status["pending"] == 2
     tasks = list(stage.get_tasks(limit=5))
     assert len(tasks) == 2
     for task in tasks:
         assert isinstance(task, Task)
     assert tasks[0].payload == {"test": "foo"}
     assert tasks[1].payload == {"test": "bar"}
     job.dataset.cancel()
Code Example #16
File: documents.py Project: wayne9qiu/aleph
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
Code Example #17
File: crawler.py Project: vishalbelsare/memorious
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
            "continue_on_error": settings.CONTINUE_ON_ERROR,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
Code Example #18
File: crawler.py Project: x0rzkov/memorious
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})
Code Example #19
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
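All three run() variants build the same run state and hand the crawler's initial stage to Queue.queue. Kicking a crawler off might look like this (the registry lookup is hypothetical):

crawler = manager.get("my_crawler")  # hypothetical crawler registry lookup
crawler.run(incremental=True)        # override the settings default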
Code Example #20
File: cli.py Project: djoffrey/aleph
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Code Example #21
    def test_queue_clear(self):
        job = Job.create(self.conn, self.dataset)
        stage = job.get_stage(Stage.INGEST)

        stage.queue({'test': 'foo'}, {})
        status = stage.get_status()
        assert status['pending'] == 1
        job.dataset.cancel()
        status = stage.get_status()
        assert status['pending'] == 0

        stage.queue({'test': 'foo'}, {})
        status = stage.get_status()
        assert status['pending'] == 1
        job.remove()
        status = stage.get_status()
        assert status['pending'] == 0
Code Example #22
File: support.py Project: we1l1n/aleph
 def setUp(self):
     # Force tests to use fake configuration
     service_settings.REDIS_URL = None
     service_settings.ARCHIVE_TYPE = 'file'
     service_settings.ARCHIVE_PATH = mkdtemp()
     balkhash_settings.BACKEND = 'LEVELDB'
     balkhash_settings.LEVELDB_PATH = mkdtemp()
     conn = get_fakeredis()
     job = Job.create(conn, 'test')
     stage = Stage(job, Stage.INGEST)
     self.manager = Manager(stage, {})
     self.manager.entities = []
     self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
     self.manager.queue_entity = types.MethodType(queue_entity,
                                                  self.manager)  # noqa
     self.archive = init_archive()
     self.manager._archive = self.archive
Code Example #23
    def test_queue_clear(self):
        job = Job.create(self.conn, self.dataset)
        stage = job.get_stage("ingest")

        stage.queue({"test": "foo"}, {})
        status = stage.get_status()
        assert status["pending"] == 1
        job.dataset.cancel()
        status = stage.get_status()
        assert status["pending"] == 0

        stage.queue({"test": "foo"}, {})
        status = stage.get_status()
        assert status["pending"] == 1
        job.remove()
        status = stage.get_status()
        assert status["pending"] == 0
Code Example #24
File: manage.py Project: sam-heller/aleph
def load_entities(foreign_id, infile, unsafe=False):
    """Load FtM entities from the specified iJSON file."""
    collection = ensure_collection(foreign_id, foreign_id)

    def read_entities():
        for idx in count(1):
            line = infile.readline()
            if not line:
                return
            if idx % 1000 == 0:
                log.info("[%s] Loaded %s entities from: %s",
                         foreign_id, idx, infile.name)
            yield json.loads(line)

    job_id = Job.random_id()
    log.info("Loading [%s]: %s", job_id, foreign_id)
    bulk_write(collection, read_entities(), job_id=job_id, unsafe=unsafe)
    update_collection(collection)
Code Example #25
File: support.py Project: djoffrey/aleph
 def setUp(self):
     # Force tests to use fake configuration
     ingestors_settings.TESTING = True
     service_settings.REDIS_URL = None
     service_settings.ARCHIVE_TYPE = 'file'
     service_settings.ARCHIVE_PATH = mkdtemp()
     ftmstore_settings.DATABASE_URI = 'sqlite://'
     conn = get_fakeredis()
     job = Job.create(conn, 'test')
     stage = Stage(job, OP_INGEST)
     dataset = get_dataset(job.dataset.name, OP_INGEST)
     self.manager = Manager(dataset, stage, {})
     self.manager.entities = []
     self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
     self.manager.queue_entity = types.MethodType(queue_entity,
                                                  self.manager)  # noqa
     self.archive = init_archive()
     self.manager._archive = self.archive
Code Example #26
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {"languages": languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity("Document")
            checksum = manager.store(path)
            entity.set("contentHash", checksum)
            entity.make_id(checksum)
            entity.set("fileName", path.name)
            log.info("Queue: %r", entity.to_dict())
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Code Example #27
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    context = {'languages': languages}
    conn = get_redis()
    job = Job.create(conn, dataset)
    stage = job.get_stage(Stage.INGEST)
    manager = Manager(stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Code Example #28
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    content_hash = None
    if not path.is_dir():
        content_hash = archive.archive_file(path)
    foreign_id = path.name
    if parent is not None:
        foreign_id = os.path.join(parent.foreign_id, foreign_id)
    meta = {'file_name': path.name}
    document = Document.save(collection,
                             parent=parent,
                             foreign_id=foreign_id,
                             content_hash=content_hash,
                             meta=meta)
    db.session.commit()
    job_id = job_id or Job.random_id()
    ingest_entity(collection, document.to_proxy(), job_id=job_id)
    log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
    if path.is_dir():
        for child in path.iterdir():
            crawl_directory(collection, child, document, job_id)
Code Example #29
File: crawl.py Project: vishalbelsare/memorious
 def abort_run(cls, crawler, run_id):
     # conn, make_key, REDIS_LONG and pack_now come from module-level
     # imports in the original file.
     conn.sadd(make_key(crawler, "runs_abort"), run_id)
     conn.expire(make_key(crawler, "runs_abort"), REDIS_LONG)
     conn.setnx(make_key(crawler, "run", run_id, "end"), pack_now())
     job = Job(conn, crawler.queue, run_id)
     job.remove()
Code Example #30
File: queues.py Project: wdsn/aleph
def get_stage(collection, stage, job_id=None):
    # kv is aleph's module-level Redis connection.
    job_id = job_id or Job.random_id()
    job = Job(kv, collection.foreign_id, job_id)
    return job.get_stage(stage)
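A minimal sketch of using get_stage() to enqueue work for a collection, reusing the stage.queue() signature from the examples above (the payload contents are hypothetical):

stage = get_stage(collection, "ingest")   # a fresh Job.random_id() per call
stage.queue({"entity_id": "abc123"}, {})  # hypothetical payload and context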