Esempio n. 1
0
def ingest(file_path):
    """Run the ingestors over a single file with a throwaway manager.

    :param file_path: Location of the file to ingest.
    :type file_path: str
    :return: The result of ``Manager.ingest`` -- a tuple of the ingestor
        object, its data and the detached ingestors data.
    :rtype: tuple
    """
    # A new manager, constructed from an empty dict, is built on every call.
    return Manager({}).ingest(file_path)
Esempio n. 2
0
class TestCase(unittest.TestCase):
    """Base test case that wires a ``Manager`` to temporary/fake backends."""

    def setUp(self):
        """Override the settings and build the manager used by the tests."""
        # Force tests to use fake configuration
        ingestors_settings.TESTING = True
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = "file"
        service_settings.ARCHIVE_PATH = mkdtemp()
        ftmstore_settings.DATABASE_URI = "sqlite://"

        redis_conn = get_fakeredis()
        test_job = Job.create(redis_conn, "test")
        ingest_stage = Stage(test_job, OP_INGEST)
        store = get_dataset(test_job.dataset.name, OP_INGEST)
        Tags("ingest_cache").delete()

        manager = self.manager = Manager(store, ingest_stage, {})
        manager.entities = []
        # Swap in the emit_entity / queue_entity functions (defined outside
        # this class) as bound methods of this manager instance.
        manager.emit_entity = types.MethodType(emit_entity, manager)
        manager.queue_entity = types.MethodType(queue_entity, manager)

        self.archive = init_archive()
        manager._archive = self.archive

    def fixture(self, fixture_path):
        """Return ``(path, entity)`` for a fixture below ``fixtures/``.

        The manager's entity list and dataset are reset first. A missing
        path raises ``RuntimeError``. For a regular file the file is stored
        and the entity receives its content hash, size and name; for any
        other existing path the entity id is derived from ``fixture_path``.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()

        fixtures_dir = ensure_path(__file__).parent.joinpath("fixtures")
        path = fixtures_dir.joinpath(fixture_path)
        entity = self.manager.make_entity("Document")

        if not path.exists():
            raise RuntimeError(path)
        if not path.is_file():
            entity.make_id(fixture_path)
            return path, entity

        content_hash = self.manager.store(path)
        entity.make_id(path.name, content_hash)
        entity.set("contentHash", content_hash)
        entity.set("fileSize", path.stat().st_size)
        entity.set("fileName", path.name)
        return path, entity

    def get_emitted(self, schema=None):
        """List the dataset's entities, optionally only those of ``schema``."""
        emitted = list(self.manager.dataset.iterate())
        if schema is None:
            return emitted
        return [item for item in emitted if item.schema.is_a(schema)]

    def get_emitted_by_id(self, id):
        """Look up a single entity in the manager's dataset by its id."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert that the entity's processing status is the success status."""
        status = entity.first("processingStatus")
        self.assertEqual(status, self.manager.STATUS_SUCCESS)
Esempio n. 3
0
class TestCase(unittest.TestCase):
    """Base test case that wires a ``Manager`` to temporary/fake backends."""

    def setUp(self):
        """Override the settings and build the manager used by the tests."""
        # Force tests to use fake configuration
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = 'file'
        service_settings.ARCHIVE_PATH = mkdtemp()
        balkhash_settings.BACKEND = 'LEVELDB'
        balkhash_settings.LEVELDB_PATH = mkdtemp()

        redis_conn = get_fakeredis()
        test_job = Job.create(redis_conn, 'test')
        ingest_stage = Stage(test_job, Stage.INGEST)

        manager = self.manager = Manager(ingest_stage, {})
        manager.entities = []
        # Swap in the emit_entity / queue_entity functions (defined outside
        # this class) as bound methods of this manager instance.
        manager.emit_entity = types.MethodType(emit_entity, manager)
        manager.queue_entity = types.MethodType(queue_entity, manager)

        self.archive = init_archive()
        manager._archive = self.archive

    def fixture(self, fixture_path):
        """Return ``(path, entity)`` for a fixture below ``fixtures/``.

        The manager's entity list and dataset are reset first. For a regular
        file the file is stored and the entity receives its content hash,
        size and name; otherwise the entity id is derived from
        ``fixture_path`` alone.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()

        fixtures_dir = ensure_path(__file__).parent.joinpath('fixtures')
        path = fixtures_dir.joinpath(fixture_path)
        entity = self.manager.make_entity('Document')

        if not path.is_file():
            entity.make_id(fixture_path)
            return path, entity

        content_hash = self.manager.store(path)
        entity.make_id(path.name, content_hash)
        entity.set('contentHash', content_hash)
        entity.set('fileSize', path.stat().st_size)
        entity.set('fileName', path.name)
        return path, entity

    def get_emitted(self, schema=None):
        """List the dataset's entities, optionally only those of ``schema``."""
        emitted = list(self.manager.dataset.iterate())
        if schema is None:
            return emitted
        return [item for item in emitted if item.schema.is_a(schema)]

    def get_emitted_by_id(self, id):
        """Look up a single entity in the manager's dataset by its id."""
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert that the entity's processing status is the success status."""
        status = entity.first('processingStatus')
        self.assertEqual(status, self.manager.STATUS_SUCCESS)
Esempio n. 4
0
 def setUp(self):
     """Override the settings and build the queue-backed test manager."""
     # Force tests to use fake configuration
     service_settings.REDIS_URL = None
     service_settings.ARCHIVE_TYPE = 'file'
     service_settings.ARCHIVE_PATH = mkdtemp()
     balkhash_settings.BACKEND = 'LEVELDB'
     balkhash_settings.LEVELDB_PATH = mkdtemp()

     redis_conn = get_fakeredis()
     self.queue = ServiceQueue(redis_conn, ServiceQueue.OP_INGEST, 'test')

     manager = self.manager = Manager(self.queue, {})
     manager.entities = []
     # Swap in the emit_entity / queue_entity functions (defined outside
     # this method) as bound methods of this manager instance.
     manager.emit_entity = types.MethodType(emit_entity, manager)
     manager.queue_entity = types.MethodType(queue_entity, manager)

     self.archive = init_archive()
     manager._archive = self.archive
Esempio n. 5
0
 def setUp(self):
     """Override the settings and build the dataset-backed test manager."""
     # Force tests to use fake configuration
     ingestors_settings.TESTING = True
     service_settings.REDIS_URL = None
     service_settings.ARCHIVE_TYPE = 'file'
     service_settings.ARCHIVE_PATH = mkdtemp()
     ftmstore_settings.DATABASE_URI = 'sqlite://'

     redis_conn = get_fakeredis()
     test_job = Job.create(redis_conn, 'test')
     ingest_stage = Stage(test_job, OP_INGEST)
     store = get_dataset(test_job.dataset.name, OP_INGEST)

     manager = self.manager = Manager(store, ingest_stage, {})
     manager.entities = []
     # Swap in the emit_entity / queue_entity functions (defined outside
     # this method) as bound methods of this manager instance.
     manager.emit_entity = types.MethodType(emit_entity, manager)
     manager.queue_entity = types.MethodType(queue_entity, manager)

     self.archive = init_archive()
     manager._archive = self.archive
Esempio n. 6
0
 def handle(self, task):
     """Ingest the entity carried by ``task`` and dispatch the next step.

     A ``Manager`` is built from the task's stage and context, the payload
     is turned into an entity proxy and ingested, and the entities the
     manager emitted are handed to ``dispatch_next``.

     Any exception raised during ingestion propagates to the caller; in
     that case ``dispatch_next`` is not called, but the manager is still
     closed.
     """
     manager = Manager(task.stage, task.context)
     try:
         entity = model.get_proxy(task.payload)
         log.debug("Ingest: %r", entity)
         manager.ingest_entity(entity)
     finally:
         # Close the manager even when ingestion fails, so a failing task
         # does not leave it open.
         manager.close()
     self.dispatch_next(task, manager.emitted)
Esempio n. 7
0
 def _analyze(self, task):
     """Feed the dataset's partial entities to analyzers, one per entity id.

     Partials are read for the ``entity_ids`` found in the task payload.
     Consecutive partials with the same id share one ``Analyzer``; when the
     id changes, the current analyzer is flushed and a new one is started.
     Returns the ``entity_ids`` value taken from the payload.
     """
     entity_ids = task.payload.get('entity_ids')
     dataset = Manager.get_dataset(task.stage, task.context)
     current = None
     for partial in dataset.partials(entity_id=entity_ids):
         continues_entity = (
             current is not None and current.entity.id == partial.id
         )
         if not continues_entity:
             # A different entity starts here: finish the previous one.
             if current is not None:
                 current.flush()
             # log.debug("Analyze: %r", entity)
             current = Analyzer(dataset, partial)
         current.feed(partial)
     # Flush whatever analyzer is still pending after the last partial.
     if current is not None:
         current.flush()
     return entity_ids
Esempio n. 8
0
 def _ingest(self, dataset, task):
     """Ingest the entity in ``task.payload`` and return what was emitted.

     The manager is built from ``dataset`` plus the task's stage and
     context, and is closed whether or not ``ingest_entity`` raises.
     """
     ingest_manager = Manager(dataset, task.stage, task.context)
     proxy = model.get_proxy(task.payload)
     log.debug("Ingest: %r", proxy)
     try:
         ingest_manager.ingest_entity(proxy)
     finally:
         # Runs on success and on failure alike.
         ingest_manager.close()
     emitted = ingest_manager.emitted
     return emitted
Esempio n. 9
0
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Esempio n. 10
0
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest.

    :param path: File or directory to ingest; nothing is queued when
        ``ensure_path`` returns ``None`` for it.
    :param dataset: Dataset name passed to the ``ServiceQueue``.
    :param languages: Value stored under ``'languages'`` in the manager
        context.
    """
    context = {'languages': languages}
    conn = get_redis()
    queue = ServiceQueue(conn, ServiceQueue.OP_INGEST, dataset)
    manager = Manager(queue, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity('Document')
                checksum = manager.store(path)
                entity.set('contentHash', checksum)
                # The entity id is derived from the stored content checksum.
                entity.make_id(checksum)
                entity.set('fileName', path.name)
                manager.queue_entity(entity)
            elif path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even if storing or crawling raised.
        manager.close()
Esempio n. 11
0
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {"languages": languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity("Document")
            checksum = manager.store(path)
            entity.set("contentHash", checksum)
            entity.make_id(checksum)
            entity.set("fileName", path.name)
            log.info("Queue: %r", entity.to_dict())
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Esempio n. 12
0
 def handle_task(cls, queue, payload, context):
     """Ingest one queued payload, re-queueing it on any failure.

     ``queue.task_done()`` is called up front. On success the manager is
     closed and ``handle_done`` is invoked. On ``KeyboardInterrupt``,
     ``SystemExit`` or ``RuntimeError`` the task is retried, marked done
     and the exception is re-raised; any other ``Exception`` is retried,
     marked done and logged without propagating.
     """
     def _retry_then_finish():
         # Shared failure path: re-submit the payload, then mark it done.
         cls.handle_retry(queue, payload, context)
         cls.handle_done(queue)

     queue.task_done()
     try:
         ingest_manager = Manager(queue, context)
         proxy = model.get_proxy(payload)
         log.debug("Ingest: %r", proxy)
         ingest_manager.ingest_entity(proxy)
         ingest_manager.close()
         cls.handle_done(queue)
     except (KeyboardInterrupt, SystemExit, RuntimeError):
         _retry_then_finish()
         raise
     except Exception:
         _retry_then_finish()
         log.exception("Processing failed.")