Ejemplo n.º 1
0
class TestCase(unittest.TestCase):
    """Base test case that builds an ingest ``Manager`` on throw-away backends.

    ``setUp`` overrides the ingestor, service and ftmstore settings, creates a
    job on the connection returned by ``get_fakeredis()`` and stores the
    resulting ``Manager`` on ``self.manager``. The remaining methods are
    helpers for loading fixtures and inspecting what the manager's dataset
    contains.
    """

    def setUp(self):
        """Point all settings at test backends and build ``self.manager``."""
        # Local import: only this method needs shutil.
        import shutil

        # Force tests to use fake configuration
        ingestors_settings.TESTING = True
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = "file"
        archive_path = mkdtemp()
        # mkdtemp() leaves deletion to the caller; without this every test
        # run would leave a directory behind. Cleanups run after tearDown.
        self.addCleanup(shutil.rmtree, archive_path, ignore_errors=True)
        service_settings.ARCHIVE_PATH = archive_path
        ftmstore_settings.DATABASE_URI = "sqlite://"

        conn = get_fakeredis()
        job = Job.create(conn, "test")
        stage = Stage(job, OP_INGEST)
        dataset = get_dataset(job.dataset.name, OP_INGEST)
        Tags("ingest_cache").delete()

        self.manager = Manager(dataset, stage, {})
        self.manager.entities = []
        # Attach the emit_entity / queue_entity callables to this manager
        # instance as bound methods.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(
            queue_entity, self.manager
        )
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Return a fixture path and a dummy ``Document`` entity for it.

        ``fixture_path`` is resolved relative to the ``fixtures`` directory
        next to this file. Before resolving, the manager's collected entities
        are reset and its dataset is deleted.

        For a regular file the file is stored through the manager, and the
        entity gets an ID derived from the file name and checksum plus the
        ``contentHash``, ``fileSize`` and ``fileName`` properties. For any
        other existing path the ID is derived from ``fixture_path`` alone.

        Raises:
            RuntimeError: if the resolved path does not exist.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        fixtures_dir = ensure_path(__file__).parent.joinpath("fixtures")
        path = fixtures_dir.joinpath(fixture_path)
        entity = self.manager.make_entity("Document")
        if not path.exists():
            raise RuntimeError(path)
        if path.is_file():
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set("contentHash", checksum)
            entity.set("fileSize", path.stat().st_size)
            entity.set("fileName", path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity

    def get_emitted(self, schema=None):
        """Return the entities in the manager's dataset as a list.

        When ``schema`` is given, only entities whose ``schema.is_a(schema)``
        is true are returned.
        """
        entities = list(self.manager.dataset.iterate())
        if schema is not None:
            entities = [e for e in entities if e.schema.is_a(schema)]
        return entities

    def get_emitted_by_id(self, id):
        """Return whatever the manager's dataset yields for ``id``."""
        # The parameter name shadows the builtin ``id``; it is kept so that
        # keyword callers keep working.
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert that ``entity`` has the manager's success processing status."""
        self.assertEqual(
            entity.first("processingStatus"), self.manager.STATUS_SUCCESS
        )
Ejemplo n.º 2
0
class TestCase(unittest.TestCase):
    """Base test case that builds an ingest ``Manager`` on throw-away backends.

    ``setUp`` overrides the service and balkhash settings, creates a job on
    the connection returned by ``get_fakeredis()`` and stores the resulting
    ``Manager`` on ``self.manager``. The remaining methods are helpers for
    loading fixtures and inspecting what the manager's dataset contains.
    """

    def setUp(self):
        """Point all settings at test backends and build ``self.manager``."""
        # Local import: only this method needs shutil.
        import shutil

        # Force tests to use fake configuration
        service_settings.REDIS_URL = None
        service_settings.ARCHIVE_TYPE = 'file'
        archive_path = mkdtemp()
        leveldb_path = mkdtemp()
        # mkdtemp() leaves deletion to the caller; without these cleanups
        # every test would leave two directories behind. Cleanups run after
        # tearDown, so the directories stay available for the whole test.
        self.addCleanup(shutil.rmtree, archive_path, ignore_errors=True)
        self.addCleanup(shutil.rmtree, leveldb_path, ignore_errors=True)
        service_settings.ARCHIVE_PATH = archive_path
        balkhash_settings.BACKEND = 'LEVELDB'
        balkhash_settings.LEVELDB_PATH = leveldb_path

        conn = get_fakeredis()
        job = Job.create(conn, 'test')
        stage = Stage(job, Stage.INGEST)

        self.manager = Manager(stage, {})
        self.manager.entities = []
        # Attach the emit_entity / queue_entity callables to this manager
        # instance as bound methods.
        self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
        self.manager.queue_entity = types.MethodType(
            queue_entity, self.manager
        )
        self.archive = init_archive()
        self.manager._archive = self.archive

    def fixture(self, fixture_path):
        """Return a fixture path and a dummy ``Document`` entity for it.

        ``fixture_path`` is resolved relative to the ``fixtures`` directory
        next to this file. Before resolving, the manager's collected entities
        are reset and its dataset is deleted.

        For a regular file the file is stored through the manager, and the
        entity gets an ID derived from the file name and checksum plus the
        ``contentHash``, ``fileSize`` and ``fileName`` properties. For
        anything else (including a path that does not exist) the ID is
        derived from ``fixture_path`` alone.
        """
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        fixtures_dir = ensure_path(__file__).parent.joinpath('fixtures')
        path = fixtures_dir.joinpath(fixture_path)
        entity = self.manager.make_entity('Document')
        if path.is_file():
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set('contentHash', checksum)
            entity.set('fileSize', path.stat().st_size)
            entity.set('fileName', path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity

    def get_emitted(self, schema=None):
        """Return the entities in the manager's dataset as a list.

        When ``schema`` is given, only entities whose ``schema.is_a(schema)``
        is true are returned.
        """
        entities = list(self.manager.dataset.iterate())
        if schema is not None:
            entities = [e for e in entities if e.schema.is_a(schema)]
        return entities

    def get_emitted_by_id(self, id):
        """Return whatever the manager's dataset yields for ``id``."""
        # The parameter name shadows the builtin ``id``; it is kept so that
        # keyword callers keep working.
        return self.manager.dataset.get(id)

    def assertSuccess(self, entity):
        """Assert that ``entity`` has the manager's success processing status."""
        self.assertEqual(
            entity.first('processingStatus'), self.manager.STATUS_SUCCESS
        )
Ejemplo n.º 3
0
def _ingest_path(db, conn, dataset, path, languages=None):
    """Ingest the file or directory at ``path`` into ``dataset``.

    A job is created for ``dataset`` on ``conn``; its ``OP_INGEST`` stage is
    handed to a ``Manager`` together with ``db`` and a context holding the
    language hints. A regular file is stored through the manager, wrapped in
    a ``Document`` entity (``contentHash``, ID derived from the checksum,
    ``fileName``) and queued. A directory is crawled with
    ``DirectoryIngestor.crawl``. Nothing is ingested when ``ensure_path``
    returns ``None``.

    Args:
        db: passed through to ``Manager`` unchanged.
        conn: connection on which the job is created.
        dataset: dataset identifier given to ``Job.create``.
        path: location to ingest; normalised with ``ensure_path``.
        languages: language hints stored under ``context['languages']``.
            ``None`` (the default) is treated as an empty list.
    """
    # A literal ``[]`` default is a single list shared by every call; since
    # it is handed to the manager via the context, a mutation there would
    # leak into later calls. Build a fresh list per call instead.
    if languages is None:
        languages = []
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity('Document')
                checksum = manager.store(path)
                entity.set('contentHash', checksum)
                entity.make_id(checksum)
                entity.set('fileName', path.name)
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even if storing, queueing or crawling raised.
        manager.close()
Ejemplo n.º 4
0
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest.

    A ``ServiceQueue`` for the ``OP_INGEST`` operation on ``dataset`` is
    created on the connection returned by ``get_redis()`` and handed to a
    ``Manager``. A regular file is stored through the manager, wrapped in a
    ``Document`` entity (``contentHash``, ID derived from the checksum,
    ``fileName``) and queued. A directory is crawled with
    ``DirectoryIngestor.crawl``. Nothing is queued when ``ensure_path``
    returns ``None``.

    Args:
        path: location to ingest; normalised with ``ensure_path``.
        dataset: dataset identifier given to ``ServiceQueue``.
        languages: language hints stored under ``context['languages']``;
            passed through as-is, including ``None``.
    """
    context = {'languages': languages}
    conn = get_redis()
    queue = ServiceQueue(conn, ServiceQueue.OP_INGEST, dataset)
    manager = Manager(queue, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity('Document')
                checksum = manager.store(path)
                entity.set('contentHash', checksum)
                entity.make_id(checksum)
                entity.set('fileName', path.name)
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even if storing, queueing or crawling raised.
        manager.close()
Ejemplo n.º 5
0
def _ingest_path(db, conn, dataset, path, languages=None):
    """Ingest the file or directory at ``path`` into ``dataset``.

    A job is created for ``dataset`` on ``conn``; its ``OP_INGEST`` stage is
    handed to a ``Manager`` together with ``db`` and a context holding the
    language hints. A regular file is stored through the manager, wrapped in
    a ``Document`` entity (``contentHash``, ID derived from the checksum,
    ``fileName``), logged and queued. A directory is crawled with
    ``DirectoryIngestor.crawl``. Nothing is ingested when ``ensure_path``
    returns ``None``.

    Args:
        db: passed through to ``Manager`` unchanged.
        conn: connection on which the job is created.
        dataset: dataset identifier given to ``Job.create``.
        path: location to ingest; normalised with ``ensure_path``.
        languages: language hints stored under ``context["languages"]``.
            ``None`` (the default) is treated as an empty list.
    """
    # A literal ``[]`` default is a single list shared by every call; since
    # it is handed to the manager via the context, a mutation there would
    # leak into later calls. Build a fresh list per call instead.
    if languages is None:
        languages = []
    context = {"languages": languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    try:
        path = ensure_path(path)
        if path is not None:
            if path.is_file():
                entity = manager.make_entity("Document")
                checksum = manager.store(path)
                entity.set("contentHash", checksum)
                entity.make_id(checksum)
                entity.set("fileName", path.name)
                log.info("Queue: %r", entity.to_dict())
                manager.queue_entity(entity)
            if path.is_dir():
                DirectoryIngestor.crawl(manager, path)
    finally:
        # Close the manager even if storing, queueing or crawling raised.
        manager.close()