def debug(path, dataset, languages=None):
    """Debug the ingest for the given path.

    Runs the pipeline against a fake Redis connection: the path is queued
    via ``_ingest_path``, an ``IngestWorker`` is synced over ``STAGES`` and
    every entity found in the dataset afterwards is pretty-printed.

    :param path: location handed to ``_ingest_path``.
    :param dataset: dataset name, used both to open the store and to queue
        the ingest.
    :param languages: optional language hints forwarded to ``_ingest_path``.
    """
    conn = get_fakeredis()
    # NOTE(review): 'sqlite://' is presumably meant as a throw-away
    # in-memory database for the debug run -- confirm against the settings
    # consumer.
    settings.sts.DATABASE_URI = 'sqlite://'
    db = get_dataset(dataset, OP_INGEST)
    try:
        _ingest_path(db, conn, dataset, path, languages=languages)
        worker = IngestWorker(conn=conn, stages=STAGES)
        worker.sync()
        for entity in db.iterate():
            pprint(entity.to_dict())
    finally:
        # Release the dataset even if ingest or printing fails, matching
        # the clean-up the worker performs for its own datasets.
        db.close()
def handle(self, task):
    """Run the operation matching the task's stage and dispatch follow-ups.

    The dataset name is taken from the ``"ftmstore"`` key of the task
    context, falling back to the name of the job's dataset. Tasks whose
    stage is neither ``OP_INGEST`` nor ``OP_ANALYZE`` are ignored. The
    dataset is closed in every case, including when the operation raises.
    """
    store_name = task.context.get("ftmstore", task.job.dataset.name)
    operation = task.stage.stage
    dataset = get_dataset(store_name, operation)
    try:
        if operation == OP_INGEST:
            entity_ids = self._ingest(dataset, task)
        elif operation == OP_ANALYZE:
            entity_ids = self._analyze(dataset, task)
        else:
            # Unknown stage: nothing to run and nothing to dispatch.
            return
        self.dispatch_next(task, entity_ids)
    finally:
        dataset.close()
def analyze(dataset):
    """Run the analyzer over every partial entity stored in ``dataset``.

    Partials are fed to one ``Analyzer`` per entity id: a new analyzer is
    started whenever the id of the incoming partial differs from the id of
    the entity the current analyzer was created with, and the previous
    analyzer is flushed first. The last analyzer is flushed once the
    iteration ends.

    :param dataset: name of the dataset to open with ``OP_ANALYZE``.
    """
    db = get_dataset(dataset, OP_ANALYZE)
    try:
        analyzer = None
        for entity in db.partials():
            if analyzer is None or analyzer.entity.id != entity.id:
                if analyzer is not None:
                    # Finish the previous entity before switching.
                    analyzer.flush()
                analyzer = Analyzer(db, entity, {})
            analyzer.feed(entity)
        if analyzer is not None:
            analyzer.flush()
    finally:
        # Release the dataset even when analysis fails part-way through.
        db.close()
def setUp(self):
    """Prepare a ``Manager`` wired to fake services for each test."""
    # Point every settings module at test-only configuration.
    ingestors_settings.TESTING = True
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = 'file'
    service_settings.ARCHIVE_PATH = mkdtemp()
    ftmstore_settings.DATABASE_URI = 'sqlite://'

    # Build a job and ingest stage on top of the fake Redis connection.
    fake_conn = get_fakeredis()
    test_job = Job.create(fake_conn, 'test')
    ingest_stage = Stage(test_job, OP_INGEST)
    store = get_dataset(test_job.dataset.name, OP_INGEST)

    manager = Manager(store, ingest_stage, {})
    self.manager = manager
    manager.entities = []
    # Replace the emit/queue methods with the module-level test versions,
    # bound to this manager instance.
    manager.emit_entity = types.MethodType(emit_entity, manager)
    manager.queue_entity = types.MethodType(queue_entity, manager)

    self.archive = init_archive()
    manager._archive = self.archive
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest.

    :param path: location handed to ``_ingest_path``.
    :param dataset: dataset name, used both to open the store and to queue
        the ingest.
    :param languages: optional language hints forwarded to ``_ingest_path``.
    """
    conn = get_redis()
    db = get_dataset(dataset, OP_INGEST)
    try:
        _ingest_path(db, conn, dataset, path, languages=languages)
    finally:
        # Release the dataset once queueing is done or has failed.
        db.close()