def analyze(dataset):
    """Run analysis over every partial entity in *dataset*.

    Partials for the same entity arrive consecutively, so a single
    Analyzer is reused until the entity id changes, then flushed.
    """
    db = get_dataset(dataset, OP_ANALYZE)
    current = None
    for fragment in db.partials():
        if current is not None and current.entity.id == fragment.id:
            # Same entity as before: keep feeding the open analyzer.
            current.feed(fragment)
            continue
        # Entity changed (or first iteration): close out the previous
        # analyzer before starting a fresh one.
        if current is not None:
            current.flush()
        current = Analyzer(db, fragment, {})
        current.feed(fragment)
    if current is not None:
        current.flush()
def handle(self, task):
    """Process one pipeline task and forward its results downstream.

    Resolves the ftmstore dataset named in the task context (falling
    back to the job's dataset name), runs the stage-specific handler
    (ingest or analyze), and dispatches the produced entity ids to the
    next pipeline stage.

    Fix: the payload construction and ``dispatch_pipeline`` call were
    duplicated in both branches; they are now shared.
    """
    apply_task_context(task, version=__version__)
    name = task.context.get("ftmstore", task.job.dataset.name)
    dataset = get_dataset(name, task.stage.stage)
    stage = task.stage.stage
    if stage == OP_INGEST:
        entity_ids = self._ingest(dataset, task)
    elif stage == OP_ANALYZE:
        entity_ids = self._analyze(dataset, task)
    else:
        # Unknown stage: nothing to do (matches the original no-op).
        return
    # Both stages forward the produced entity ids identically.
    self.dispatch_pipeline(task, {"entity_ids": entity_ids})
def debug(path, languages=None):
    """Debug the ingest for the given path."""
    fake_conn = get_fakeredis()
    # Point the store at a throwaway on-disk SQLite database.
    settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
    store = get_dataset(
        "debug", origin=OP_INGEST, database_uri=settings.fts.DATABASE_URI
    )
    # Start from a clean slate, then ingest and drain the queue.
    store.delete()
    _ingest_path(store, fake_conn, "debug", path, languages=languages)
    IngestWorker(conn=fake_conn, stages=STAGES).sync()
    # Dump every resulting entity for inspection.
    for result in store.iterate():
        pprint(result.to_dict())
def setUp(self):
    """Build an isolated Manager wired to fake/in-memory services."""
    # Route everything through throwaway configuration so tests never
    # touch real infrastructure (Redis, archive storage, ftm store).
    ingestors_settings.TESTING = True
    service_settings.REDIS_URL = None
    service_settings.ARCHIVE_TYPE = "file"
    service_settings.ARCHIVE_PATH = mkdtemp()
    ftmstore_settings.DATABASE_URI = "sqlite://"
    fake_redis = get_fakeredis()
    test_job = Job.create(fake_redis, "test")
    ingest_stage = Stage(test_job, OP_INGEST)
    store = get_dataset(test_job.dataset.name, origin=OP_INGEST)
    # Drop any cached ingest results left over from earlier runs.
    Tags("ingest_cache").delete()
    self.manager = Manager(store, ingest_stage, {})
    self.manager.entities = []
    # Patch emit/queue so entities are captured in-process instead of
    # being written out or queued for real.
    self.manager.emit_entity = types.MethodType(emit_entity, self.manager)
    self.manager.queue_entity = types.MethodType(queue_entity, self.manager)
    self.archive = init_archive()
    self.manager._archive = self.archive
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    redis_conn = get_redis()
    store = get_dataset(dataset, OP_INGEST)
    _ingest_path(store, redis_conn, dataset, path, languages=languages)
def get_aggregator(collection, origin="aleph"):
    """Connect to a followthemoney dataset."""
    return get_dataset(get_aggregator_name(collection), origin=origin)