Example #1
0
def analyze(dataset):
    """Run an ``Analyzer`` over every partial stored in ``dataset``.

    Consecutive partials that share the same ``id`` are fed to a single
    ``Analyzer``; as soon as the id changes, the current analyzer is
    flushed and a new one is created. This grouping relies on
    ``db.partials()`` yielding the partials of one entity back to back
    (assumed from the loop shape; confirm against the store).
    """
    store = get_dataset(dataset, OP_ANALYZE)
    current = None
    for partial in store.partials():
        # A different id closes the group handled by the current analyzer.
        if current is not None and current.entity.id != partial.id:
            current.flush()
            current = None
        if current is None:
            current = Analyzer(store, partial, {})
        current.feed(partial)
    # Flush the final group, if the dataset yielded anything at all.
    if current is not None:
        current.flush()
Example #2
0
 def handle(self, task):
     """Process ``task`` according to its stage.

     The dataset name comes from the ``"ftmstore"`` key of the task
     context, falling back to the job's dataset name. Ingest and analyze
     stages each produce entity ids that are passed on via
     ``dispatch_pipeline``; any other stage is ignored.
     """
     apply_task_context(task, version=__version__)
     store_name = task.context.get("ftmstore", task.job.dataset.name)
     stage = task.stage.stage
     dataset = get_dataset(store_name, stage)
     if stage == OP_INGEST:
         runner = self._ingest
     elif stage == OP_ANALYZE:
         runner = self._analyze
     else:
         # Nothing to do here for stages other than ingest/analyze.
         return
     self.dispatch_pipeline(task, {"entity_ids": runner(dataset, task)})
Example #3
0
def debug(path, languages=None):
    """Ingest ``path`` into a local debug dataset and print the entities.

    Uses a fake redis connection and a SQLite database at
    ``/tmp/debug.sqlite3``. The dataset is deleted before ingesting, the
    ingest worker is run via ``sync()``, and every entity found in the
    dataset afterwards is pretty-printed as a dict.
    """
    name = "debug"
    redis_conn = get_fakeredis()
    settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
    store = get_dataset(
        name,
        origin=OP_INGEST,
        database_uri=settings.fts.DATABASE_URI,
    )
    store.delete()
    _ingest_path(store, redis_conn, name, path, languages=languages)
    IngestWorker(conn=redis_conn, stages=STAGES).sync()
    for proxy in store.iterate():
        pprint(proxy.to_dict())
Example #4
0
 def setUp(self):
     """Prepare a ``Manager`` wired to fake/in-process services.

     Settings are overridden first so that everything created afterwards
     (fake redis, job, dataset, archive) picks up the test configuration.
     """
     # Force tests to use fake configuration
     ingestors_settings.TESTING = True
     service_settings.REDIS_URL = None
     service_settings.ARCHIVE_TYPE = "file"
     service_settings.ARCHIVE_PATH = mkdtemp()
     ftmstore_settings.DATABASE_URI = "sqlite://"

     redis_conn = get_fakeredis()
     test_job = Job.create(redis_conn, "test")
     ingest_stage = Stage(test_job, OP_INGEST)
     store = get_dataset(test_job.dataset.name, origin=OP_INGEST)
     Tags("ingest_cache").delete()

     self.manager = manager = Manager(store, ingest_stage, {})
     manager.entities = []
     # Replace the manager's emit/queue hooks with the module-level
     # functions, bound to this manager instance.
     overrides = (
         ("emit_entity", emit_entity),
         ("queue_entity", queue_entity),
     )
     for attr_name, func in overrides:
         setattr(manager, attr_name, types.MethodType(func, manager))

     self.archive = init_archive()
     manager._archive = self.archive
Example #5
0
def ingest(path, dataset, languages=None):
    """Queue the files found at ``path`` for ingest into ``dataset``.

    Obtains a redis connection and the dataset opened with the ingest
    origin, then hands both to ``_ingest_path`` together with the
    optional ``languages`` hint.
    """
    redis_conn = get_redis()
    store = get_dataset(dataset, OP_INGEST)
    _ingest_path(
        store,
        redis_conn,
        dataset,
        path,
        languages=languages,
    )
Example #6
0
def get_aggregator(collection, origin="aleph"):
    """Return the followthemoney dataset backing ``collection``.

    The dataset name is derived from the collection with
    ``get_aggregator_name``; ``origin`` is forwarded to ``get_dataset``
    unchanged.
    """
    return get_dataset(get_aggregator_name(collection), origin=origin)