Example #1
def build_analytics(dataset: Dataset):
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
    db = Database(dataset, resolver)
    loader = db.view(dataset)
    with engine_tx() as conn:
        conn.execute(delete(analytics_dataset_table))
        conn.execute(delete(analytics_country_table))
        conn.execute(delete(analytics_entity_table))

        entities: List[Dict[str, Any]] = []
        members: List[Dict[str, str]] = []
        countries: List[Dict[str, str]] = []
        for idx, entity in enumerate(loader):
            if idx > 0 and idx % 10000 == 0:
                log.info("Denormalised %d entities..." % idx)

            # Use a different name than the outer `dataset` argument to avoid shadowing:
            for ds in Dataset.all():
                if len(entity.datasets.intersection(ds.scope_names)) > 0:
                    members.append({"entity_id": entity.id, "dataset": ds.name})

            if len(members) >= BATCH_SIZE:
                stmt = insert(analytics_dataset_table).values(members)
                conn.execute(stmt)
                members = []

            for country in entity.get_type_values(registry.country):
                countries.append({"entity_id": entity.id, "country": country})

            if len(countries) >= BATCH_SIZE:
                stmt = insert(analytics_country_table).values(countries)
                conn.execute(stmt)
                countries = []

            ent = {
                "id": entity.id,
                "schema": entity.schema.name,
                "caption": entity.caption,
                "target": entity.target,
                "first_seen": entity.first_seen,
                "last_seen": entity.last_seen,
                "properties": entity.properties,
            }
            entities.append(ent)

            if len(entities) >= BATCH_SIZE:
                stmt = insert(analytics_entity_table).values(entities)
                conn.execute(stmt)
                entities = []

        if len(members):
            conn.execute(insert(analytics_dataset_table).values(members))

        if len(countries):
            conn.execute(insert(analytics_country_table).values(countries))

        if len(entities):
            conn.execute(insert(analytics_entity_table).values(entities))
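The example relies on tables and the engine_tx() helper defined elsewhere in the codebase. As a minimal standalone sketch of the same batched-insert pattern, assuming only plain SQLAlchemy and a hypothetical toy table (not part of the original project), it could be written as:

from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, insert

BATCH_SIZE = 10000

metadata = MetaData()
# Hypothetical toy table, standing in for analytics_entity_table:
toy_table = Table(
    "toy_entities",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("caption", String),
)

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)

rows = [{"caption": "Entity %d" % i} for i in range(25000)]
with engine.begin() as conn:  # stands in for engine_tx()
    buffer = []
    for row in rows:
        buffer.append(row)
        if len(buffer) >= BATCH_SIZE:
            conn.execute(insert(toy_table).values(buffer))
            buffer = []
    if buffer:  # flush the final partial batch
        conn.execute(insert(toy_table).values(buffer))

The final flush after the loop mirrors the trailing inserts at the end of build_analytics(); without it, any remainder smaller than BATCH_SIZE would be dropped.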
Example #2
    def clear(self) -> None:
        """Delete all recorded data for a given dataset."""
        with engine_tx() as conn:
            clear_statements(conn, self.dataset)
            clear_issues(conn, self.dataset)
            clear_resources(conn, self.dataset)
        self.cache.clear()
Example #3
def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl is True:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)

        if export is True:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(
                    executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)
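_compute_futures() is not shown in this example. A plausible minimal implementation (an assumption about its behaviour, not the project's actual code) waits on each future and lets any exception raised inside a crawl or export task propagate:

from concurrent.futures import Future
from typing import List


def _compute_futures(futures: List[Future]) -> None:
    # Block until every submitted task finishes; .result() re-raises any
    # exception thrown inside the worker thread.
    for future in futures:
        future.result()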
Example #4
def explode(canonical_id):
    resolver = get_resolver()
    resolved_id = resolver.get_canonical(canonical_id)
    with engine_tx() as conn:
        for entity_id in resolver.explode(resolved_id):
            log.info("Restore separate entity", entity=entity_id)
            resolve_canonical(conn, resolver, entity_id)
    resolver.save()
Example #5
    def enrich(
        self,
        resolver: Resolver,
        entities: Iterable[Entity],
        threshold: Optional[float] = None,
    ):
        """Try to match a set of entities against an external source."""
        self.bind()
        with engine_tx() as conn:
            clear_issues(conn, self.dataset)
            clear_resources(conn, self.dataset)
            clear_statements(conn, self.dataset)
        external = cast(External, self.dataset)
        enricher = external.get_enricher(self.cache)
        try:
            for entity in entities:
                try:
                    for match in enricher.match_wrapped(entity):
                        judgement = resolver.get_judgement(match.id, entity.id)

                        # For unjudged candidates, compute a score and put it in the
                        # xref cache so the user can decide:
                        if judgement == Judgement.NO_JUDGEMENT:
                            if not entity.schema.can_match(match.schema):
                                continue
                            result = compare_scored(entity, match)
                            score = result["score"]
                            if threshold is None or score >= threshold:
                                self.log.info("Match [%s]: %.2f -> %s" %
                                              (entity, score, match))
                                resolver.suggest(
                                    entity.id,
                                    match.id,
                                    score,
                                    user=AUTO_USER,
                                )

                        if judgement != Judgement.POSITIVE:
                            self.emit(match, external=True)

                        # Store previously confirmed matches to the database and make
                        # them visible:
                        if judgement == Judgement.POSITIVE:
                            self.log.info("Enrich [%s]: %r" % (entity, match))
                            for adjacent in enricher.expand_wrapped(
                                    entity, match):
                                if check_person_cutoff(adjacent):
                                    continue
                                # self.log.info("Added", entity=adjacent)
                                self.emit(adjacent)
                except Exception:
                    self.log.exception("Could not match: %r" % entity)
        except KeyboardInterrupt:
            pass
        finally:
            self.flush()
            enricher.close()
            self.close()
Example #6
    def flush(self) -> None:
        """Emitted entities are de-constructed into statements for the database
        to store. These are inserted in batches, so the statement cache on the
        context is flushed to the store here. Statements that have not been
        flushed when a crawl is aborted are not persisted to the database."""
        statements = list(self._statements.values())
        with engine_tx() as conn:
            for i in range(0, len(statements), self.BATCH_SIZE):
                batch = statements[i:i + self.BATCH_SIZE]
                save_statements(conn, batch)
        self._statements = {}
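The slicing loop above is a generic chunking idiom. As an illustrative standalone helper (not part of the original codebase), the same batching could be expressed as a small generator:

from typing import Iterator, List, TypeVar

T = TypeVar("T")


def chunks(items: List[T], size: int) -> Iterator[List[T]]:
    # Yield consecutive slices of at most `size` elements.
    for i in range(0, len(items), size):
        yield items[i:i + size]

flush() could then iterate over chunks(statements, self.BATCH_SIZE) with the same effect.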
Example #7
    def crawl(self) -> None:
        """Run the crawler."""
        self.bind()
        with engine_tx() as conn:
            clear_issues(conn, self.dataset)
        if self.dataset.disabled:
            self.log.info("Source is disabled")
            return
        with engine_tx() as conn:
            clear_resources(conn, self.dataset)
        self.log.info("Begin crawl")
        try:
            # Run the dataset:
            self.dataset.method(self)
            self.flush()
            with engine_tx() as conn:
                cleanup_dataset(conn, self.dataset)
                entities = count_entities(conn, dataset=self.dataset)
                targets = count_entities(conn,
                                         dataset=self.dataset,
                                         target=True)

            self.log.info("Crawl completed",
                          entities=entities,
                          targets=targets)
        except KeyboardInterrupt:
            raise
        except LookupException as exc:
            self.log.error(exc.message,
                           lookup=exc.lookup.name,
                           value=exc.value)
            raise
        except Exception:
            self.log.exception("Crawl failed")
            raise
        finally:
            self.close()
Example #8
    def decide(
        self,
        left_id: StrIdent,
        right_id: StrIdent,
        judgement: Judgement,
        user: Optional[str] = None,
        score: Optional[float] = None,
    ) -> Identifier:
        target = super().decide(left_id,
                                right_id,
                                judgement,
                                user=user,
                                score=score)
        if judgement == Judgement.POSITIVE:
            with engine_tx() as conn:
                resolve_canonical(conn, self, target.id)
        return target
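A hypothetical usage sketch (the identifiers and user name below are made up) shows how recording a positive judgement through this override merges two entities and immediately re-resolves the canonical id inside a transaction:

canonical = resolver.decide(
    "entity-a",  # left_id, illustrative identifier
    "entity-b",  # right_id, illustrative identifier
    Judgement.POSITIVE,
    user="analyst@example.com",
    score=0.93,
)
print(canonical.id)  # the canonical Identifier chosen for the merged pair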
Example #9
def store_log_event(logger, log_method, data: Dict[str, Any]) -> Dict[str, Any]:
    for key, value in data.items():
        if isinstance(value, _Element):
            value = tostring(value, pretty_print=False, encoding=str)
        if isinstance(value, Path):
            value = str(value.relative_to(settings.DATA_PATH))
        if isinstance(value, Schema):
            value = value.name
        data[key] = value

    dataset = data.get("dataset", None)
    level = data.get("level")
    if level is not None:
        level_num = getattr(logging, level.upper())
        if level_num > logging.INFO and dataset is not None:
            with engine_tx() as conn:
                save_issue(conn, data)
    return data
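store_log_event() has the (logger, method_name, event_dict) signature of a structlog processor. One plausible wiring, assumed here rather than taken from the project's actual logging configuration, registers it in the processor chain so that warnings and errors carrying a dataset get persisted through save_issue():

import structlog

structlog.configure(
    processors=[
        structlog.stdlib.add_log_level,  # populates data["level"]
        store_log_event,  # persists qualifying events to the issues table
        structlog.processors.JSONRenderer(),
    ],
)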
Example #10
    def export_resource(self, path, mime_type=None, title=None):
        """Register a file as a documented file exported by the dataset."""
        if mime_type is None:
            mime_type, _ = mimetypes.guess_type(path)

        digest = hashlib.sha1()
        size = 0
        with open(path, "rb") as fh:
            while True:
                chunk = fh.read(65536)
                if not chunk:
                    break
                size += len(chunk)
                digest.update(chunk)
        if size == 0:
            self.log.warning("Resource is empty", path=path)
        checksum = digest.hexdigest()
        name = path.relative_to(self.path).as_posix()
        with engine_tx() as conn:
            return save_resource(conn, name, self.dataset, checksum, mime_type,
                                 size, title)
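The chunked SHA-1 computation above can be lifted into a standalone helper. The following illustrative version (the function name is made up, not part of the original codebase) hashes a file without loading it fully into memory:

import hashlib
from pathlib import Path


def file_sha1(path: Path, chunk_size: int = 65536) -> str:
    # Read and hash the file in fixed-size chunks.
    digest = hashlib.sha1()
    with open(path, "rb") as fh:
        while True:
            chunk = fh.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()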
Example #11
def resolve():
    resolver = get_resolver()
    with engine_tx() as conn:
        resolve_all_canonical(conn, resolver)
Example #12
def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())