Beispiel #1
0
 def crawl(self) -> None:
     """Run the crawler.

     Calls ``Issue.clear`` and ``Resource.clear`` for the dataset, commits,
     invokes the dataset's ``method`` with this context, flushes the cached
     statements and runs ``Statement.cleanup_dataset``.

     A ``KeyboardInterrupt`` rolls back the session and is re-raised. A
     ``LookupException`` or any other ``Exception`` rolls back the session
     and is logged, but is not propagated to the caller. ``self.close()`` is
     called in every case.
     """
     try:
         self.bind()
         # Reset the issues and resources stored for this dataset and commit
         # that before the crawl itself starts.
         Issue.clear(self.dataset)
         Resource.clear(self.dataset)
         db.session.commit()
         self.log.info("Begin crawl")
         # Run the dataset:
         self.dataset.method(self)
         # Write whatever is still in the statement cache.
         self.flush()
         Statement.cleanup_dataset(self.dataset)
         self.log.info(
             "Crawl completed",
             entities=Statement.all_counts(dataset=self.dataset),
             targets=Statement.all_counts(dataset=self.dataset, target=True),
         )
     except KeyboardInterrupt:
         # User abort: undo the open transaction and let the interrupt through.
         db.session.rollback()
         raise
     except LookupException as exc:
         # Failed lookup: log the lookup name and value, without a traceback.
         db.session.rollback()
         self.log.error(exc.message, lookup=exc.lookup.name, value=exc.value)
     except Exception:
         # Any other failure is logged with its traceback and swallowed.
         db.session.rollback()
         self.log.exception("Crawl failed")
     finally:
         self.close()
Beispiel #2
0
 def flush(self) -> None:
     """Write the cached statements to the database and reset the cache.

     Emitted entities are broken down into statements, which are collected
     on the context and stored in batches. Statements that are still in the
     cache when a crawl is aborted are never persisted.
     """
     self.log.debug("Flushing statements to database...")
     pending = list(self._statements.values())
     Statement.upsert_many(pending)
     # Start over with an empty cache for the next batch.
     self._statements = {}
Beispiel #3
0
 def query(self,
           dataset: Dataset,
           entity_id=None,
           inverted_id=None) -> Generator[CachedEntity, None, None]:
     """Stream cached entities built from the dataset's statements.

     ``entity_id`` is mapped to its canonical ID and ``inverted_id`` to its
     referents through the resolver before both are passed as filters to
     ``Statement.all_statements``. Consecutive statements sharing a canonical
     ID are grouped; each group holding at least one ``Statement.BASE``
     statement is yielded as a ``(types, props)`` pair of tuples.
     """
     canonical_id = (None if entity_id is None
                     else self.resolver.get_canonical(entity_id))
     inverted_ids = (None if inverted_id is None
                     else self.resolver.get_referents(inverted_id))
     statements = Statement.all_statements(
         dataset=dataset,
         canonical_id=canonical_id,
         inverted_ids=inverted_ids,
     )
     group_id = None
     types: List[CachedType] = []
     props: List[CachedProp] = []
     for stmt in statements:
         if stmt.canonical_id != group_id:
             # A new group starts: hand out the finished one if it has a type.
             if types:
                 yield (tuple(types), tuple(props))
             types, props = [], []
         group_id = stmt.canonical_id
         if stmt.prop == Statement.BASE:
             types.append(CachedType(stmt))
         else:
             props.append(CachedProp(stmt))
     # Emit the trailing group once the statements are exhausted.
     if types:
         yield (tuple(types), tuple(props))
Beispiel #4
0
 def decide(
     self,
     left_id: StrIdent,
     right_id: StrIdent,
     judgement: Judgement,
     user: Optional[str] = None,
     score: Optional[float] = None,
 ) -> Identifier:
     """Record a judgement through the parent class and return its target.

     For a positive judgement, ``Statement.resolve`` is additionally called
     with this resolver and the ID of the returned target.
     """
     target = super().decide(
         left_id, right_id, judgement, user=user, score=score
     )
     if judgement != Judgement.POSITIVE:
         return target
     Statement.resolve(self, target.id)
     return target
Beispiel #5
0
def export_global_index():
    """Export the global index for all datasets.

    Writes two JSON files below ``settings.DATASET_PATH``: ``issues.json``
    with all issues, and ``index.json`` with the per-dataset index entries
    plus global metadata.
    """
    datasets = [dataset.to_index() for dataset in Dataset.all()]

    issues_path = settings.DATASET_PATH.joinpath("issues.json")
    log.info("Writing global issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as issues_fh:
        write_json({"issues": Issue.query().all()}, issues_fh)

    index_path = settings.DATASET_PATH.joinpath("index.json")
    log.info("Writing global index", datasets=len(datasets), path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as index_fh:
        write_json(
            {
                "datasets": datasets,
                "run_time": settings.RUN_TIME,
                "dataset_url": settings.DATASET_URL,
                "issues_url": urljoin(settings.DATASET_URL, "issues.json"),
                "model": model,
                "schemata": Statement.all_schemata(),
                "app": "opensanctions",
                "version": settings.VERSION,
            },
            index_fh,
        )
Beispiel #6
0
def get_schemata(dataset: Dataset) -> List[Schema]:
    """Return the model schemata for the names found in the dataset.

    Names returned by ``Statement.all_schemata`` that the model does not
    know (``model.get`` returns ``None``) are left out.
    """
    looked_up = (
        model.get(name) for name in Statement.all_schemata(dataset=dataset)
    )
    return [schema for schema in looked_up if schema is not None]
Beispiel #7
0
 def check_candidate(self, left: Identifier, right: Identifier) -> bool:
     """Tell whether ``left`` and ``right`` may be proposed as a candidate.

     The parent class is asked first. If it accepts the pair, the IDs
     connected to each side are checked with ``Statement.unique_conflict``;
     on a conflict a negative judgement is recorded and the pair is
     rejected.
     """
     if super().check_candidate(left, right):
         left_ids = [node.id for node in self.connected(left)]
         right_ids = [node.id for node in self.connected(right)]
         if not Statement.unique_conflict(left_ids, right_ids):
             return True
         # Persist the conflict as a decision before rejecting the pair.
         self.decide(left, right, Judgement.NEGATIVE, user="******")
     return False
Beispiel #8
0
 def get_target_countries(self) -> List[Dict[str, Any]]:
     """Return one dict per row of ``Statement.agg_target_by_country``.

     Each dict holds the country ``code``, its ``count`` and the ``label``
     given by ``registry.country.caption`` for that code.
     """
     return [
         {
             "code": code,
             "count": count,
             "label": registry.country.caption(code),
         }
         for code, count in Statement.agg_target_by_country(dataset=self)
     ]
Beispiel #9
0
    def to_index(self) -> Dict[str, Any]:
        """Build the index entry for this dataset.

        Starts from ``self.to_dict()`` and adds the public URLs, issue
        statistics, target count, change and export times, target
        aggregations and the list of resources with their public URLs.
        """
        meta = self.to_dict()
        index_url = self.make_public_url("index.json")
        issues_url = self.make_public_url("issues.json")
        issue_levels = Issue.agg_by_level(dataset=self)
        meta.update(
            {
                "index_url": index_url,
                "issues_url": issues_url,
                "issue_levels": issue_levels,
                "issue_count": sum(issue_levels.values()),
                "target_count": Statement.all_counts(dataset=self, target=True),
                "last_change": Statement.max_last_seen(dataset=self),
                "last_export": settings.RUN_TIME,
                "targets": {
                    "countries": self.get_target_countries(),
                    "schemata": self.get_target_schemata(),
                },
            }
        )

        resources = []
        for resource in Resource.query(dataset=self):
            entry = resource.to_dict()
            entry["url"] = self.make_public_url(resource.path)
            resources.append(entry)
        meta["resources"] = resources
        return meta
Beispiel #10
0
def export_pairs(dataset: Dataset):
    """Yield judged pairs of nested entities for the given dataset.

    Positive pairs are formed between all ``(entity ID, dataset)`` parts that
    share a canonical ID in the resolver. Negative pairs are formed between
    the parts of a canonical ID and the parts found on the other end of each
    of its negative edges. Pairs for which either entity cannot be loaded
    are skipped.

    Yields:
        Dicts with the keys ``left`` and ``right`` (nested entity dicts) and
        ``judgement``.
    """
    resolver = get_resolver()
    db = Database(dataset, resolver, cached=True)
    # Maps an entity ID to the set of Dataset objects it was found in.
    datasets: Dict[str, set] = defaultdict(set)
    for entity_id, ds in Statement.entities_datasets(dataset):
        dsa = Dataset.get(ds)
        if dsa is not None:
            datasets[entity_id].add(dsa)

    def get_parts(ident):
        # All (referent ID, dataset) combinations behind the given ID.
        canonical_id = resolver.get_canonical(ident)
        for ref in resolver.get_referents(canonical_id):
            # .get() avoids creating empty defaultdict entries for unknown IDs.
            for ds in datasets.get(ref, []):
                yield ref, ds

    pairs: Dict[Tuple[Tuple[str, Dataset], Tuple[str, Dataset]],
                Judgement] = {}
    for canonical_id in resolver.canonicals():
        parts = list(get_parts(canonical_id))
        for left, right in combinations(parts, 2):
            # Order each pair so that (a, b) and (b, a) share one key.
            pairs[(max(left, right), min(left, right))] = Judgement.POSITIVE
        for edge in resolver.nodes[canonical_id]:
            if edge.judgement == Judgement.NEGATIVE:
                source_canonical = resolver.get_canonical(edge.source)
                other = edge.target if source_canonical == canonical_id else edge.source
                for other_part in get_parts(other):
                    for part in parts:
                        # Build the ordered key from fresh names. Rebinding
                        # `other_part` here would leak into the following
                        # inner iterations and could mark two parts of the
                        # same cluster as NEGATIVE.
                        upper = max(part, other_part)
                        lower = min(part, other_part)
                        pairs[(upper, lower)] = Judgement.NEGATIVE

    def get_partial(spec):
        # Load the nested dict of the canonical entity as seen in one dataset.
        part_id, ds = spec
        loader = db.view(ds)
        canonical = resolver.get_canonical(part_id)
        entity = loader.get_entity(canonical)
        if entity is None:
            return None
        return entity.to_nested_dict(loader)

    for (left, right), judgement in pairs.items():
        left_entity = get_partial(left)
        right_entity = get_partial(right)
        if left_entity is not None and right_entity is not None:
            yield {
                "left": left_entity,
                "right": right_entity,
                "judgement": judgement
            }
Beispiel #11
0
 def get_target_schemata(self) -> List[Dict[str, Any]]:
     """Return one dict per row of ``Statement.agg_target_by_schema``.

     Rows whose schema name is unknown to the model are skipped. Each dict
     holds the schema ``name``, its ``count`` and the schema's ``label`` and
     ``plural``.
     """
     schemata = []
     for name, count in Statement.agg_target_by_schema(dataset=self):
         schema = model.get(name)
         if schema is not None:
             schemata.append(
                 {
                     "name": name,
                     "count": count,
                     "label": schema.label,
                     "plural": schema.plural,
                 }
             )
     return schemata
Beispiel #12
0
 def emit(self, entity: Entity, target: Optional[bool] = None, unique: bool = False):
     """Send an FtM entity to the store.

     The entity is turned into statements with ``Statement.from_entity``.
     They are cached on the context under ``(entity_id, prop, value)``, so a
     repeated statement replaces the earlier one. The cache is flushed once
     it holds at least ``db.batch_size`` statements.

     Args:
         entity: the entity to store; it must have an ID.
         target: if not ``None``, assigned to ``entity.target`` first.
         unique: passed through to ``Statement.from_entity``.

     Raises:
         ValueError: if the entity has no ID or yields no statements.
     """
     if entity.id is None:
         # Exceptions do not %-format their arguments, so build the message
         # explicitly instead of passing the entity as a second argument.
         raise ValueError(f"Entity has no ID: {entity!r}")
     if target is not None:
         entity.target = target
     statements = Statement.from_entity(
         entity, self.dataset, self.resolver, unique=unique
     )
     if not statements:
         raise ValueError(f"Entity has no properties: {entity!r}")
     for stmt in statements:
         key = (stmt["entity_id"], stmt["prop"], stmt["value"])
         self._statements[key] = stmt
     if len(self._statements) >= db.batch_size:
         self.flush()
     self.log.debug("Emitted", entity=entity)
Beispiel #13
0
 def __len__(self) -> int:
     """Return the ``count()`` of ``Statement.all_ids`` for the dataset."""
     id_query = Statement.all_ids(self.dataset)
     return id_query.count()
Beispiel #14
0
 def clear(self) -> None:
     """Delete the issues and statements recorded for the dataset, then commit."""
     for table in (Issue, Statement):
         table.clear(self.dataset)
     db.session.commit()