def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()

def run_pipeline(
    scope_name: str,
    crawl: bool = True,
    export: bool = True,
    threads: int = settings.THREADS,
) -> None:
    scope = Dataset.require(scope_name)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures: List[Future] = []
        if crawl:
            for source in scope.sources:
                ctx = Context(source)
                futures.append(executor.submit(ctx.crawl))
            _compute_futures(futures)
        if export:
            resolver = get_resolver()
            with engine_tx() as conn:
                resolve_all_canonical(conn, resolver)
            database = Database(scope, resolver, cached=True)
            database.view(scope)
            futures = []
            for dataset_ in scope.datasets:
                futures.append(executor.submit(export_dataset, dataset_, database))
            futures.append(executor.submit(export_metadata))
            _compute_futures(futures)

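# `_compute_futures` is referenced above but not defined in these snippets. A
# minimal sketch, assuming its only job is to block on the submitted futures
# and re-raise the first exception from a worker thread:
from concurrent.futures import Future, as_completed
from typing import List


def _compute_futures(futures: List[Future]) -> None:
    # Wait for every submitted task; future.result() re-raises worker errors.
    for future in as_completed(futures):
        future.result()
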
def export(dataset):
    resolver = get_resolver()
    Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()

def index(dataset):
    resolver = get_resolver()
    # Statement.resolve_all(resolver)
    dataset = Dataset.require(dataset)
    database = Database(dataset, resolver, cached=True)
    loader = database.view(dataset)
    path = get_index_path(dataset)
    path.unlink(missing_ok=True)
    get_index(dataset, loader)

def run(dataset):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    for source in dataset.sources:
        Context(source).crawl()
    Statement.resolve_all(resolver)
    database = Database(dataset, resolver, cached=True)
    for dataset_ in dataset.datasets:
        export_dataset(dataset_, database)
    export_global_index()

def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver)
    DedupeApp.run(
        title="OpenSanction De-duplication",
        # log="textual.log",
        loader=db.view(dataset),
        resolver=resolver,
    )

def dedupe(dataset):
    resolver = get_resolver()
    dataset = Dataset.require(dataset)
    db = Database(dataset, resolver, external=True)
    loader = db.view(dataset)

    async def run_app() -> None:
        app = DedupeApp(
            loader=loader,
            resolver=resolver,
            url_base="https://opensanctions.org/entities/%s/",
            title="OpenSanction De-duplication",
            log="textual.log",
        )  # type: ignore
        await app.process_messages()

    asyncio.run(run_app())

def feed(self, entity: Entity):
    if not entity.target:
        return
    countries = set(entity.get_type_values(registry.country))
    identifiers = set(entity.get_type_values(registry.identifier))
    names = set(entity.get_type_values(registry.name))
    names.discard(entity.caption)
    sanctions = set()
    addresses = set(entity.get("address"))
    for _, adjacent in self.loader.get_adjacent(entity):
        if adjacent.schema.is_a("Sanction"):
            sanctions.add(self.sanction_text(adjacent))
        if adjacent.schema.is_a("Address"):
            addresses.add(adjacent.caption)
        if adjacent.schema.is_a("Identification"):
            identifiers.update(adjacent.get("number"))
            countries.update(adjacent.get("country"))
    datasets: List[str] = []
    for dataset in entity.datasets:
        ds = Dataset.require(dataset)
        datasets.append(ds.title)
    row = [
        entity.id,
        entity.schema.name,
        entity.caption,
        self.concat_values(names),
        self.concat_values(entity.get("birthDate", quiet=True)),
        self.concat_values(countries),
        self.concat_values(addresses),
        self.concat_values(identifiers),
        self.concat_values(sanctions),
        self.concat_values(entity.get_type_values(registry.phone)),
        self.concat_values(entity.get_type_values(registry.email)),
        self.concat_values(datasets),
        entity.first_seen,
        entity.last_seen,
    ]
    self.writer.writerow(row)

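# The row above follows a fixed column order, but the writer and header setup
# are not part of this snippet. A hedged sketch of how the enclosing exporter
# might create its CSV writer; the column names are illustrative assumptions
# inferred from the row order, not the exporter's actual headers:
import csv

ASSUMED_HEADERS = [
    "id", "schema", "name", "aliases", "birth_date", "countries",
    "addresses", "identifiers", "sanctions", "phones", "emails",
    "datasets", "first_seen", "last_seen",
]


def make_writer(fh):
    # Hypothetical helper: create a csv.writer and emit the header row once.
    writer = csv.writer(fh, dialect=csv.unix_dialect)
    writer.writerow(ASSUMED_HEADERS)
    return writer
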
def xref(dataset, limit):
    dataset = Dataset.require(dataset)
    blocking_xref(dataset, limit=limit)

def clear(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).clear()

def build_analytics_(dataset):
    ds = Dataset.require(dataset)
    build_analytics(ds)

def latest(dataset):
    ds = Dataset.require(dataset)
    with engine_tx() as conn:
        latest = max_last_seen(conn, ds)
        if latest is not None:
            print(latest.isoformat())

def xref_int(dataset):
    xref_internal(Dataset.require(dataset))

def crawl(dataset):
    dataset = Dataset.require(dataset)
    for source in dataset.sources:
        Context(source).crawl()

def dump_dataset(dataset, outfile):
    dataset = Dataset.require(dataset)
    resolver = get_resolver()
    loader = Database(dataset, resolver).view(dataset)
    for entity in loader:
        write_object(outfile, entity)

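# `write_object` is used in several snippets here but not defined. A minimal
# sketch, assuming it serializes each entity (or other object) as one JSON
# line to the given file handle:
import json


def write_object(outfile, obj) -> None:
    # Assumption: entity proxies expose to_dict(); fall back to the raw object.
    data = obj.to_dict() if hasattr(obj, "to_dict") else obj
    outfile.write(json.dumps(data, sort_keys=True) + "\n")
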
def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_object(outfile, obj)

def lookup(name, value):
    # We don't want to duplicate the lookup configs in both YAML files,
    # so we're hard-coding that lookups go against the SDN config.
    sdn = Dataset.require("us_ofac_sdn")
    return sdn.lookups.get(name).match(value)

def export_pairs_(dataset, outfile):
    dataset = Dataset.require(dataset)
    for obj in export_pairs(dataset):
        write_json(obj, outfile)

import json
from banal import ensure_list
from functools import lru_cache
from pantomime.types import JSON
from requests.exceptions import TooManyRedirects

from opensanctions.core import Dataset
from opensanctions import helpers as h

FORMATS = ["%d %b %Y", "%d %B %Y", "%Y", "%b %Y", "%B %Y"]
SDN = Dataset.require("us_ofac_sdn")


@lru_cache(maxsize=None)
def deref_url(context, url):
    try:
        res = context.http.get(url, stream=True)
        return res.url
    except TooManyRedirects:
        return url


def parse_result(context, result):
    type_ = result.pop("type", None)
    schema = context.lookup_value("type", type_)
    if schema is None:
        context.log.error("Unknown result type", type=type_)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))

def xref(base, candidates, limit=15):
    base_dataset = Dataset.require(base)
    candidates_dataset = Dataset.require(candidates)
    xref_datasets(base_dataset, candidates_dataset, limit=limit)