def sieve(infile, outfile, schema, property, type):
    """Filter the entity stream, removing the given schemata, properties or types."""
    try:
        for entity in read_entities(infile):
            sieved = sieve_entity(entity, schema, property, type)
            if sieved is None:
                # The entity was filtered out entirely.
                continue
            write_object(outfile, sieved)
    except BrokenPipeError:
        # Downstream consumer went away; end the command quietly.
        raise click.Abort()
def import_cellebrite(infile, outfile, owner, country):
    """Convert a Cellebrite export into a stream of FtM entities."""
    try:
        converter = CellebriteConverter(infile, owner, country)
        for entity in converter.convert():
            if entity.id is None:
                # Entities without a stable ID cannot be emitted.
                continue
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def enrich(infile, outfile, enricher):
    """Run each streamed entity through the named enricher, writing raw matches.

    The enricher argument is a name resolved via load_enricher; the resulting
    enricher instance replaces it for the rest of the function.
    """
    enricher = load_enricher(enricher)
    try:
        for entity in read_entities(infile):
            for match in enricher.enrich_entity_raw(entity):
                write_object(outfile, match)
    except BrokenPipeError:
        raise click.Abort()
    finally:
        # Release resources (HTTP sessions, caches) held by the enricher,
        # mirroring the stdin-based expand command. NOTE(review): confirm
        # the enricher API in this version provides close().
        enricher.close()
def sign(infile, outfile, signature):
    """Re-sign every entity ID in the stream into the given namespace."""
    ns = Namespace(signature)
    try:
        for entity in read_entities(infile):
            write_object(outfile, ns.apply(entity))
    except BrokenPipeError:
        raise click.Abort()
def expand(infile, outfile, enricher):
    """Expand each streamed entity via the named enricher, writing all results."""
    enricher = load_enricher(enricher)
    try:
        for entity in read_entities(infile):
            for expanded in enricher.expand_entity(entity):
                write_object(outfile, expanded)
    except BrokenPipeError:
        raise click.Abort()
    finally:
        # Release resources held by the enricher, mirroring the stdin-based
        # expand command. NOTE(review): confirm the enricher API in this
        # version provides close().
        enricher.close()
def match_decide(infile, outfile, threshold):
    """Auto-accept undecided matches whose score exceeds the threshold.

    Every match is written back out, decided or not.
    """
    try:
        for match in Match.from_file(model, infile):
            undecided = match.decision is None
            scored = match.score is not None
            if undecided and scored and match.score > threshold:
                match.decision = True
            write_object(outfile, match)
    except BrokenPipeError:
        raise click.Abort()
def validate(infile, outfile):
    """Rebuild each entity through the model, cleaning all property values."""
    try:
        for entity in read_entities(infile, cleaned=False):
            proxy = model.make_entity(entity.schema)
            proxy.id = entity.id
            # Re-adding each value runs the model's cleaning/validation.
            for prop, value in entity.itervalues():
                proxy.add(prop, value)
            write_object(outfile, proxy)
    except BrokenPipeError:
        raise click.Abort()
def expand(infile, outfile, enricher):
    """Expand each entity read from the stream, writing every expansion result."""
    enricher = load_enricher(enricher)
    try:
        entity = read_entity(infile)
        while entity is not None:
            for expanded in enricher.expand_entity(entity):
                write_object(outfile, expanded)
            entity = read_entity(infile)
    except BrokenPipeError:
        raise click.Abort()
def enrich(infile, outfile, enricher):
    """Enrich each entity read from the stream, writing raw match objects."""
    enricher = load_enricher(enricher)
    try:
        entity = read_entity(infile)
        while entity is not None:
            for match in enricher.enrich_entity_raw(entity):
                write_object(outfile, match)
            entity = read_entity(infile)
    except BrokenPipeError:
        raise click.Abort()
def sign(infile, outfile, signature):
    """Re-sign each entity read from the stream into the given namespace."""
    ns = Namespace(signature)
    try:
        entity = read_entity(infile)
        while entity is not None:
            write_object(outfile, ns.apply(entity))
            entity = read_entity(infile)
    except BrokenPipeError:
        raise click.Abort()
def link(infile, outfile, matches):
    """Apply match decisions from a file to the entity stream, merging IDs."""
    try:
        linker = Linker(model)
        for match in Match.from_file(model, matches):
            linker.add(match)
        log.info("Linker: %s clusters.", len(linker.lookup))
        for entity in read_entities(infile):
            write_object(outfile, linker.apply(entity))
    except BrokenPipeError:
        raise click.Abort()
def match_entities(infile, outfile, all):
    """Emit the canonical and matched entities of (decided) matches.

    With all set, undecided and rejected matches are emitted as well.
    """
    try:
        for match in Match.from_file(model, infile):
            if match.decision is not True and not all:
                continue
            # Canonical first, then the matched entity, skipping missing sides.
            for proxy in (match.canonical, match.entity):
                if proxy is not None:
                    write_object(outfile, proxy)
    except BrokenPipeError:
        raise click.Abort()
def run_mapping(outfile, mapping_yaml):
    """Execute every query in a mapping YAML file, writing mapped entities."""
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            for mapping in keys_values(meta, 'queries', 'query'):
                for entity in model.map_entities(mapping, key_prefix=dataset):
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        # Surface mapping errors as a clean CLI failure.
        raise click.ClickException(str(exc))
def make_entities(db_path, outfile):
    """Transform the raw SQLite database into FtM entities and write them out.

    Builds an in-memory balkhash-style store, fills it from the edge, address
    and node tables, then streams the aggregated entities to the output.
    """
    db = dataset.connect("sqlite:///%s" % db_path)
    store = Dataset("temp", database_uri="sqlite://")
    writer = store.bulk()
    write_edges(writer, db)
    write_addresses(writer, db)
    write_nodes(writer, db["entity"], "Company")
    write_nodes(writer, db["intermediary"])
    write_nodes(writer, db["officer"])
    try:
        for entity in store.iterate():
            write_object(outfile, entity)
    except BrokenPipeError:
        # Consistent with the other streaming commands: the downstream
        # consumer closed the pipe, so abort quietly instead of crashing.
        raise click.Abort()
def import_ocds(infile, outfile):
    """Read OCDS JSON lines and convert each record to FtM entities."""
    try:
        while True:
            line = infile.readline()
            if not line:
                # End of input.
                break
            record = json.loads(line)
            for entity in convert_record(record):
                if entity.id is None:
                    continue
                write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def result_entities():
    """Write all entities contained in streamed enrichment results to stdout."""
    try:
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        result = read_result(stdin)
        while result is not None:
            for entity in result.entities:
                write_object(stdout, entity)
            result = read_result(stdin)
    except BrokenPipeError:
        raise click.Abort()
def validate(infile, outfile):
    """Rebuild each streamed entity through the model to clean its values."""
    try:
        entity = read_entity(infile)
        while entity is not None:
            proxy = model.make_entity(entity.schema)
            proxy.id = entity.id
            # Re-adding each value applies the model's cleaning logic.
            for prop, value in entity.itervalues():
                proxy.add(prop, value)
            write_object(outfile, proxy)
            entity = read_entity(infile)
    except BrokenPipeError:
        raise click.Abort()
def auto_match(threshold):
    """Emit a MATCH recon for every streamed result scoring above the threshold."""
    try:
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        while True:
            result = read_result(stdin)
            if result is None:
                break
            # Guard against unscored results: comparing None > threshold
            # raises TypeError on Python 3. (The match_decide command applies
            # the same "score is not None" guard.)
            if result.score is not None and result.score > threshold:
                recon = Recon(result.subject, result.candidate, Recon.MATCH)
                write_object(stdout, recon)
    except BrokenPipeError:
        raise click.Abort()
def import_ocds():
    """Convert OCDS JSON lines from stdin into FtM entities on stdout."""
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    try:
        while True:
            line = stdin.readline()
            if not line:
                # End of input.
                break
            for entity in convert_record(json.loads(line)):
                if entity.id is None:
                    continue
                write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
def run_mapping(outfile, mapping_yaml, sign=True):
    """Execute every query in a mapping YAML file, optionally re-signing IDs."""
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                for entity in model.map_entities(mapping, key_prefix=dataset):
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        # Surface mapping errors as a clean CLI failure.
        raise click.ClickException(str(exc))
def expand(enricher):
    """Expand each entity read from stdin via the enricher, writing to stdout."""
    enricher = load_enricher(enricher)
    try:
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        entity = read_entity(stdin)
        while entity is not None:
            # NOTE(review): the expansion result is written as one object
            # here rather than iterated — confirm expand_entity's return
            # type in this enricher API version.
            write_object(stdout, enricher.expand_entity(entity))
            entity = read_entity(stdin)
    except BrokenPipeError:
        raise click.Abort()
    finally:
        enricher.close()
def aggregate(infile, outfile):
    """Merge all entities in the stream by ID and emit the merged set.

    The whole stream is buffered in memory before anything is written.
    """
    merged = {}
    namespace = Namespace(None)
    try:
        for entity in read_entities(infile):
            entity = namespace.apply(entity)
            existing = merged.get(entity.id)
            if existing is None:
                merged[entity.id] = entity
            else:
                existing.merge(entity)
        for entity in merged.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def export(self):
    """Generate exported files for the dataset."""
    # Ensure close() runs even if any of the export steps fail.
    try:
        self.bind()
        # Line-based FtM JSON artifact; create parent directories as needed.
        ftm_path = self.get_artifact_path("entities.ftm.json")
        ftm_path.parent.mkdir(exist_ok=True, parents=True)
        self.log.info(
            "Writing entities to line-based JSON",
            path=ftm_path,
            entities=len(self.dataset.store),
        )
        with open(ftm_path, "w") as fh:
            for entity in self.dataset.store:
                write_object(fh, entity)
    finally:
        self.close()
def stream_mapping(infile, outfile, mapping_yaml):
    """Map CSV records from a stream through the queries of a mapping file."""
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            sources.append(StreamSource(query, data))
    try:
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if not source.check_filters(record):
                    continue
                for entity in source.query.map(record).values():
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def filter_results(recon):
    """Pass through only results whose candidate was judged a MATCH."""
    try:
        # Collect the subjects of all positive judgements first.
        matches = set()
        for judgement in Recon.from_file(recon):
            if judgement.judgement == Recon.MATCH:
                matches.add(judgement.subject)
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        result = read_result(stdin)
        while result is not None:
            if result.candidate is not None:
                candidate = NS.apply(result.candidate)
                if candidate.id in matches:
                    write_object(stdout, result)
            result = read_result(stdin)
    except BrokenPipeError:
        raise click.Abort()
def aggregate():
    """Merge entities from stdin by ID, writing the merged set to stdout.

    The whole input stream is buffered in memory before anything is written.
    """
    merged = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        entity = read_entity(stdin)
        while entity is not None:
            entity = namespace.apply(entity)
            existing = merged.get(entity.id)
            if existing is None:
                merged[entity.id] = entity
            else:
                existing.merge(entity)
            entity = read_entity(stdin)
        stdout = click.get_text_stream('stdout')
        for entity in merged.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
def apply_recon(recon):
    """Rewrite streamed entity IDs using MATCH judgements from a recon file."""
    try:
        linker = EntityLinker()
        for judgement in Recon.from_file(recon):
            if judgement.judgement == Recon.MATCH:
                linker.add(judgement.subject, judgement.canonical)
        log.info("Linker: %s clusters.", len(linker.clusters))
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        entity = read_entity(stdin)
        while entity is not None:
            entity = NS.apply(entity)
            outgoing = linker.apply(entity)
            if outgoing.id != entity.id:
                # Record the original ID so the link stays traceable.
                outgoing.add('sameAs', entity.id, quiet=True)
            write_object(stdout, outgoing)
            entity = read_entity(stdin)
    except BrokenPipeError:
        raise click.Abort()
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    """Map streamed CSV records through mapping queries, optionally signing IDs.

    The Namespace for each dataset is loop-invariant, so it is built once
    during setup instead of once per record per source (the original
    constructed it inside the record loop).
    """
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        ns = Namespace(dataset)
        for data in keys_values(meta, "queries", "query"):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append((ns, source))
    try:
        for record in StreamSource.read_csv(infile):
            for (ns, source) in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    """Map streamed CSV records through mapping queries, optionally signing IDs.

    The Namespace for each dataset is loop-invariant, so it is built once
    during setup instead of once per record per query (the original
    constructed it inside the record loop).
    """
    queries: List[Tuple[Namespace, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        ns = Namespace(dataset)
        for data in keys_values(meta, "queries", "query"):
            # Records come from the CSV stream, not from a database or URL.
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((ns, query))
    try:
        for record in CSVSource.read_csv(infile):
            for (ns, query) in queries:
                if query.source.check_filters(record):
                    entities = query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def dump_entities(foreign_id, outfile):
    """Export FtM entities for the given collection."""
    collection = get_collection(foreign_id)
    proxies = iter_proxies(collection_id=collection.id, excludes=['text'])
    for proxy in proxies:
        write_object(outfile, proxy)