def run_mapping(mapping_yaml):
    config = load_mapping_file(mapping_yaml)
    stream = click.get_text_stream('stdout')
    try:
        for dataset, meta in config.items():
            for mapping in keys_values(meta, 'queries', 'query'):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    # Emit each mapped entity to stdout. (The original
                    # called read_entity() here, which reads; writing is
                    # what the surrounding code intends.)
                    write_object(stream, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))

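run_mapping executes every query in the mapping file and writes the resulting entities to stdout. The keys_values() helper it relies on is not part of this listing; a minimal sketch, assuming it only collects the values found under either key into a flat list (the real helper may differ in detail):

def keys_values(data, *keys):
    # Gather values stored under any of the given keys into one list.
    values = []
    for key in keys:
        value = data.get(key)
        if isinstance(value, list):
            values.extend(value)
        elif value is not None:
            values.append(value)
    return values
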
def read_entities(file_name):
    with open(file_name) as fh:
        while True:
            entity = read_entity(fh)
            if entity is None:
                break
            yield entity

def read_entities(file_name):
    now = datetime.utcnow()
    with open(file_name) as fh:
        while True:
            entity = read_entity(fh)
            if entity is None:
                break
            entity.set('indexUpdatedAt', now, quiet=True)
            yield entity

def read_entities(file_name):
    now = datetime.utcnow().isoformat()
    with open(file_name) as fh:
        while True:
            entity = read_entity(fh)
            if entity is None:
                break
            entity.context["updated_at"] = now
            yield entity

def load_entities(entities):
    session = Session()
    try:
        while True:
            entity = read_entity(entities)
            if entity is None:
                break
            Entity.save(session, entities.name, entity)
    except BrokenPipeError:
        pass
    session.commit()

def read_entities(file_name):
    now = datetime.utcnow()
    entities = []
    with open(file_name) as fh:
        while True:
            entity = read_entity(fh)
            if entity is None:
                break
            entity.set('indexUpdatedAt', now, quiet=True)
            entities.append(entity)
    return entities

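Unlike the three generator variants above, this read_entities builds the full list in memory, so it only suits inputs that fit in RAM. A minimal usage sketch of the streaming form (the file name is illustrative):

# Streaming variant: constant memory, one entity at a time.
for entity in read_entities('entities.ijson'):
    print(entity.id)
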
def pretty(infile):
    stdout = click.get_text_stream('stdout')
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            data = json.dumps(entity.to_dict(), indent=2)
            stdout.write(data + '\n')
    except BrokenPipeError:
        raise click.Abort()

def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()

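For context: in followthemoney, Namespace.apply() re-signs an entity's id with an HMAC derived from the namespace, so ids from different sources cannot collide. An illustrative sketch, assuming the standard followthemoney API (the id and namespace name are made up):

from followthemoney import model
from followthemoney.namespace import Namespace

entity = model.make_entity('Person')
entity.id = 'abc123'                  # illustrative id
ns = Namespace('my-dataset')          # illustrative namespace name
signed = ns.apply(entity)
# signed.id now takes the form 'abc123.<signature>'.
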
def sieve(infile, outfile, schema, property, type):
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            entity = sieve_entity(entity, schema, property, type)
            if entity is not None:
                write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()

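sieve_entity() is not part of this listing. A hedged sketch of what it plausibly does, assuming it drops entities of a filtered schema and strips filtered properties and value types; the parameter names here are my own:

def sieve_entity(entity, schemata, properties, types):
    # Discard the entity outright if its schema is filtered.
    if entity.schema.name in schemata:
        return None
    for prop in list(entity.iterprops()):
        # Strip filtered property names and filtered value types.
        if prop.name in properties or prop.type.name in types:
            entity.pop(prop, quiet=True)
    return entity
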
def enrich(infile, outfile, enricher):
    enricher = load_enricher(enricher)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            for match in enricher.enrich_entity_raw(entity):
                write_object(outfile, match)
    except BrokenPipeError:
        raise click.Abort()

def expand(infile, outfile, enricher):
    enricher = load_enricher(enricher)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            for entity in enricher.expand_entity(entity):
                write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()

def export_cypher():
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    exporter = CypherGraphExport(stdout)
    try:
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            exporter.write(entity)
    except BrokenPipeError:
        raise click.Abort()

def stream_mapping(mapping_yaml):
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)
    try:
        for record in StreamSource.read_csv(stdin):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        # Emit each mapped entity. (The original called
                        # read_entity() on stdout here; a write is intended.)
                        write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()

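stream_mapping is the streaming counterpart of run_mapping: rather than executing the mapping's own data queries, it applies the mapping's queries as filters to CSV records arriving on stdin and emits entities on stdout, so it composes with the other stdin/stdout commands in this listing as a pipeline.
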
def export_excel(filename):
    stdin = click.get_text_stream('stdin')
    workbook = get_workbook()
    try:
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            write_entity(workbook, entity)
        # Save once, after all entities have been written.
        workbook.save(filename)
    except BrokenPipeError:
        raise click.Abort()

def match(cls, file_path, entity):
    score = super(FtMIngestor, cls).match(file_path, entity)
    if score < 1:
        return score
    try:
        with open(file_path, "rb") as fh:
            proxy = read_entity(fh, max_line=100 * MEGABYTE)
            if isinstance(proxy, EntityProxy) and proxy.id is not None:
                return cls.SCORE
    except Exception:
        log.exception("Failed to read FtM file: %r", entity)
    return -1

def validate(infile, outfile):
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            clean = model.make_entity(entity.schema)
            clean.id = entity.id
            for (prop, value) in entity.itervalues():
                clean.add(prop, value)
            write_object(outfile, clean)
    except BrokenPipeError:
        raise click.Abort()

def link(infile, outfile, matches):
    try:
        linker = Linker(model)
        for match in Match.from_file(model, matches):
            linker.add(match)
        log.info("Linker: %s clusters.", len(linker.lookup))
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            entity = linker.apply(entity)
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()

def export_csv(outdir):
    stdin = click.get_text_stream('stdin')
    handlers = {}
    try:
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            fh = _get_csv_handler(outdir, entity.schema, handlers)
            write_entity(fh, entity)
    except BrokenPipeError:
        raise click.Abort()
    finally:
        for fh in handlers.values():
            fh.close()

def load_votes(votes):
    session = Session()
    try:
        while True:
            data = read_entity(votes)
            if data is None:
                break
            Vote.save(session, data.get('match_id'),
                      data.get('user'), data.get('judgement'))
    except BrokenPipeError:
        pass
    Match.tally(session)
    session.commit()

def expand(enricher):
    enricher = load_enricher(enricher)
    try:
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            result = enricher.expand_entity(entity)
            write_object(stdout, result)
    except BrokenPipeError:
        raise click.Abort()
    finally:
        enricher.close()

def export_gexf():
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    graph = nx.MultiDiGraph()
    exporter = NXGraphExport(graph)
    try:
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            exporter.write(entity)
    except BrokenPipeError:
        raise click.Abort()
    for line in generate_gexf(graph, prettyprint=False):
        stdout.write(line)

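generate_gexf here is networkx's streaming GEXF serialiser, which yields the XML document line by line; note the whole graph is still held in memory before serialisation. A standalone sketch, assuming plain networkx (node names and the edge label are made up):

import networkx as nx

g = nx.MultiDiGraph()
g.add_edge('a', 'b', label='ownershipOwner')  # illustrative edge
for line in nx.generate_gexf(g, prettyprint=True):
    print(line)
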
def aggregate(infile, outfile):
    buffer = {}
    namespace = Namespace(None)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        for entity in buffer.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()

def aggregate():
    buffer = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        stdout = click.get_text_stream('stdout')
        for entity in buffer.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()

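Both aggregate variants depend on EntityProxy.merge() combining fragments that share an id. An illustrative example of the effect, using the standard followthemoney API (ids and values are made up):

from followthemoney import model

a = model.make_entity('Person')
a.id = 'p1'
a.add('name', 'Jane Doe')

b = model.make_entity('Person')
b.id = 'p1'
b.add('nationality', 'us')

a.merge(b)
# a now carries both the name and the nationality fragment.
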
def apply_recon(recon):
    try:
        linker = EntityLinker()
        for recon in Recon.from_file(recon):
            if recon.judgement == Recon.MATCH:
                linker.add(recon.subject, recon.canonical)
        log.info("Linker: %s clusters.", len(linker.clusters))
        stdin = click.get_text_stream('stdin')
        stdout = click.get_text_stream('stdout')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = NS.apply(entity)
            outgoing = linker.apply(entity)
            if outgoing.id != entity.id:
                outgoing.add('sameAs', entity.id, quiet=True)
            write_object(stdout, outgoing)
    except BrokenPipeError:
        raise click.Abort()