def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1
        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
def __init__(self, dataset, entity, context):
    self.dataset = dataset
    self.ns = Namespace(context.get("namespace", dataset.name))
    self.entity = model.make_entity(entity.schema)
    self.entity.id = entity.id
    self.aggregator_entities = TagAggregatorFasttext()
    self.aggregator_patterns = TagAggregator()
def stream_mapping(infile: Path, outfile: Path, mapping_yaml: Path,
                   sign: bool = True) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))
    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables['entity']
    collection_table = meta.tables['collection']
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            # Rebuild each entity as a proxy, fold the soon-to-be-dropped
            # `name` column into its properties, and re-sign its ID with
            # the collection's namespace before writing it back.
            proxy = model.get_proxy({
                'id': entity.id,
                'schema': entity.schema,
                'properties': entity.data
            }, cleaned=False)
            proxy.add('name', entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)
    op.drop_column('entity', 'foreign_id')
    op.drop_column('entity', 'name')
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            # Key each fragment by query and record index, so re-running
            # the same query overwrites fragments instead of duplicating
            # them.
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)
        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx,
                     records_total or 'streaming', entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)",
             collection.foreign_id, entities_count)
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity."""
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
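# A hypothetical usage sketch for bulk_write() above (not from the source):
# `collection`, BULK_PAGE and the index module are Aleph internals; the item
# is a plain followthemoney entity dict with a caller-supplied ID, which is
# exactly why the docstring calls this mode dangerous.
items = [{
    "id": "acme-inc",  # caller-chosen key; signed by the collection namespace
    "schema": "Company",
    "properties": {"name": ["ACME, Inc."]},
}]
bulk_write(collection, items, merge=True)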
def __init__(self, dataset, stage, context):
    self.dataset = dataset
    self.writer = dataset.bulk()
    self.stage = stage
    self.context = context
    self.ns = Namespace(self.context.get("namespace"))
    self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
    self.emitted = set()
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        for entity in read_entities(infile):
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
def sign(infile: Path, outfile: Path, signature: Optional[str]) -> None:
    ns = Namespace(signature)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                signed = ns.apply(entity)
                write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()
def test_apply(self):
    entity = {
        "id": "banana",
        "schema": "LegalEntity",
        "properties": {"sameAs": ["kumkwat"], "parent": ["pretzel"]},
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == "banana", proxy.id
    ns = Namespace("fruit")
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
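# A minimal sketch (not part of the test suite) of what the assertion above
# relies on: Namespace.apply() returns a copy of the proxy with its ID
# re-signed, so the plain ID "banana" gains the namespace's HMAC suffix.
from followthemoney import model
from followthemoney.namespace import Namespace

ns = Namespace("fruit")
proxy = model.make_entity("LegalEntity")
proxy.id = "banana"
signed = ns.apply(proxy)
assert signed.id == ns.sign("banana")            # e.g. "banana.<hmac>"
assert Namespace.parse(signed.id)[0] == "banana"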
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
def aggregate(infile, outfile):
    buffer = {}
    namespace = Namespace(None)
    try:
        for entity in read_entities(infile):
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        for entity in buffer.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def run_mapping(outfile, mapping_yaml, sign=True):
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def aggregate(infile: Path, outfile: Path) -> None:
    buffer: Dict[str, EntityProxy] = {}
    namespace = Namespace(None)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                entity = namespace.apply(entity)
                if entity.id in buffer:
                    buffer[entity.id].merge(entity)
                else:
                    buffer[entity.id] = entity
            for entity in buffer.values():
                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
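# Note on the aggregate() variants above: Namespace(None) has no key to sign
# with, so applying it effectively strips any existing ".signature" suffix
# from entity IDs. Signed copies of the same entity from different namespaces
# thus collapse onto one buffer key and get merged. A hypothetical
# illustration of that behaviour:
from followthemoney.namespace import Namespace

null_ns = Namespace(None)
a = Namespace("source-a").sign("entity-1")  # "entity-1.<hmac-a>"
b = Namespace("source-b").sign("entity-1")  # "entity-1.<hmac-b>"
assert null_ns.sign(a) == null_ns.sign(b) == "entity-1"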
def bulk_write(collection, iterable, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity."""
    namespace = Namespace(collection.foreign_id)
    stage = get_stage(collection, OP_INDEX, job_id=job_id)
    entities = []
    for item in iterable:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entities.append(entity)
    index_entities(stage, collection, entities)
def delete_aggregator_entity(collection, entity_id):
    aggregator = get_aggregator(collection)
    try:
        # Delete both the namespaced form of the ID and its unsigned base,
        # since fragments may have been written under either.
        entity_id = collection.ns.sign(entity_id)
        aggregator.delete(entity_id=entity_id)
        base_id, _ = Namespace.parse(entity_id)
        aggregator.delete(entity_id=base_id)
    finally:
        aggregator.close()
def add(self, subject, canonical):
    subject, _ = Namespace.parse(get_entity_id(subject))
    canonical, _ = Namespace.parse(get_entity_id(canonical))
    # Don't do no-ops.
    if subject == canonical:
        return
    if subject is None or canonical is None:
        return
    # Fold the subject into the canonical cluster; if the subject already
    # belongs to a cluster, absorb all of its members, then re-point every
    # member at the merged cluster.
    cluster = Cluster(canonical, subject)
    cluster = self.clusters.get(canonical, cluster)
    if subject in self.clusters:
        previous = self.clusters.get(subject)
        cluster.update(previous.entities)
    for entity in cluster.entities:
        self.clusters[entity] = cluster
def by_id(cls, document_id, collection=None):
    # Document IDs are numeric, but may arrive signed with a namespace
    # suffix that has to be stripped first.
    try:
        document_id = int(Namespace.strip(document_id))
    except Exception:
        return
    q = cls.all()
    q = q.filter(cls.id == document_id)
    if collection is not None:
        q = q.filter(cls.collection_id == collection.id)
    return q.first()
def aggregate():
    buffer = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        stdout = click.get_text_stream('stdout')
        for entity in buffer.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
def by_id(cls, document_id, collection_id=None):
    try:
        document_id, _ = Namespace.parse(document_id)
        document_id = int(document_id)
    except Exception:
        return
    q = cls.all()
    q = q.filter(cls.id == document_id)
    if collection_id is not None:
        q = q.filter(cls.collection_id == collection_id)
    return q.first()
def stream_mapping(infile, outfile, signature, mapping_yaml):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)
    try:
        ns = Namespace(signature)
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if signature is not None:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def test_sign(self):
    ns = Namespace('banana')
    x = ns.sign('split')
    assert x.startswith('split'), x
    assert ns.sign(None) is None
    assert x.endswith(ns.signature('split'))
    assert ns.signature(None) is None
def test_sign(self): ns = Namespace("banana") x = ns.sign("split") assert x.startswith("split"), x assert ns.sign(None) is None assert x.endswith(ns.signature("split")) assert ns.signature(None) is None
def test_verify(self): ns = Namespace("banana") x = ns.sign("split") assert Namespace.SEP in x assert ns.verify(x) assert not ns.verify("split") assert not ns.verify(None)
def save(cls, session, subject, candidate, score=None, judgement=None,
         priority=None):
    obj = cls.by_id(session, subject, candidate)
    if obj is None:
        obj = cls()
        obj.id = cls.make_id(subject, candidate)
        obj.subject, _ = Namespace.parse(get_entity_id(subject))
        obj.candidate, _ = Namespace.parse(get_entity_id(candidate))
    priority = priority or DEFAULT_PRIORITY
    if score is not None:
        obj.score = score
        obj.priority = score * priority
    if judgement is not None:
        obj.judgement = judgement
    obj.updated_at = now()
    session.add(obj)
    return obj
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity."""
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
def ns(self):
    # Lazily build and cache the namespace derived from the collection's
    # foreign_id.
    if not hasattr(self, '_ns'):
        self._ns = Namespace(self.foreign_id)
    return self._ns