Example 1
def bulk_load_query(collection_id, query):
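    # Map each record from the query source into entities, sign their IDs
    # with the collection's namespace, and index them in pages of BULK_PAGE.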
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
Example 2
def __init__(self, dataset, entity, context):
    self.dataset = dataset
    # Fall back to the dataset name when the context does not supply an
    # explicit namespace.
    self.ns = Namespace(context.get("namespace", dataset.name))
    self.entity = model.make_entity(entity.schema)
    self.entity.id = entity.id
    self.aggregator_entities = TagAggregatorFasttext()
    self.aggregator_patterns = TagAggregator()
Example 3
def stream_mapping(infile: Path,
                   outfile: Path,
                   mapping_yaml: Path,
                   sign: bool = True) -> None:
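    # Compile every query in the mapping file up front (against a dummy CSV
    # source); each record from the input stream is then offered to every
    # compiled query whose filters match.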
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
Example 4
def upgrade():
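    # Migration: rewrite every entity ID to its namespace-signed form and
    # fold the legacy name column into the entity properties before dropping
    # the old columns.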
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables['entity']
    collection_table = meta.tables['collection']
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy(
                {
                    'id': entity.id,
                    'schema': entity.schema,
                    'properties': entity.data
                },
                cleaned=False)
            proxy.add('name', entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)

    op.drop_column('entity', 'foreign_id')
    op.drop_column('entity', 'name')
Example 5
def bulk_load_query(queue, collection, query_id, query):
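    # Map records into entities, sign their IDs with the collection's
    # namespace, and write them to the aggregator as one fragment per record.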
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)

        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx, records_total or 'streaming',
                     entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)", collection.foreign_id,
             entities_count)
Example 6
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
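    # Entities are signed with the collection's namespace and de-duplicated
    # by ID before being indexed in pages of BULK_PAGE.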
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example 7
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables["entity"]
    collection_table = meta.tables["collection"]
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy(
                {
                    "id": entity.id,
                    "schema": entity.schema,
                    "properties": entity.data
                },
                cleaned=False,
            )
            proxy.add("name", entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)

    op.drop_column("entity", "foreign_id")
    op.drop_column("entity", "name")
Example 8
def __init__(self, dataset, stage, context):
    self.dataset = dataset
    self.writer = dataset.bulk()
    self.stage = stage
    self.context = context
    self.ns = Namespace(self.context.get("namespace"))
    self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
    self.emitted = set()
Example 9
def sign(infile, outfile, signature):
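    # Sign every entity read from the input stream with the given namespace.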
    ns = Namespace(signature)
    try:
        for entity in read_entities(infile):
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
Example 10
def sign(infile: Path, outfile: Path, signature: Optional[str]) -> None:
    ns = Namespace(signature)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                signed = ns.apply(entity)
                write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()
Example 11
def test_apply(self):
    entity = {
        "id": "banana",
        "schema": "LegalEntity",
        "properties": {"sameAs": ["kumkwat"], "parent": ["pretzel"]},
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == "banana", proxy.id
    ns = Namespace("fruit")
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
Example 12
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
Example 13
def test_apply(self):
    entity = {
        'id': 'banana',
        'schema': 'LegalEntity',
        'properties': {
            'sameAs': ['kumkwat'],
            'parent': ['pretzel']
        }
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == 'banana', proxy.id
    ns = Namespace('fruit')
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
Example 14
def aggregate(infile, outfile):
    buffer = {}
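    # Namespace(None) carries no signing key: applying it strips any existing
    # signature, so signed and unsigned copies merge under the plain ID.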
    namespace = Namespace(None)
    try:
        for entity in read_entities(infile):
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity

        for entity in buffer.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example 15
def run_mapping(outfile, mapping_yaml, sign=True):
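    # Execute each dataset's queries end-to-end, optionally signing the
    # resulting entities with a namespace derived from the dataset name.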
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example 16
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example 17
def aggregate(infile: Path, outfile: Path) -> None:
    buffer: Dict[str, EntityProxy] = {}
    namespace = Namespace(None)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                entity = namespace.apply(entity)
                if entity.id in buffer:
                    buffer[entity.id].merge(entity)
                else:
                    buffer[entity.id] = entity

            for entity in buffer.values():
                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
Example 18
def bulk_write(collection, iterable, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
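    # Unlike the paged bulk_write above, this variant signs and collects all
    # entities first, then hands them to the indexing stage in one call.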
    stage = get_stage(collection, OP_INDEX, job_id=job_id)
    entities = []
    for item in iterable:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entities.append(entity)
    index_entities(stage, collection, entities)
Example 19
def delete_aggregator_entity(collection, entity_id):
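    # Delete both the signed form of the entity ID and its namespace-stripped
    # base, so copies stored under either form are removed.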
    aggregator = get_aggregator(collection)
    try:
        entity_id = collection.ns.sign(entity_id)
        aggregator.delete(entity_id=entity_id)
        base_id, _ = Namespace.parse(entity_id)
        aggregator.delete(entity_id=base_id)
    finally:
        aggregator.close()
Example 20
    def add(self, subject, canonical):
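        # Compare namespace-stripped IDs so that signed and unsigned
        # references to the same entity land in the same cluster.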
        subject, _ = Namespace.parse(get_entity_id(subject))
        canonical, _ = Namespace.parse(get_entity_id(canonical))

        # Don't do no-ops.
        if subject == canonical:
            return
        if subject is None or canonical is None:
            return

        cluster = Cluster(canonical, subject)
        cluster = self.clusters.get(canonical, cluster)
        if subject in self.clusters:
            previous = self.clusters.get(subject)
            cluster.update(previous.entities)

        for entity in cluster.entities:
            self.clusters[entity] = cluster
Example 21
def by_id(cls, document_id, collection=None):
    try:
        # Strip any namespace signature before casting to the numeric
        # primary key; invalid IDs simply return None.
        document_id = int(Namespace.strip(document_id))
    except Exception:
        return
    q = cls.all()
    q = q.filter(cls.id == document_id)
    if collection is not None:
        q = q.filter(cls.collection_id == collection.id)
    return q.first()
Example 22
def aggregate():
    buffer = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity

        stdout = click.get_text_stream('stdout')
        for entity in buffer.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
Example 23
def by_id(cls, document_id, collection_id=None):
    try:
        document_id, _ = Namespace.parse(document_id)
        document_id = int(document_id)
    except Exception:
        return
    q = cls.all()
    q = q.filter(cls.id == document_id)
    if collection_id is not None:
        q = q.filter(cls.collection_id == collection_id)
    return q.first()
Example 24
def stream_mapping(infile, outfile, signature, mapping_yaml):
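    # Variant of stream_mapping that signs entities with an explicit
    # signature namespace instead of the dataset name.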
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)

    try:
        ns = Namespace(signature)
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if signature is not None:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example 25
def test_sign(self):
    ns = Namespace('banana')
    x = ns.sign('split')
    assert x.startswith('split'), x
    assert ns.sign(None) is None
    assert x.endswith(ns.signature('split'))
    assert ns.signature(None) is None
Example 26
def test_sign(self):
    ns = Namespace("banana")
    x = ns.sign("split")
    assert x.startswith("split"), x
    assert ns.sign(None) is None
    assert x.endswith(ns.signature("split"))
    assert ns.signature(None) is None
Example 27
def test_verify(self):
    ns = Namespace("banana")
    x = ns.sign("split")
    assert Namespace.SEP in x
    assert ns.verify(x)
    assert not ns.verify("split")
    assert not ns.verify(None)
Example 28
def test_verify(self):
    ns = Namespace('banana')
    x = ns.sign('split')
    assert Namespace.SEP in x
    assert ns.verify(x)
    assert not ns.verify('split')
    assert not ns.verify(None)
Example 29
def save(cls,
         session,
         subject,
         candidate,
         score=None,
         judgement=None,
         priority=None):
    obj = cls.by_id(session, subject, candidate)
    if obj is None:
        obj = cls()
        obj.id = cls.make_id(subject, candidate)
        obj.subject, _ = Namespace.parse(get_entity_id(subject))
        obj.candidate, _ = Namespace.parse(get_entity_id(candidate))
    priority = priority or DEFAULT_PRIORITY
    if score is not None:
        obj.score = score
        obj.priority = score * priority
    if judgement is not None:
        obj.judgement = judgement
    obj.updated_at = now()
    session.add(obj)
    return obj
Example 30
File: bulk.py Project: pudo/aleph
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example 31
def ns(self):
    # Lazily build and cache the collection's signing namespace.
    if not hasattr(self, '_ns'):
        self._ns = Namespace(self.foreign_id)
    return self._ns
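Taken together, the examples exercise a small, stable surface: sign, signature, verify, parse, strip and apply. The following is a minimal synthesized sketch of that surface, not drawn from any single project above. It assumes the Namespace class is the one shipped with the followthemoney package; the import paths and the signature-stripping behavior of an unnamed namespace are assumptions based on the tests and aggregate() examples shown above.

from followthemoney import model
from followthemoney.namespace import Namespace

ns = Namespace("fruit")

# sign() appends an HMAC signature to a plain ID; parse() and strip() undo it.
signed = ns.sign("banana")
assert signed.startswith("banana") and Namespace.SEP in signed
assert ns.verify(signed) and not ns.verify("banana")
plain_id, checksum = Namespace.parse(signed)
assert plain_id == "banana"
assert Namespace.strip(signed) == "banana"

# apply() returns a copy of an entity proxy whose ID has been signed.
proxy = model.get_proxy({
    "id": "banana",
    "schema": "LegalEntity",
    "properties": {"name": ["Banana Corp."]},
})
assert ns.apply(proxy).id == ns.sign(proxy.id)

# Assumed behavior: an unnamed namespace strips signatures rather than
# adding them, which is what the aggregate() examples rely on.
assert Namespace(None).sign(signed) == "banana"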