Example #1
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
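The comment in the loop above explains why entities are buffered in a dict keyed by ID: tabular sources often emit the same entity several times in a row, and duplicates are merged before indexing. Below is a minimal standalone sketch of the signing step that every example on this page shares, using followthemoney's Namespace as in the tests further down (the "acme" name and the sample entity are made up for illustration):

from followthemoney import model
from followthemoney.namespace import Namespace

ns = Namespace("acme")
proxy = model.get_proxy({
    "id": "director-1",
    "schema": "Person",
    "properties": {"name": ["Jane Doe"]},
})

# apply() returns a proxy whose ID has been signed with this namespace,
# which is what scopes entities like the ones mapped above to a single
# collection and keeps IDs from colliding across collections.
signed = ns.apply(proxy)
assert signed.id == ns.sign(proxy.id)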
Example #2
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables['entity']
    collection_table = meta.tables['collection']
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy(
                {
                    'id': entity.id,
                    'schema': entity.schema,
                    'properties': entity.data
                },
                cleaned=False)
            proxy.add('name', entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)

    op.drop_column('entity', 'foreign_id')
    op.drop_column('entity', 'name')
Example #3
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables["entity"]
    collection_table = meta.tables["collection"]
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy(
                {
                    "id": entity.id,
                    "schema": entity.schema,
                    "properties": entity.data
                },
                cleaned=False,
            )
            proxy.add("name", entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)

    op.drop_column("entity", "foreign_id")
    op.drop_column("entity", "name")
Example #4
def stream_mapping(infile: Path,
                   outfile: Path,
                   mapping_yaml: Path,
                   sign: bool = True) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #5
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)

        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx, records_total or 'streaming',
                     entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)", collection.foreign_id,
             entities_count)
Example #6
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #7
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        for entity in read_entities(infile):
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
Example #8
def sign(infile: Path, outfile: Path, signature: Optional[str]) -> None:
    ns = Namespace(signature)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                signed = ns.apply(entity)
                write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()
Example #9
def test_apply(self):
    entity = {
        "id": "banana",
        "schema": "LegalEntity",
        "properties": {"sameAs": ["kumkwat"], "parent": ["pretzel"]},
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == "banana", proxy.id
    ns = Namespace("fruit")
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
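As a follow-up to the assertion above, here is a hedged sketch showing that the signature depends on the namespace name, which is the property the collection-scoped examples on this page rely on (the "vegetable" namespace is made up; nothing is assumed about the exact format of the signed ID):

from followthemoney.namespace import Namespace

fruit = Namespace("fruit")
vegetable = Namespace("vegetable")
# Signing is deterministic per namespace, but two namespaces sign the same
# plain ID differently, so the same logical entity gets distinct IDs when
# loaded into distinct collections.
assert fruit.sign("banana") == fruit.sign("banana")
assert fruit.sign("banana") != vegetable.sign("banana")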
Example #10
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
Example #11
def test_apply(self):
    entity = {
        'id': 'banana',
        'schema': 'LegalEntity',
        'properties': {
            'sameAs': ['kumkwat'],
            'parent': ['pretzel']
        }
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == 'banana', proxy.id
    ns = Namespace('fruit')
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
Example #12
def run_mapping(outfile, mapping_yaml, sign=True):
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #13
def aggregate(infile, outfile):
    buffer = {}
    namespace = Namespace(None)
    try:
        for entity in read_entities(infile):
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity

        for entity in buffer.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #14
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
Example #15
def aggregate(infile: Path, outfile: Path) -> None:
    buffer: Dict[str, EntityProxy] = {}
    namespace = Namespace(None)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                entity = namespace.apply(entity)
                if entity.id in buffer:
                    buffer[entity.id].merge(entity)
                else:
                    buffer[entity.id] = entity

            for entity in buffer.values():
                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
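Both aggregate() variants construct Namespace(None) rather than a named namespace. A hedged sketch of the apparent intent, assuming that an unnamed namespace simply has nothing to sign with (the ID below is made up):

from followthemoney.namespace import Namespace

plain = Namespace(None)
# With no namespace name there is no signature to add, so applying this
# namespace is expected to leave plain IDs unchanged; the point of using
# it here is presumably to normalise IDs so that fragments of the same
# entity land on the same buffer key and get merged.
assert plain.sign("banana") == "banana"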
Example #16
def bulk_write(collection, iterable, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    stage = get_stage(collection, OP_INDEX, job_id=job_id)
    entities = []
    for item in iterable:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entities.append(entity)
    index_entities(stage, collection, entities)
Example #17
def aggregate():
    buffer = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity

        stdout = click.get_text_stream('stdout')
        for entity in buffer.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #18
def stream_mapping(infile, outfile, signature, mapping_yaml):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)

    try:
        ns = Namespace(signature)
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if signature is not None:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
Example #19
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
Example #20
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #21
class Manager(object):
    """Handles the lifecycle of an ingestor. This can be subclassed to embed it
    into a larger processing framework."""

    #: Indicates that no errors or failures occurred during processing.
    STATUS_SUCCESS = u"success"
    #: Indicates the occurrence of errors during processing.
    STATUS_FAILURE = u"failure"

    MAGIC = magic.Magic(mime=True)

    def __init__(self, dataset, stage, context):
        self.dataset = dataset
        self.writer = dataset.bulk()
        self.stage = stage
        self.context = context
        self.ns = Namespace(self.context.get("namespace"))
        self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
        self.emitted = set()

    @property
    def archive(self):
        if not hasattr(settings, "_archive"):
            settings._archive = init_archive()
        return settings._archive

    def make_entity(self, schema, parent=None):
        schema = model.get(schema)
        entity = model.make_entity(schema,
                                   key_prefix=self.stage.job.dataset.name)
        self.make_child(parent, entity)
        return entity

    def make_child(self, parent, child):
        """Derive entity properties by knowing it's parent folder."""
        if parent is not None and child is not None:
            # Folder hierarchy:
            child.add("parent", parent.id)
            child.add("ancestors", parent.get("ancestors"))
            child.add("ancestors", parent.id)
            self.apply_context(child, parent)

    def apply_context(self, entity, source):
        # Aleph-specific context data:
        entity.context = {
            "created_at": source.context.get("created_at"),
            "updated_at": source.context.get("updated_at"),
            "role_id": source.context.get("role_id"),
            "mutable": False,
        }

    def emit_entity(self, entity, fragment=None):
        entity = self.ns.apply(entity)
        # pprint(entity.to_dict())
        self.writer.put(entity.to_dict(), fragment)
        self.emitted.add(entity.id)

    def emit_text_fragment(self, entity, texts, fragment):
        texts = [t for t in ensure_list(texts) if filter_text(t)]
        if len(texts):
            doc = self.make_entity(entity.schema)
            doc.id = entity.id
            doc.add("indexText", texts)
            self.emit_entity(doc, fragment=safe_fragment(fragment))

    def auction(self, file_path, entity):
        if not entity.has("mimeType"):
            if file_path.is_dir():
                entity.add("mimeType", DirectoryIngestor.MIME_TYPE)
                return DirectoryIngestor
            entity.add("mimeType", self.MAGIC.from_file(file_path.as_posix()))

        best_score, best_cls = 0, None
        for cls in get_extensions("ingestors"):
            score = cls.match(file_path, entity)
            if score > best_score:
                best_score = score
                best_cls = cls

        if best_cls is None:
            raise ProcessingException("Format not supported")
        return best_cls

    def queue_entity(self, entity):
        log.debug("Queue: %r", entity)
        self.stage.queue(entity.to_dict(), self.context)

    def store(self, file_path, mime_type=None):
        file_path = ensure_path(file_path)
        mime_type = normalize_mimetype(mime_type)
        if file_path is not None and file_path.is_file():
            return self.archive.archive_file(file_path, mime_type=mime_type)

    def load(self, content_hash, file_name=None):
        # log.info("Local archive name: %s", file_name)
        return self.archive.load_file(content_hash,
                                      file_name=file_name,
                                      temp_path=self.work_path)

    def ingest_entity(self, entity):
        for content_hash in entity.get("contentHash", quiet=True):
            file_name = entity_filename(entity)
            file_path = self.load(content_hash, file_name=file_name)
            if file_path is None or not file_path.exists():
                continue
            self.ingest(file_path, entity)
            return
        self.finalize(entity)

    def ingest(self, file_path, entity, **kwargs):
        """Main execution step of an ingestor."""
        file_path = ensure_path(file_path)
        if file_path.is_file() and not entity.has("fileSize"):
            entity.add("fileSize", file_path.stat().st_size)

        entity.set("processingStatus", self.STATUS_FAILURE)
        try:
            ingestor_class = self.auction(file_path, entity)
            log.info("Ingestor [%r]: %s", entity, ingestor_class.__name__)
            self.delegate(ingestor_class, file_path, entity)
            entity.set("processingStatus", self.STATUS_SUCCESS)
        except ProcessingException as pexc:
            entity.set("processingError", stringify(pexc))
            log.error("[%r] Failed to process: %s", entity, pexc)
        finally:
            self.finalize(entity)

    def finalize(self, entity):
        self.emit_entity(entity)
        self.writer.flush()
        remove_directory(self.work_path)

    def delegate(self, ingestor_class, file_path, entity):
        ingestor_class(self).ingest(file_path, entity)

    def close(self):
        self.writer.flush()
        remove_directory(self.work_path)
Example #22
class Analyzer(object):
    MENTIONS = {TAG_COMPANY: "Organization", TAG_PERSON: "Person"}

    def __init__(self, dataset, entity, context):
        self.dataset = dataset
        self.ns = Namespace(context.get("namespace", dataset.name))
        self.entity = model.make_entity(entity.schema)
        self.entity.id = entity.id
        self.aggregator_entities = TagAggregatorFasttext()
        self.aggregator_patterns = TagAggregator()

    def feed(self, entity):
        if not settings.ANALYZE_ENTITIES:
            return
        if not entity.schema.is_a(ANALYZABLE):
            return
        # HACK: Tables should be mapped, don't try to tag them here.
        if entity.schema.is_a("Table"):
            return

        texts = entity.get_type_values(registry.text)
        for text in text_chunks(texts):
            detect_languages(self.entity, text)
            for (prop, tag) in extract_entities(self.entity, text):
                self.aggregator_entities.add(prop, tag)
            for (prop, tag) in extract_patterns(self.entity, text):
                self.aggregator_patterns.add(prop, tag)

    def flush(self):
        writer = self.dataset.bulk()
        countries = set()
        results = list(
            chain(self.aggregator_entities.results(),
                  self.aggregator_patterns.results()))

        for (key, prop, values) in results:
            if prop.type == registry.country:
                countries.add(key)

        mention_ids = set()
        for (key, prop, values) in results:
            label = values[0]
            if prop.type == registry.name:
                label = registry.name.pick(values)

            schema = self.MENTIONS.get(prop)
            if schema is not None and self.entity.schema.is_a(DOCUMENT):
                mention = model.make_entity("Mention")
                mention.make_id("mention", self.entity.id, prop, key)
                mention_ids.add(mention.id)
                mention.add("resolved", make_entity_id(key))
                mention.add("document", self.entity.id)
                mention.add("name", values)
                mention.add("detectedSchema", schema)
                mention.add("contextCountry", countries)
                mention = self.ns.apply(mention)
                writer.put(mention)
                # pprint(mention.to_dict())

            self.entity.add(prop, label, cleaned=True, quiet=True)

        if len(results):
            log.debug(
                "Extracted %d prop values, %d mentions [%s]: %s",
                len(results),
                len(mention_ids),
                self.entity.schema.name,
                self.entity.id,
            )
            writer.put(self.entity)
            writer.flush()

        return mention_ids