Example #1
0
def sample_entities(secret, properties, schematas, seed, sample_pct, limit,
                    outfile):
    """Sample random entities"""
    random.seed(seed)
    authz = Authz.from_role(Role.load_cli_user())
    collections = list(Collection.all_by_secret(secret, authz))
    random.shuffle(collections)
    iter_proxies_kwargs = {
        "authz": authz,
        "schemata": schematas or None,
        "randomize": True,
        "random_seed": seed,
    }
    n_entities = 0
    for collection in collections:
        for entity in iter_proxies(collection_id=collection.id,
                                   **iter_proxies_kwargs):
            if properties and not any(
                    entity.properties.get(prop) for prop in properties):
                continue
            if not sample_pct or random.random() < sample_pct:
                write_object(outfile, entity)
                n_entities += 1
                if limit and n_entities >= limit:
                    return
Example #2
0
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    log.warn("Export too large: %r", export)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #3
0
def xref_collection(collection_id):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        entity_id, document_id = None, None
        if Document.SCHEMA in proxy.schema.names:
            document_id = proxy.id
        else:
            entity_id = proxy.id

        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == entity_id)
        dq = dq.filter(Match.document_id == document_id)
        dq.delete()
        matches = xref_item(proxy)
        for (score, other_id, other) in matches:
            log.info("Xref [%.1f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = entity_id
            obj.document_id = document_id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Example #4
0
def dump_entities(foreign_id, outfile):
    """Export FtM entities for the given collection."""
    collection = get_collection(foreign_id)
    for entity in iter_proxies(collection_id=collection.id,
                               excludes=['text']):
        entity.context = {}
        write_object(outfile, entity)
Example #5
0
def export_collection(collection):
    uri = URIRef(ui_url('collections', collection.id))
    g = Graph()
    g.add((uri, RDF.type, DCMI.Collection))
    g.add((uri, RDFS.label, Literal(collection.label)))
    g.add((uri, DCMI.identifier, Literal(collection.foreign_id)))
    g.add((uri, ALEPH.category, ALEPH[collection.category]))
    yield from itergraph(g)
    for entity in iter_proxies(collection_id=collection.id):
        yield from itergraph(export_entity(entity, uri))
Example #6
0
File: rdf.py Project: pudo/aleph
def export_collection(collection):
    uri = URIRef(ui_url('collections', collection.id))
    g = Graph()
    g.add((uri, RDF.type, DCMI.Collection))
    g.add((uri, RDFS.label, Literal(collection.label)))
    g.add((uri, DCMI.identifier, Literal(collection.foreign_id)))
    g.add((uri, ALEPH.category, ALEPH[collection.category]))
    yield from itergraph(g)
    for entity in iter_proxies(collection_id=collection.id):
        yield from itergraph(export_entity(entity, uri))
Example #7
0
def _iter_mentions(collection):
    """Combine mentions into pseudo-entities used for xref."""
    proxy = model.make_entity(Entity.LEGAL_ENTITY)
    for mention in iter_proxies(
        collection_id=collection.id,
        schemata=["Mention"],
        sort={"properties.resolved": "desc"},
    ):
        if mention.first("resolved") != proxy.id:
            if proxy.id is not None:
                yield proxy
            proxy = model.make_entity(Entity.LEGAL_ENTITY)
            proxy.id = mention.first("resolved")
        _merge_schemata(proxy, mention.get("detectedSchema"))
        proxy.add("name", mention.get("name"))
        proxy.add("country", mention.get("contextCountry"))
    if proxy.id is not None:
        yield proxy
Example #8
0
File: export.py Project: sunu/aleph
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #9
0
File: xref.py Project: pudo/aleph
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Example #10
0
def xref_collection(queue, collection, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    count = count_entities(collection_id=collection.id, schemata=matchable)
    queue.progress.mark_pending(count)
    entities = iter_proxies(collection_id=collection.id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection.id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
        queue.progress.mark_finished()
    queue.remove()
Example #11
0
def _query_entities(collection):
    """Generate matches for indexing."""
    log.info("[%s] Generating entity-based xref...", collection)
    matchable = [s for s in model if s.matchable]
    for proxy in iter_proxies(collection_id=collection.id, schemata=matchable):
        yield from _query_item(proxy)
Example #12
0
def dump_entities(foreign_id, outfile):
    """Export FtM entities for the given collection."""
    collection = get_collection(foreign_id)
    for entity in iter_proxies(collection_id=collection.id):
        write_object(outfile, entity)
Example #13
0
def _query_entities(collection):
    """Generate matches for indexing."""
    matchable = [s.name for s in model if s.matchable]
    for proxy in iter_proxies(collection_id=collection.id, schemata=matchable):
        yield from _query_item(proxy)