def sample_entities(secret, properties, schematas, seed, sample_pct, limit, outfile):
    """Sample random entities"""
    random.seed(seed)
    authz = Authz.from_role(Role.load_cli_user())
    pool = list(Collection.all_by_secret(secret, authz))
    random.shuffle(pool)
    proxy_kwargs = {
        "authz": authz,
        "schemata": schematas or None,
        "randomize": True,
        "random_seed": seed,
    }
    emitted = 0
    for coll in pool:
        for entity in iter_proxies(collection_id=coll.id, **proxy_kwargs):
            # Keep only entities carrying at least one requested property.
            if properties:
                if not any(entity.properties.get(p) for p in properties):
                    continue
            # Down-sample; when sample_pct is falsy everything passes.
            # (random.random() is only drawn when sample_pct is set, which
            # keeps the seeded sequence identical to the original.)
            if sample_pct and random.random() >= sample_pct:
                continue
            write_object(outfile, entity)
            emitted += 1
            if limit and emitted >= limit:
                return
def export_entities(export_id):
    """Run a stored query export: stream matching entities into a zip
    archive containing an Excel index plus any source documents, then mark
    the export as complete (or failed on any error).

    :param export_id: identifier of the Export row to process.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache collection lookups: many entities share a collection.
    collections = {}
    try:
        # Fall back to a match-nothing query if none was stored.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Collection gone or inaccessible; skip its entities.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    # Fix: log.warn is a deprecated alias of log.warning.
                    log.warning("Export too large: %r", export)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        # Zip must be closed (with-block exited) before handing it off.
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        # Always remove the scratch directory, even on failure.
        shutil.rmtree(export_dir)
def xref_collection(collection_id):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [schema.name for schema in model if schema.matchable]
    for entity in iter_proxies(collection_id=collection_id, schemata=matchable):
        proxy = model.get_proxy(entity)
        # Documents and plain entities are tracked in separate Match columns.
        if Document.SCHEMA in proxy.schema.names:
            entity_id, document_id = None, proxy.id
        else:
            entity_id, document_id = proxy.id, None
        # Purge stale matches for this item before re-scoring.
        stale = (
            db.session.query(Match)
            .filter(Match.entity_id == entity_id)
            .filter(Match.document_id == document_id)
        )
        stale.delete()
        for score, other_id, other in xref_item(proxy):
            log.info("Xref [%.1f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.entity_id = entity_id
            match.document_id = document_id
            match.collection_id = collection_id
            match.match_id = other.id
            match.match_collection_id = other_id
            match.score = score
            db.session.add(match)
        db.session.commit()
def dump_entities(foreign_id, outfile):
    """Export FtM entities for the given collection."""
    collection = get_collection(foreign_id)
    proxies = iter_proxies(collection_id=collection.id, excludes=['text'])
    for proxy in proxies:
        # Strip context metadata before serialization.
        proxy.context = {}
        write_object(outfile, proxy)
def export_collection(collection):
    """Yield RDF triples describing the collection, then its entities."""
    uri = URIRef(ui_url('collections', collection.id))
    graph = Graph()
    # Collection-level metadata triples.
    for triple in (
        (uri, RDF.type, DCMI.Collection),
        (uri, RDFS.label, Literal(collection.label)),
        (uri, DCMI.identifier, Literal(collection.foreign_id)),
        (uri, ALEPH.category, ALEPH[collection.category]),
    ):
        graph.add(triple)
    yield from itergraph(graph)
    # One graph per entity, linked back to the collection URI.
    for proxy in iter_proxies(collection_id=collection.id):
        yield from itergraph(export_entity(proxy, uri))
def _iter_mentions(collection):
    """Combine mentions into pseudo-entities used for xref.

    Mentions are streamed sorted by their "resolved" property, so all
    mentions pointing at the same resolved id arrive consecutively; each
    run is folded into a single LegalEntity pseudo-proxy.
    """
    # Sentinel accumulator; its id stays None until the first mention is seen.
    proxy = model.make_entity(Entity.LEGAL_ENTITY)
    for mention in iter_proxies(
        collection_id=collection.id,
        schemata=["Mention"],
        sort={"properties.resolved": "desc"},
    ):
        if mention.first("resolved") != proxy.id:
            # A new resolved id starts: flush the previous group (if any)
            # and begin a fresh accumulator.
            if proxy.id is not None:
                yield proxy
            proxy = model.make_entity(Entity.LEGAL_ENTITY)
            proxy.id = mention.first("resolved")
        # Merge this mention's evidence into the current pseudo-entity.
        _merge_schemata(proxy, mention.get("detectedSchema"))
        proxy.add("name", mention.get("name"))
        proxy.add("country", mention.get("contextCountry"))
    # Flush the final group once the stream ends.
    if proxy.id is not None:
        yield proxy
def export_entities(export_id):
    """Run a stored query export: stream matching entities into a zip
    with an Excel index and source documents, enforcing size and result
    limits, then mark the export complete (or failed on any error).

    :param export_id: identifier of the Export row to process.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache collection lookups: many entities share a collection.
    collections = {}
    try:
        # Fall back to a match-nothing query if none was stored.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Collection gone or inaccessible; skip its entities.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                # Fix off-by-one: idx is 0-based and checked after writing,
                # so `idx >= MAX` would export MAX+1 entities. Using
                # `idx + 1` caps the export at exactly EXPORT_MAX_RESULTS.
                if idx + 1 >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        # Zip must be closed (with-block exited) before handing it off.
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        # Always remove the scratch directory, even on failure.
        shutil.rmtree(export_dir)
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [schema.name for schema in model if schema.matchable]
    for entity in iter_proxies(collection_id=collection_id, schemata=matchable):
        proxy = model.get_proxy(entity)
        # Purge stale matches for this entity before re-scoring.
        stale = db.session.query(Match).filter(Match.entity_id == proxy.id)
        stale.delete()
        results = xref_item(proxy, collection_ids=against_collection_ids)
        for score, other_id, other in results:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.entity_id = proxy.id
            match.collection_id = collection_id
            match.match_id = other.id
            match.match_collection_id = other_id
            match.score = score
            db.session.add(match)
        db.session.commit()
def xref_collection(queue, collection, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [schema.name for schema in model if schema.matchable]
    # Seed the progress tracker with the total number of matchable items.
    pending = count_entities(collection_id=collection.id, schemata=matchable)
    queue.progress.mark_pending(pending)
    for entity in iter_proxies(collection_id=collection.id, schemata=matchable):
        proxy = model.get_proxy(entity)
        # Purge stale matches for this entity before re-scoring.
        stale = db.session.query(Match).filter(Match.entity_id == proxy.id)
        stale.delete()
        results = xref_item(proxy, collection_ids=against_collection_ids)
        for score, other_id, other in results:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.entity_id = proxy.id
            match.collection_id = collection.id
            match.match_id = other.id
            match.match_collection_id = other_id
            match.score = score
            db.session.add(match)
        db.session.commit()
    queue.progress.mark_finished()
    queue.remove()
def _query_entities(collection):
    """Generate matches for indexing.

    Yields the output of _query_item() for every matchable entity proxy
    in the given collection.
    """
    log.info("[%s] Generating entity-based xref...", collection)
    # Pass schema *names* rather than Schema objects, consistent with the
    # other iter_proxies() call sites in this codebase. str(schema) yields
    # its name, so this is behavior-compatible while keeping the filter
    # plainly serializable.
    matchable = [s.name for s in model if s.matchable]
    for proxy in iter_proxies(collection_id=collection.id, schemata=matchable):
        yield from _query_item(proxy)
def dump_entities(foreign_id, outfile):
    """Export FtM entities for the given collection."""
    collection = get_collection(foreign_id)
    proxies = iter_proxies(collection_id=collection.id)
    for proxy in proxies:
        write_object(outfile, proxy)
def _query_entities(collection):
    """Generate matches for indexing."""
    # Only matchable schemata participate in cross-referencing.
    schemata = [schema.name for schema in model if schema.matchable]
    proxies = iter_proxies(collection_id=collection.id, schemata=schemata)
    for proxy in proxies:
        yield from _query_item(proxy)