Example #1
def _iter_match_batch(batch, authz):
    matchable = [s.name for s in model if s.matchable]
    entities = set()
    for match in batch:
        entities.add(match.entity_id)
        entities.add(match.match_id)

    entities = entities_by_ids(list(entities), schemata=matchable)
    entities = {e.get('id'): e for e in entities}
    for obj in batch:
        if not authz.can(obj.match_collection_id, authz.READ):
            continue
        entity = entities.get(str(obj.entity_id))
        match = entities.get(str(obj.match_id))
        collection = get_collection(obj.match_collection_id)
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        yield (
            int(obj.score * 100),
            eproxy.caption,
            _format_date(eproxy),
            _format_country(eproxy),
            collection.get('label'),
            mproxy.caption,
            _format_date(mproxy),
            _format_country(mproxy),
            entity_url(eproxy.id),
            entity_url(mproxy.id),
        )
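The generator above yields one flat row per match the user is allowed to see, which makes it easy to stream into a tabular writer. A minimal usage sketch: matches is assumed to be an iterable of Match records and authz an authorization context as in the example, and write_match_report is a hypothetical helper, not part of the original code.

import csv

# Hypothetical driver for _iter_match_batch: stream the yielded rows
# into a CSV report. `matches` and `authz` are assumed to come from
# the surrounding application, as in the example above.
def write_match_report(path, matches, authz):
    with open(path, "w", newline="") as fh:
        writer = csv.writer(fh)
        writer.writerow([
            "Score", "Entity Name", "Entity Date", "Entity Countries",
            "Collection", "Match Name", "Match Date", "Match Countries",
            "Entity Link", "Match Link",
        ])
        for row in _iter_match_batch(matches, authz):
            writer.writerow(row)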
Example #2
def _iter_match_batch(stub, sheet, batch):
    matchable = [s.name for s in model if s.matchable]
    entities = set()
    for match in batch:
        entities.add(match.get("entity_id"))
        entities.add(match.get("match_id"))
        resolver.queue(stub, Collection, match.get("match_collection_id"))

    resolver.resolve(stub)
    entities = entities_by_ids(list(entities), schemata=matchable)
    entities = {e.get("id"): e for e in entities}

    for obj in batch:
        entity = entities.get(str(obj.get("entity_id")))
        match = entities.get(str(obj.get("match_id")))
        collection_id = obj.get("match_collection_id")
        collection = resolver.get(stub, Collection, collection_id)
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        sheet.append(
            [
                obj.get("score"),
                eproxy.caption,
                _format_date(eproxy),
                _format_country(eproxy),
                collection.get("label"),
                mproxy.caption,
                _format_date(mproxy),
                _format_country(mproxy),
                entity_url(eproxy.id),
                entity_url(mproxy.id),
            ]
        )
Example #3
def _iter_match_batch(batch, authz):
    entities = set()
    collections = set()
    for match in batch:
        entities.add(match.entity_id)
        entities.add(match.match_id)
        collections.add(match.match_collection_id)

    collections = Collection.all_by_ids(collections, authz=authz)
    collections = {c.id: c.label for c in collections}
    entities = iter_entities_by_ids(list(entities), authz=authz)
    entities = {e.get('id'): e for e in entities}
    for obj in batch:
        entity = entities.get(str(obj.entity_id))
        match = entities.get(str(obj.match_id))
        collection = collections.get(obj.match_collection_id)
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        yield (
            int(obj.score * 100),
            eproxy.caption,
            _format_date(eproxy),
            _format_country(eproxy),
            collection,
            mproxy.caption,
            _format_date(mproxy),
            _format_country(mproxy),
            entity_url(eproxy.id),
            entity_url(mproxy.id),
        )
Example #4
    def test_name_entity(self):
        proxy = model.get_proxy(
            {
                "id": "banana",
                "schema": "Person",
                "properties": {
                    "name": ["Carl", "Karl", "Carlo", "CARL"],
                },
            }
        )
        name_entity(proxy)
        name = proxy.get("name")
        assert 1 == len(name), name
        assert name[0] not in proxy.get("alias"), proxy.get("alias")

        proxy = model.get_proxy(
            {
                "id": "banana",
                "schema": "Person",
                "properties": {
                    "name": ["Carl"],
                },
            }
        )
        name_entity(proxy)
        assert ["Carl"] == proxy.get("name"), proxy.get("name")
Example #5
def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments

    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        xref_entity(collection, proxy)

    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)

    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))

        if len(names) > 0:
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")

    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)
Example #6
    def test_combine_names(self):
        proxy = model.get_proxy({
            "id": "banana",
            "schema": "Person",
            "properties": {
                "firstName": ["Vladimir", "Wladimir"],
                "fatherName": ["Vladimirovitch"],
                "lastName": ["Putin"],
            },
        })
        combine_names(proxy)
        assert "Vladimir Putin" in proxy.get("alias"), proxy.get("alias")
        assert "Vladimir Vladimirovitch Putin" in proxy.get(
            "alias"), proxy.get("alias")
        proxy = model.get_proxy({
            "id": "banana",
            "schema": "Person",
            "properties": {
                "name": ["Vladimir Putin"],
            },
        })
        combine_names(proxy)
        proxy = model.get_proxy({
            "id": "banana",
            "schema": "Person",
            "properties": {
                "lastName": ["Putin"],
            },
        })
        combine_names(proxy)
        assert "Putin" in proxy.get("alias"), proxy.get("alias")
Example #7
def _iter_match_batch(batch, authz):
    matchable = [s.name for s in model if s.matchable]
    entities = set()
    for match in batch:
        entities.add(match.entity_id)
        entities.add(match.match_id)

    entities = entities_by_ids(list(entities), schemata=matchable)
    entities = {e.get('id'): e for e in entities}
    for obj in batch:
        if not authz.can(obj.match_collection_id, authz.READ):
            continue
        entity = entities.get(str(obj.entity_id))
        match = entities.get(str(obj.match_id))
        collection = get_collection(obj.match_collection_id)
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        yield (
            int(obj.score * 100),
            eproxy.caption,
            _format_date(eproxy),
            _format_country(eproxy),
            collection.get('label'),
            mproxy.caption,
            _format_date(mproxy),
            _format_country(mproxy),
            entity_url(eproxy.id),
            entity_url(mproxy.id),
        )
Example #8
    def test_base_functions(self):
        a = model.get_proxy(ENTITY)
        b = model.get_proxy(ENTITY)
        mpp = MultiPartProxy(proxies=[a, b])
        assert len(mpp.proxies) == 1
        assert len(mpp.get("name")) == 1
        assert mpp.get("name")[0] == "Ralph Tester"
        mpp_keys = mpp.properties.keys()
        assert set(a.properties.keys()) == mpp_keys
        mpp_inv_keys = mpp.get_type_inverted().keys()
        assert set(a.get_type_inverted().keys()) == mpp_inv_keys
Example #9
    def test_compare_quality(self):
        entity = model.get_proxy(ENTITY)
        best_score = compare(model, entity, entity)
        reduced = deepcopy(ENTITY)
        reduced["properties"].pop("birthDate")
        reduced["properties"].pop("idNumber")
        reduced_proxy = model.get_proxy(reduced)
        self.assertLess(compare(model, entity, reduced_proxy), best_score)

        reduced = deepcopy(ENTITY)
        reduced["properties"]["name"] = ["Frank Banana"]
        reduced_proxy = model.get_proxy(reduced)
        self.assertLess(compare(model, entity, reduced_proxy), best_score)
Example #10
    def test_compare_basic(self):
        entity = model.get_proxy(ENTITY)
        best_score = compare(model, entity, entity)
        assert best_score > 0.5, best_score
        comp = model.get_proxy({"schema": "RealEstate"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)
        self.assertAlmostEqual(compare(model, comp, comp), 0)

        comp = model.get_proxy({"schema": "Person"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)

        comp = model.get_proxy({"schema": "LegalEntity"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)
Example #11
    def test_compare_names(self):
        left = {"schema": "Person", "properties": {"name": ["mr frank banana"]}}  # noqa
        left = model.get_proxy(left)
        right = {
            "schema": "Person",
            "properties": {"name": ["mr frank bananoid"]},
        }  # noqa
        right = model.get_proxy(right)
        same_score = compare_names(left, left)
        assert same_score > 0.5, same_score
        lr_score = compare_names(left, right)
        assert lr_score > 0.1, lr_score
        assert lr_score < same_score, (lr_score, same_score)
Example #12
    def add_decision(self, decision):
        assert decision["entityset_id"] == self.pid
        judgement = decision["judgement"] = Judgement(decision["judgement"])
        self.decisions[decision["id"]] = decision
        try:
            proxy = model.get_proxy(decision.pop("entity"))
            self.proxies[proxy] = judgement
        except (TypeError, InvalidData, KeyError):
            pass
        if decision["compared_to_entity_id"]:
            try:
                data = decision.pop("compared_to_entity")
                self.proxies.setdefault(model.get_proxy(data), Judgement.POSITIVE)
            except (TypeError, InvalidData, KeyError):
                pass
Example #13
def iter_proxies(**kw):
    includes = ['schema', 'properties']
    for data in iter_entities(includes=includes, **kw):
        schema = model.get(data.get('schema'))
        if schema is None:
            continue
        yield model.get_proxy(data)
Example #14
def upsert_entity(data, collection, authz=None, sync=False):
    """Create or update an entity in the database. This has a side hustle
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.
    """
    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, role_id=role_id)
    else:
        entity.update(data, collection)

    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    proxy = entity.to_proxy()
    entity_ids = proxy.get_type_values(registry.entity)
    for rel in index.entities_by_ids(entity_ids):
        inline_names(proxy, model.get_proxy(rel))
    entity.data = proxy.properties
    db.session.add(entity)

    delete_aggregator_entity(collection, entity.id)
    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, entity.id)
    return entity.id
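A short usage sketch for upsert_entity; the payload shape is inferred from how the function reads its arguments, and the collection and authz objects are assumed to be the usual Aleph model objects.

# Hypothetical call site: create or update a Person entity within a
# collection. sync=True requests synchronous indexing.
data = {
    "id": "fritz",
    "schema": "Person",
    "properties": {"name": ["Fritz Maier"]},
}
entity_id = upsert_entity(data, collection, authz=authz, sync=True)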
Example #15
    def from_file(cls, fd):
        G = cls()
        try:
            total = os.fstat(fd.fileno()).st_size
        except AttributeError:
            total = None
        with tqdm(total=total, unit="B", unit_scale=True,
                  unit_divisor=1024) as pbar:
            data = ((json.loads(line), len(line)) for line in fd)
            data_group = groupby(data, lambda d: d[0].get("profile_id"))
            for node_id, group in data_group:
                for proxy_dict, line_length in group:
                    node_flags = proxy_dict.pop("flags", {})
                    proxy = model.get_proxy(proxy_dict)
                    try:
                        node, is_new = G.add_proxy(proxy, node_id=node_id)
                    except InvalidData as e:
                        print(e)
                        print(proxy.id)
                        print(proxy.properties)
                        print(proxy.schema)
                        print("-------------------------")
                        print(node_id)
                        print(G.get_node(node_id).properties)
                        print(G.get_node(node_id).schema)
                        print("=========================")
                        # Skip this record: `node` was never assigned, so
                        # falling through would raise a NameError below.
                        continue
                    node.flags = node_flags
                    pbar.update(line_length)
        return G
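The reader expects newline-delimited JSON, one proxy dict per line, optionally carrying profile_id and flags keys. A sketch of calling it, where EntityGraph is a placeholder name for the (unnamed) class this method belongs to:

# Hypothetical usage: stream a .jsonl export into the graph. Opening
# the file in binary mode lets os.fstat size the progress bar.
with open("entities.jsonl", "rb") as fd:
    G = EntityGraph.from_file(fd)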
Example #16
def xref_item(stage, collection, entity_id=None, against_collection_ids=None):
    "Cross-reference an entity against others to generate potential matches."
    entity_ids = [entity_id]
    # This is running as a background job. In order to avoid running each
    # entity one by one, we do it 101 at a time. This avoids sending redundant
    # queries to the database and elasticsearch, making cross-ref much faster.
    for task in stage.get_tasks(limit=100):
        entity_ids.append(task.payload.get('entity_id'))
    stage.mark_done(len(entity_ids) - 1)
    # log.debug("Have %d entity IDs for xref", len(entity_ids))
    for data in entities_by_ids(entity_ids, includes=['schema', 'properties']):
        proxy = model.get_proxy(data)
        # log.info("XRef: %r", proxy)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_query_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection.id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
    db.session.commit()
Example #17
def export_entities(request, result, format):
    assert format in (FORMAT_CSV, FORMAT_EXCEL)
    entities = []
    for entity in result.results:
        resolver.queue(result, Collection, entity.get('collection_id'))
        entities.append(model.get_proxy(entity))
    resolver.resolve(result)
    zip_archive = zipstream.ZipFile()

    if format == FORMAT_EXCEL:
        workbook = get_workbook()
        for entity in entities:
            collection_id = entity.context.get('collection_id')
            collection = resolver.get(result, Collection, collection_id)
            export_entity_excel(workbook, collection, entity)
            write_document(zip_archive, collection, entity)
        content = io.BytesIO(get_workbook_content(workbook))
        zip_archive.write_iter('export.xlsx', content)
    elif format == FORMAT_CSV:
        handlers = {}
        for entity in entities:
            collection_id = entity.context.get('collection_id')
            collection = resolver.get(result, Collection, collection_id)
            export_entity_csv(handlers, collection, entity)
            write_document(zip_archive, collection, entity)

        for key in handlers:
            content = handlers[key]
            content.seek(0)
            content = io.BytesIO(content.read().encode())
            zip_archive.write_iter(key+'.csv', content)
    for chunk in zip_archive:
        yield chunk
Example #18
    def test_to_dict(self):
        proxy = model.get_proxy(ENTITY)
        graph = Graph(edge_types=registry.pivots)
        graph.add(proxy)
        data = graph.to_dict()
        assert 'nodes' in data, data
        assert 'edges' in data, data
Example #19
def similar(entity_id):
    enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    entity = model.get_proxy(entity)
    record_audit(Audit.ACT_ENTITY, id=entity_id)
    result = MatchQuery.handle(request, entity=entity)
    return EntitySerializer.jsonify_result(result)
Example #20
    def handle(self, task):
        manager = Manager(task.stage, task.context)
        entity = model.get_proxy(task.payload)
        log.debug("Ingest: %r", entity)
        manager.ingest_entity(entity)
        manager.close()
        self.dispatch_next(task, manager.emitted)
Example #21
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables["entity"]
    collection_table = meta.tables["collection"]
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy(
                {
                    "id": entity.id,
                    "schema": entity.schema,
                    "properties": entity.data
                },
                cleaned=False,
            )
            proxy.add("name", entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)

    op.drop_column("entity", "foreign_id")
    op.drop_column("entity", "name")
Example #22
def similar(entity_id):
    enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=entity.get('collection_id'))
    entity = model.get_proxy(entity)
    result = MatchQuery.handle(request, entity=entity)
    return EntitySerializer.jsonify_result(result)
Example #23
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    facets = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for fidx, value in enumerate(proxy.get_type_values(type_)):
            if type_.specificity(value) < 0.1:
                continue
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(Thing)]
            index = entities_read_index(schemata)
            alias = '%s_%s' % (type_.name, fidx)
            facets.append((index, alias, type_.group, type_.group, value))

    res = _filters_faceted_query(authz, facets)
    for (_, alias, field, _, value) in facets:
        total = res.get(alias, 0)
        if total > 1:
            yield (field, value, total)
Example #24
def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")

        entity = model.get_proxy(item)
        if entity.id is None:
            raise InvalidData("No ID for entity")

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities)
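A minimal sketch of feeding raw dicts through this bulk_write; the input data below is made up, and note that every item must carry its own id or InvalidData is raised:

# Hypothetical input: raw entity dicts with pre-generated IDs. Items
# sharing an ID (here "a1") are merged before they are indexed.
items = [
    {"id": "a1", "schema": "LegalEntity", "properties": {"name": ["ACME Ltd."]}},
    {"id": "a1", "schema": "Company", "properties": {"jurisdiction": ["gb"]}},
    {"id": "b2", "schema": "Person", "properties": {"name": ["John Smith"]}},
]
bulk_write(collection, items)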
Example #25
def xref_collection(collection_id):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        entity_id, document_id = None, None
        if Document.SCHEMA in proxy.schema.names:
            document_id = proxy.id
        else:
            entity_id = proxy.id

        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == entity_id)
        dq = dq.filter(Match.document_id == document_id)
        dq.delete()
        matches = xref_item(proxy)
        for (score, other_id, other) in matches:
            log.info("Xref [%.1f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = entity_id
            obj.document_id = document_id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Example #26
def tags(entity_id):
    """
    ---
    get:
      summary: Get entity tags
      description: >-
        Get tags for the entity with id `entity_id`.
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/EntityTag'
      tags:
      - Entity
    """
    enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=entity.get("collection_id"))
    results = entity_tags(model.get_proxy(entity), request.authz)
    return jsonify({"status": "ok", "total": len(results), "results": results})
Example #27
    def apply(self, count, results):
        self.count = count
        for result in results:
            proxy = model.get_proxy(result)
            proxy.context = {}
            self.entities.append(proxy)
            self.graph.add(proxy)
Example #28
def view(entity_id):
    """
    ---
    get:
      summary: Get an entity
      description: Return the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    enable_cache()
    excludes = ["text", "numeric.*"]
    entity = get_index_entity(entity_id, request.authz.READ, excludes=excludes)
    tag_request(collection_id=entity.get("collection_id"))
    proxy = model.get_proxy(entity)
    html = proxy.first("bodyHtml", quiet=True)
    source_url = proxy.first("sourceUrl", quiet=True)
    encoding = proxy.first("encoding", quiet=True)
    entity["safeHtml"] = sanitize_html(html, source_url, encoding=encoding)
    entity["shallow"] = False
    return EntitySerializer.jsonify(entity)
Example #29
def export_entities(export_id, result):
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)

        file_path = export_dir.joinpath("query-export.zip")
        zf = zipfile.ZipFile(file_path, "w")
        exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
        for entity in entities:
            collection_id = entity.context.get("collection_id")
            collection = resolver.get(stub, Collection, collection_id)
            extra = [entity_url(entity.id), collection.get("label")]
            exporter.write(entity, extra=extra)
            write_document(export_dir, zf, collection, entity)
        content = exporter.get_bytesio().getvalue()
        zf.writestr("Export.xlsx", content)
        zf.close()
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #30
def iter_proxies(**kw):
    includes = ['schema', 'properties']
    for data in iter_entities(includes=includes, **kw):
        schema = model.get(data.get('schema'))
        if schema is None:
            continue
        yield model.get_proxy(data)
Example #31
def add_entities(G, *entities):
    new_proxies = []
    for entity in entities:
        proxy = model.get_proxy(entity)
        if G.add_proxy(proxy):
            new_proxies.append(proxy)
    return new_proxies
Example #32
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
Example #33
    def to_proxy(self):
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': {}
        })
        meta = dict(self.meta)
        headers = meta.pop('headers', {}) or {}
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('crawler', meta.get('crawler'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('title', meta.get('title'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('keywords', meta.get('keywords'))
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('indexUpdatedAt', self.created_at)
        return proxy
Example #34
def _query_item(entity):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(entity)
    if query == none_query():
        return

    query = {
        "query": query,
        "size": 100,
        "_source": {
            "includes": PROXY_INCLUDES
        }
    }
    matchable = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    for result in result.get("hits").get("hits"):
        result = unpack_result(result)
        if result is None:
            continue
        match = model.get_proxy(result)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption, score,
                      match.caption)
            yield score, entity, result.get("collection_id"), match
Example #35
def similar(entity_id):
    enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=entity.get('collection_id'))
    entity = model.get_proxy(entity)
    record_audit(Audit.ACT_ENTITY, id=entity_id)
    result = MatchQuery.handle(request, entity=entity)
    return EntitySerializer.jsonify_result(result)
Example #36
    def to_proxy(self):
        proxy = model.get_proxy({
            'id': self.id,
            'schema': self.schema,
            'properties': self.data
        })
        proxy.add('name', self.name)
        return proxy
Example #37
def match():
    entity = parse_request(EntityUpdateSchema)
    record_audit(Audit.ACT_MATCH, entity=entity)
    entity = model.get_proxy(entity)
    tag_request(schema=entity.schema.name, caption=entity.caption)
    collection_ids = request.args.getlist('collection_ids')
    result = MatchQuery.handle(request, entity=entity,
                               collection_ids=collection_ids)
    return EntitySerializer.jsonify_result(result)
Example #38
    def to_proxy(self):
        meta = dict(self.meta)
        headers = meta.pop('headers', {})
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': meta
        })
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('processingStatus', self.status)
        proxy.set('processingError', self.error_message)
        proxy.set('fileSize', meta.get('file_size'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('messageId', meta.get('message_id'), quiet=True)
        proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
        proxy.set('bodyText', self.body_text, quiet=True)
        proxy.set('bodyHtml', self.body_raw, quiet=True)
        columns = meta.get('columns')
        proxy.set('columns', registry.json.pack(columns), quiet=True)
        proxy.set('headers', registry.json.pack(headers), quiet=True)

        pdf = 'application/pdf'
        if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
            proxy.set('pdfHash', self.content_hash, quiet=True)
        proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)

        q = db.session.query(DocumentTag)
        q = q.filter(DocumentTag.document_id == self.id)
        q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
        q = q.order_by(DocumentTag.weight.desc())
        q = q.limit(Document.MAX_TAGS)
        for tag in q.all():
            prop = DocumentTag.MAPPING.get(tag.type)
            if prop is not None:
                proxy.add(prop, tag.text)
        return proxy
Example #39
def entity_matches(result):
    for doc in result.get('hits').get('hits'):
        entity = unpack_result(doc)
        proxy = model.get_proxy(entity)
        yield {
            'id': proxy.id,
            'name': proxy.caption,
            'n:type': get_freebase_type(proxy.schema),
            'type': [get_freebase_type(proxy.schema)],
            'r:score': doc.get('_score'),
            'uri': entity_url(proxy.id, _relative=True),
            'match': False
        }
Example #40
def content(entity_id):
    enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=entity.get('collection_id'))
    for entity in entities_by_ids([entity_id],
                                  schemata=entity.get('schema'),
                                  excludes=['text']):
        proxy = model.get_proxy(entity)
        record_audit(Audit.ACT_ENTITY, id=entity_id)
        html = sanitize_html(proxy.first('bodyHtml', quiet=True),
                             proxy.first('sourceUrl', quiet=True))
        headers = proxy.first('headers', quiet=True)
        headers = registry.json.unpack(headers)
        return jsonify({
            'headers': headers,
            'text': proxy.first('bodyText', quiet=True),
            'html': html
        })
    return ('', 404)
Example #41
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Example #42
def _index_updates(collection_id, entities, merge=True):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity or link
    gets indexed twice with different field values, it'll add up the different
    field values into a single record. This is to avoid overwriting the
    document and losing field values. An alternative solution would be to
    implement this in Groovy on the ES.
    """
    indexes = defaultdict(list)
    timestamps = {}
    common = {
        'collection_id': collection_id,
        'updated_at': datetime.utcnow(),
        'bulk': True
    }

    if merge:
        for result in entities_by_ids(list(entities.keys())):
            existing = model.get_proxy(result)
            indexes[existing.id].append(result.get('_index'))
            entities[existing.id].merge(existing)
            timestamps[existing.id] = result.get('created_at')

    actions = []
    for entity_id, proxy in entities.items():
        entity = dict(common)
        created_at = timestamps.get(proxy.id, common.get('updated_at'))
        entity['created_at'] = created_at
        entity.update(proxy.to_full_dict())
        _, index, body = index_operation(entity)
        for other in indexes.get(entity_id, []):
            if other != index:
                # log.info("Delete ID [%s] from index: %s", entity_id, other)
                actions.append(delete_operation(other, entity_id))
        actions.append({
            '_id': entity_id,
            '_index': index,
            '_source': body
        })
    return actions
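The returned list mixes delete operations with plain index actions keyed by _id, _index and _source, so it can be handed to the standard elasticsearch-py bulk helper. A sketch of that wiring, which is an assumption rather than the code that actually surrounds this function:

from elasticsearch.helpers import bulk

# Hypothetical driver: `entities` maps entity IDs to EntityProxy
# objects that were already aggregated for this collection.
actions = _index_updates(collection_id, entities, merge=True)
bulk(es, actions, stats_only=True, chunk_size=500)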
Example #43
def xref_item(proxy, collection_ids=None):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(proxy, collection_ids=collection_ids)
    if query == none_query():
        return

    query = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    matchable = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    results = result.get('hits').get('hits')
    for result in results:
        result = unpack_result(result)
        if result is not None:
            other = model.get_proxy(result)
            score = compare(model, proxy, other)
            if score >= SCORE_CUTOFF:
                yield score, result.get('collection_id'), other
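Because xref_item already filters on SCORE_CUTOFF, callers only see plausible candidates. A sketch of consuming the generator, with the entity lookup assumed from the other examples:

# Hypothetical consumer: print candidate matches for one entity.
proxy = model.get_proxy(entity)  # entity as fetched from the index
for score, collection_id, other in xref_item(proxy):
    print("%.2f" % score, collection_id, other.caption)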
Example #44
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #45
    def update(self, entity):
        proxy = model.get_proxy(entity)
        proxy.schema.validate(entity)
        self.apply_proxy(proxy)
        self.updated_at = datetime.utcnow()
        db.session.add(self)