Example #1
0
def entities_by_ids(ids,
                    schemata=None,
                    cached=False,
                    includes=None,
                    excludes=None):
    """Yield the unpacked entities found by an ``ids`` search.

    ``ids`` is normalised with ``ensure_list``; an empty list yields nothing
    and no search is issued. The search runs against the read index selected
    for ``schemata``, requests at most ``MAX_PAGE`` hits and restricts the
    returned source via ``_source_spec(includes, excludes)``. Hits that
    unpack to ``None`` are skipped. When ``cached`` is true, every entity is
    handed to ``_cache_entity`` before being yielded.
    """
    ids = ensure_list(ids)
    if len(ids) == 0:
        return
    index = entities_read_index(schema=schemata)
    body = {
        'query': {'ids': {'values': ids}},
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    response = es.search(index=index, body=body)
    hits = response.get('hits', {}).get('hits', [])
    for hit in hits:
        entity = unpack_result(hit)
        if entity is None:
            continue
        if cached:
            _cache_entity(entity)
        yield entity
Example #2
0
def iter_entities(authz=None,
                  collection_id=None,
                  schemata=None,
                  includes=None,
                  excludes=None):
    """Scan the entity index and yield every unpacked entity that matches.

    Each criterion that is supplied adds one clause to a ``bool`` filter:
    ``authz`` through ``authz_query``, ``collection_id`` as a ``term`` and
    ``schemata`` as a ``terms`` clause. ``includes``/``excludes`` are turned
    into the ``_source`` spec by ``_source_spec``. Results that unpack to
    ``None`` are not yielded.
    """
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    schema_names = ensure_list(schemata)
    if schema_names:
        filters.append({'terms': {'schemata': schema_names}})
    body = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for hit in scan(es, index=index, query=body, scroll='1410m'):
        entity = unpack_result(hit)
        if entity is None:
            continue
        yield entity
Example #3
0
def _query_item(entity):
    """Search for cross-reference candidates of ``entity`` and score them.

    Nothing is yielded when ``match_query`` produces the ``none_query()``
    placeholder. Otherwise up to 100 hits are fetched from the read index of
    the entity's matchable schemata, each hit is turned into a proxy and
    compared against ``entity``. For every candidate scoring at least
    ``SCORE_CUTOFF`` a ``(score, entity, collection_id, match)`` tuple is
    yielded.
    """
    query = match_query(entity)
    if query == none_query():
        return

    body = {
        "query": query,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is not None:
            match = model.get_proxy(data)
            score = compare(model, entity, match)
            if score >= SCORE_CUTOFF:
                log.debug("Match: %s <[%.2f]> %s", entity.caption, score,
                          match.caption)
                yield score, entity, data.get("collection_id"), match
Example #4
0
def iter_entities(authz=None,
                  collection_id=None,
                  schemata=None,
                  includes=None,
                  excludes=None):
    """Scan ``entities_index()`` and yield each result after unpacking.

    The query is a ``bool`` filter with one clause per supplied criterion
    (``authz``, ``collection_id``, ``schemata``), sorted by ``_doc``. The
    ``_source`` spec only carries ``includes``/``excludes`` keys for
    arguments that are non-empty after ``ensure_list``. Every scanned result
    is yielded exactly as returned by ``unpack_result``.
    """
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    schema_names = ensure_list(schemata)
    if schema_names:
        filters.append({'terms': {'schemata': schema_names}})

    source = {}
    for key, fields in (('includes', includes), ('excludes', excludes)):
        fields = ensure_list(fields)
        if fields:
            source[key] = fields

    body = {
        'query': {'bool': {'filter': filters}},
        'sort': ['_doc'],
        '_source': source
    }
    for hit in scan(es, index=entities_index(), query=body, scroll='1410m'):
        yield unpack_result(hit)
Example #5
0
def entities_by_ids(ids,
                    schemata=None,
                    cached=False,
                    includes=None,
                    excludes=None):
    """Yield the unpacked entities found by an ``ids`` search.

    ``ids`` is normalised with ``ensure_list``; an empty list yields nothing
    and no search is issued. The search runs against the read index selected
    for ``schemata``, requests at most ``MAX_PAGE`` hits and restricts the
    returned source via ``_source_spec(includes, excludes)``. Hits that
    unpack to ``None`` are skipped. When ``cached`` is true, each entity is
    also written to the cache with a one hour expiry.
    """
    ids = ensure_list(ids)
    if len(ids) == 0:
        return
    index = entities_read_index(schema=schemata)
    body = {
        'query': {'ids': {'values': ids}},
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    response = es.search(index=index, body=body)
    hits = response.get('hits', {}).get('hits', [])
    for hit in hits:
        entity = unpack_result(hit)
        if entity is None:
            continue
        if cached:
            # Short expiry so that cached entities do not fill up the cache.
            cache_key = cache.object_key(Entity, entity.get('id'))
            cache.set_complex(cache_key, entity, expire=60 * 60)
        yield entity
Example #6
0
File: alerts.py Project: wdsn/aleph
def check_alert(alert_id):
    """Run the stored query of one alert and publish an event per hit.

    Returns without doing anything when the alert does not exist or has no
    role. Otherwise the query built by ``alert_query`` is searched against
    the read index for ``Entity.THING``; for each hit that unpacks to an
    entity a ``MATCH_ALERT`` event is published on the role's channel and
    the session is flushed. Finally the alert is updated, the session
    committed and closed.
    """
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    body = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    response = es.search(index=index, body=body)
    for hit in response.get('hits').get('hits', []):
        entity = unpack_result(hit)
        if entity is not None:
            log.info('Alert [%s]: %s', alert.query, entity.get('name'))
            params = {
                'alert': alert,
                'role': alert.role,
                'entity': entity.get('id')
            }
            publish(Events.MATCH_ALERT,
                    params=params,
                    channels=[alert.role])
            db.session.flush()

    alert.update()
    db.session.commit()
    db.session.close()
Example #7
0
    def __init__(self, request, query, parser=None, schema=None):
        """Load the entities referenced by the match results.

        After the parent initialiser has populated ``self.results``, the
        ``match_id`` and ``entity_id`` of every result are collected and
        fetched with a single multi-get. Each fetched entity is attached to
        the results referring to it as ``match.match`` / ``match.entity``.
        Results for which either side could not be attached are removed
        from ``self.results``.
        """
        super(MatchQueryResult, self).__init__(request, query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)

        # Without results there is nothing to resolve. Skip the request
        # entirely: Elasticsearch rejects a multi-get that names no
        # documents as an invalid request.
        if not ids:
            return

        result = es.mget(index=es_index,
                         doc_type=TYPE_ENTITY,
                         body={'ids': list(ids)})

        # Index the fetched entities by ID so that each result is resolved
        # with two lookups instead of scanning all results per document.
        entities = {}
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is not None:
                entities[entity['id']] = entity

        for match in self.results:
            if match.match_id in entities:
                match.match = entities[match.match_id]
            if match.entity_id in entities:
                match.entity = entities[match.entity_id]

        # Do not return results if the entity has been removed in the mean
        # time. Not sure this is the ideal way of doing this, as it'll mess
        # with pagination counts etc.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
Example #8
0
def check_alert(alert_id):
    """Run the stored query of one alert and publish an event per hit.

    Returns without doing anything when the alert does not exist, has no
    role, or its role is not alertable. Otherwise the query built by
    ``alert_query`` is searched against the read index for
    ``Entity.THING``; for each hit that unpacks to an entity a
    ``MATCH_ALERT`` event is published with the entity's ``uploader_id`` as
    actor. Finally the alert is updated, the session committed and closed.
    """
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None or not alert.role.is_alertable:
        return
    authz = Authz.from_role(alert.role)
    body = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    response = es.search(index=index, body=body)
    for hit in response.get('hits').get('hits', []):
        entity = unpack_result(hit)
        if entity is not None:
            log.info('Alert [%s]: %s', alert.query, entity.get('name'))
            params = {
                'alert': alert,
                'role': alert.role,
                'entity': entity
            }
            actor_id = entity.get('uploader_id')
            publish(Events.MATCH_ALERT, actor_id=actor_id, params=params)

    alert.update()
    db.session.commit()
    db.session.close()
Example #9
0
def iter_entities(
    authz=None,
    collection_id=None,
    schemata=None,
    includes=PROXY_INCLUDES,
    excludes=None,
    filters=None,
    sort=None,
):
    """Scan the entity index and yield every unpacked entity that matches.

    The query comes from ``_entities_query`` and the ``_source`` spec from
    ``_source_spec``. When ``sort`` is given it is added to the query (as a
    list) and the scan is asked to preserve order; otherwise order is not
    preserved. Results that unpack to ``None`` are not yielded.
    """
    body = {
        "query": _entities_query(filters, authz, collection_id, schemata),
        "_source": _source_spec(includes, excludes),
    }
    preserve_order = sort is not None
    if preserve_order:
        body["sort"] = ensure_list(sort)
    index = entities_read_index(schema=schemata)
    scanner = scan(
        es,
        index=index,
        query=body,
        timeout=MAX_TIMEOUT,
        request_timeout=MAX_REQUEST_TIMEOUT,
        preserve_order=preserve_order,
    )
    for hit in scanner:
        entity = unpack_result(hit)
        if entity is None:
            continue
        yield entity
Example #10
0
def iter_matches(collection, authz):
    """Scan the xref index for a collection's matches, unsorted.

    Results are filtered to ``collection.id`` and to what ``authz`` permits
    on the ``match_collection_id`` field. Every scanned result is yielded
    exactly as returned by ``unpack_result``.
    """
    own_collection = {'term': {'collection_id': collection.id}}
    visible = authz_query(authz, field='match_collection_id')
    body = {'query': {'bool': {'filter': [own_collection, visible]}}}
    for hit in scan(es, index=xref_index(), query=body):
        yield unpack_result(hit)
Example #11
0
def check_alert(alert_id):
    """Run the stored query of one alert and publish an event per hit.

    Returns without doing anything when the alert does not exist or has no
    role. If building or running the query raises ``RequestError``, the
    error is logged, the alert is deleted and the deletion committed. For
    each hit that unpacks to an entity a ``MATCH_ALERT`` event carrying the
    entity and collection IDs is published on the role's channel. Finally
    the alert is updated and the session committed.
    """
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    try:
        body = alert_query(alert, authz)
        index = entities_read_index(schema=Entity.THING)
        response = es.search(index=index, body=body)
    except RequestError as exc:
        log.error("Invalid query [%s]: %r", alert.query, exc.error)
        alert.delete()
        db.session.commit()
        return

    for hit in response.get("hits").get("hits", []):
        entity = unpack_result(hit)
        if entity is not None:
            log.info("Alert [%s]: %s", alert.query, entity.get("id"))
            params = {
                "alert": alert,
                "role": alert.role,
                "entity": entity.get("id"),
                "collection": entity.get("collection_id"),
            }
            publish(Events.MATCH_ALERT,
                    params=params,
                    channels=[alert.role])

    alert.update()
    db.session.commit()
Example #12
0
def get_collection(collection_id):
    """Look up one collection document in ``collections_index()``.

    The ``text`` field is excluded from the returned source and a 404
    response is ignored rather than raised. Returns whatever
    ``unpack_result`` makes of the response.
    """
    response = es.get(
        index=collections_index(),
        doc_type='doc',
        id=collection_id,
        ignore=[404],
        _source_exclude=['text'],
    )
    return unpack_result(response)
Example #13
0
def xref_collection(collection_id, other_id=None):
    """Run ``_xref_item`` on every entity indexed for a collection.

    Scans ``entities_index()`` for documents whose ``collection_id`` matches,
    leaving ``text`` and ``properties.*`` out of the source, and passes each
    unpacked result to ``_xref_item`` with ``other_id`` as its
    ``collection_id`` argument.
    """
    body = {
        'query': {'term': {'collection_id': collection_id}},
        '_source': {'excludes': ['text', 'properties.*']}
    }
    for hit in scan(es, index=entities_index(), query=body, scroll='30m'):
        _xref_item(unpack_result(hit), collection_id=other_id)
Example #14
0
def get_entity(entity_id):
    """Look up one entity document in ``entities_index()``.

    The ``text`` field is excluded from the returned source and a 404
    response is ignored rather than raised. Returns whatever
    ``unpack_result`` makes of the response.
    """
    response = es.get(
        index=entities_index(),
        doc_type='doc',
        id=entity_id,
        ignore=[404],
        _source_exclude=['text'],
    )
    return unpack_result(response)
Example #15
0
def get_xref(xref_id, collection_id=None):
    """Fetch a single xref match by ID from the xref index.

    When ``collection_id`` is given, the match must also belong to that
    collection. Returns the unpacked first hit, or ``None`` if the search
    has no hits.
    """
    filters = [{"ids": {"values": [xref_id]}}]
    if collection_id is not None:
        filters.append({"term": {"collection_id": collection_id}})
    body = {"query": {"bool": {"filter": filters}}, "size": 1}
    response = es.search(index=xref_index(), body=body)
    hits = response.get("hits", {}).get("hits", [])
    first = next(iter(hits), None)
    if first is None:
        return None
    return unpack_result(first)
Example #16
0
 def __init__(self, request, parser, result, schema=None):
     """Wrap a raw search response.

     Stores the response on ``self.result``, copies ``hits.total`` to
     ``self.total`` and appends every hit, after ``unpack_result``, to
     ``self.results``.
     """
     super(SearchQueryResult, self).__init__(
         request, parser=parser, schema=schema)
     self.result = result
     hits = self.result.get('hits', {})
     self.total = hits.get('total')
     for hit in hits.get('hits', []):
         unpacked = unpack_result(hit)
         self.results.append(unpacked)
Example #17
0
def get_xref(xref_id, collection_id=None):
    """Fetch a single xref match by ID from the xref index.

    When ``collection_id`` is given, the match must also belong to that
    collection. Returns the unpacked first hit, or ``None`` if the search
    has no hits.
    """
    filters = [{'ids': {'values': [xref_id]}}]
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    body = {'query': {'bool': {'filter': filters}}, 'size': 1}
    response = es.search(index=xref_index(), body=body)
    hits = response.get('hits', {}).get('hits', [])
    first = next(iter(hits), None)
    if first is None:
        return None
    return unpack_result(first)
Example #18
0
def iter_matches(collection, authz):
    """Scan the xref index for a collection's matches, unsorted.

    Results are filtered to ``collection.id`` and to what ``authz`` permits
    on the ``match_collection_id`` field. Every scanned result is yielded
    exactly as returned by ``unpack_result``.
    """
    own_collection = {"term": {"collection_id": collection.id}}
    visible = authz_query(authz, field="match_collection_id")
    body = {"query": {"bool": {"filter": [own_collection, visible]}}}
    for hit in scan(es, index=xref_index(), query=body):
        yield unpack_result(hit)
Example #19
0
def get_entity(entity_id):
    """Look up one entity of type ``TYPE_ENTITY`` in ``es_index``.

    A 404 response is ignored rather than raised. Returns the unpacked
    entity with its ``text`` key removed, or ``None`` when unpacking yields
    nothing.
    """
    response = es.get(
        index=es_index,
        doc_type=TYPE_ENTITY,
        id=entity_id,
        ignore=[404],
    )
    entity = unpack_result(response)
    if entity is None:
        return entity
    entity.pop('text', None)
    return entity
Example #20
0
def get_document(document_id):
    """Look up one document of type ``TYPE_DOCUMENT`` in ``es_index``.

    A 404 response is ignored rather than raised. Returns the unpacked
    document with its ``text`` key removed, or ``None`` when unpacking
    yields nothing.
    """
    response = es.get(
        index=es_index,
        doc_type=TYPE_DOCUMENT,
        id=document_id,
        ignore=[404],
    )
    document = unpack_result(response)
    if document is None:
        return document
    document.pop('text', None)
    return document
Example #21
0
def iter_records(document_id=None, collection_id=None):
    """Scan ``records_index()`` and yield each result after unpacking.

    ``document_id`` and ``collection_id`` each add a ``term`` filter when
    they are not ``None``; results are sorted by ``_doc``.
    """
    criteria = (
        ('document_id', document_id),
        ('collection_id', collection_id),
    )
    filters = [{'term': {field: value}}
               for field, value in criteria
               if value is not None]
    body = {'query': {'bool': {'filter': filters}}, 'sort': ['_doc']}
    for hit in scan(es, index=records_index(), query=body, scroll='1410m'):
        yield unpack_result(hit)
Example #22
0
 def __init__(self, request, parser, result):
     """Wrap a raw search response.

     Stores the response on ``self.result``, copies ``hits.total`` to
     ``self.total`` and appends every hit that unpacks to something other
     than ``None`` to ``self.results``.
     """
     super(SearchQueryResult, self).__init__(request, parser=parser)
     self.result = result
     hits = self.result.get('hits', {})
     self.total = hits.get('total')
     for hit in hits.get('hits', []):
         unpacked = unpack_result(hit)
         if unpacked is None:
             continue
         self.results.append(unpacked)
Example #23
0
def _counted_msearch(queries, authz, limit=0):
    """Run a batch of filter queries as one multi-search.

    ``queries`` maps ``(index, key)`` pairs to filter queries. With
    ``limit == 0`` (the default) only counts are wanted, so all filters for
    one index are combined into a single search; with any other limit each
    ``(index, key)`` gets its own search so that up to ``limit`` entities
    can be retrieved for it. Every search is additionally filtered by
    ``authz_query(authz)`` and carries a ``filters`` aggregation named
    ``counts`` keyed like the input.

    Returns ``(entities, counts)``: the unpacked hits of all responses in
    one flat list, and a dict mapping each key to its ``doc_count``.
    """
    counts_only = limit == 0

    # Counting can be batched per index; retrieving entities needs one
    # search per filter.
    grouped = {}
    for (index, key), query in sorted(queries.items()):
        group_key = index if counts_only else (index, key)
        group = grouped.setdefault(
            group_key, {"index": index, "filters": [], "counts": {}}
        )
        group["filters"].append(query)
        group["counts"][key] = query

    log.debug("Counts: %s queries, %s groups", len(queries), len(grouped))

    body = []
    for group in grouped.values():
        body.append({"index": group.get("index")})
        filters = group.get("filters")
        if counts_only and len(filters) > 1:
            any_of = {"bool": {"should": filters, "minimum_should_match": 1}}
            filters = [any_of]
        filters.append(authz_query(authz))
        body.append(
            {
                "size": limit,
                "query": {"bool": {"filter": filters}},
                "aggs": {
                    "counts": {"filters": {"filters": group.get("counts")}}
                },
                "_source": ENTITY_SOURCE,
            }
        )

    # FIXME: This doesn't actually retain context on which query a particular
    # entity is a result from. Doesn't matter if all we do in the end is stuff
    # everything into an FtMGraph and then traverse for adjacency.
    entities = []
    counts = {}
    if not body:
        return entities, counts

    response = es.msearch(body=body)
    for resp in response.get("responses", []):
        hits = resp.get("hits", {}).get("hits", [])
        for hit in hits:
            entities.append(unpack_result(hit))
        aggregations = resp.get("aggregations", {})
        buckets = aggregations.get("counts", {}).get("buckets", {})
        for key, bucket in buckets.items():
            counts[key] = bucket.get("doc_count", 0)
    return entities, counts
Example #24
0
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan the entity index and yield every unpacked entity that matches.

    The query comes from ``_entities_query`` and the ``_source`` spec from
    ``_source_spec``; the scan runs on the read index selected for
    ``schemata``. Results that unpack to ``None`` are not yielded.
    """
    body = {
        'query': _entities_query(authz, collection_id, schemata),
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    scanner = scan(es, index=index, query=body, scroll='1410m')
    for hit in scanner:
        entity = unpack_result(hit)
        if entity is None:
            continue
        yield entity
Example #25
0
File: result.py Project: pudo/aleph
 def __init__(self, request, parser, result):
     """Wrap a raw search response.

     Stores the response on ``self.result``, reads ``value`` and
     ``relation`` from ``hits.total`` into ``self.total`` and
     ``self.total_type``, and appends every hit that unpacks to something
     other than ``None`` to ``self.results``.
     """
     super(SearchQueryResult, self).__init__(request, parser=parser)
     self.result = result
     hits = self.result.get('hits', {})
     total = hits.get('total', {})
     self.total = total.get('value')
     self.total_type = total.get('relation')
     for hit in hits.get('hits', []):
         unpacked = unpack_result(hit)
         if unpacked is None:
             continue
         self.results.append(unpacked)
Example #26
0
 def __init__(self, request, parser, result):
     """Wrap a raw search response.

     Stores the response on ``self.result``, reads ``value`` and
     ``relation`` from ``hits.total`` into ``self.total`` and
     ``self.total_type``, and appends every hit that unpacks to something
     other than ``None`` to ``self.results``.
     """
     super(SearchQueryResult, self).__init__(request, parser=parser)
     self.result = result
     hits = self.result.get("hits", {})
     total = hits.get("total", {})
     self.total = total.get("value")
     self.total_type = total.get("relation")
     for hit in hits.get("hits", []):
         unpacked = unpack_result(hit)
         if unpacked is None:
             continue
         self.results.append(unpacked)
Example #27
0
def entity_matches(result):
    """Yield one match description dict per hit of a search response.

    Each hit is unpacked and wrapped in a proxy; the dict carries the
    proxy's id and caption, its type as given by ``get_freebase_type``, the
    hit's ``_score``, a relative entity URL, and ``match`` set to ``False``.
    """
    hits = result.get('hits').get('hits')
    for hit in hits:
        proxy = model.get_proxy(unpack_result(hit))
        match = {
            'id': proxy.id,
            'name': proxy.caption,
            'n:type': get_freebase_type(proxy.schema),
            'type': [get_freebase_type(proxy.schema)],
            'r:score': hit.get('_score'),
            'uri': entity_url(proxy.id, _relative=True),
            'match': False
        }
        yield match
Example #28
0
    def _resolve_index(self, cache):
        """Fill ``cache`` with the index documents behind its keys.

        Every ``(type, id)`` key whose type is not ``Role`` is requested in
        one multi-get (``text`` excluded from the source), using the type as
        index name. Each returned document is unpacked and stored in
        ``cache`` under its ``(_index, _id)`` pair. No request is made when
        there is nothing to fetch.
        """
        docs = [{'_index': type_, '_id': id_}
                for (type_, id_) in cache.keys()
                if type_ != Role]
        if len(docs) == 0:
            return

        response = es.mget(body={'docs': docs}, _source_exclude=['text'])
        for doc in response['docs']:
            key = (doc['_index'], doc['_id'])
            cache[key] = unpack_result(doc)
Example #29
0
def entity_matches(result):
    """Yield one match description dict per hit of a search response.

    Each hit is unpacked and wrapped in a proxy; the dict carries the
    proxy's id and caption, its type as given by ``get_freebase_type``, the
    hit's ``_score``, a relative entity URL, and ``match`` set to ``False``.
    """
    hits = result.get("hits").get("hits")
    for hit in hits:
        proxy = model.get_proxy(unpack_result(hit))
        match = {
            "id": proxy.id,
            "name": proxy.caption,
            "n:type": get_freebase_type(proxy.schema),
            "type": [get_freebase_type(proxy.schema)],
            "r:score": hit.get("_score"),
            "uri": entity_url(proxy.id, _relative=True),
            "match": False,
        }
        yield match
Example #30
0
def get_entity(entity_id):
    """Look up an entity by ID across all entity indexes.

    Each index from ``entities_index_list()`` is queried in turn (``text``
    excluded, 404 ignored) and the first response that unpacks to something
    other than ``None`` is returned. Returns ``None`` when ``entity_id`` is
    ``None`` or no index yields a result.
    """
    if entity_id is None:
        return None
    for index in entities_index_list():
        response = es.get(
            index=index,
            doc_type='doc',
            id=entity_id,
            ignore=[404],
            _source_exclude=['text'],
        )
        entity = unpack_result(response)
        if entity is None:
            continue
        return entity
    return None
Example #31
0
def entity_matches(result):
    """Yield one match description dict per hit of a search response.

    Each hit is unpacked and wrapped in a proxy; the dict carries the
    proxy's id and caption, its type as given by ``get_freebase_type``, the
    hit's ``_score``, a relative entity URL, and ``match`` set to ``False``.
    """
    hits = result.get('hits').get('hits')
    for hit in hits:
        proxy = model.get_proxy(unpack_result(hit))
        match = {
            'id': proxy.id,
            'name': proxy.caption,
            'n:type': get_freebase_type(proxy.schema),
            'type': [get_freebase_type(proxy.schema)],
            'r:score': hit.get('_score'),
            'uri': entity_url(proxy.id, _relative=True),
            'match': False
        }
        yield match
Example #32
0
def render_notification(stub, notification):
    """Generate a text version of the notification, suitable for use
    in an email or text message.

    The notification is unpacked and its event looked up with
    ``Events.get``. All parameters yielded by ``_iter_params`` are queued on
    the resolver and resolved in one go. Every ``{{name}}`` placeholder in
    the event template is then replaced: with ``html_link(title, link)`` in
    the HTML variant and with the quoted title in the plain variant. The
    link of the parameter named by ``event.link_to`` is also appended to
    the plain text.

    Returns a dict with ``plain`` and ``html`` keys, or ``None`` when the
    event is unknown or any parameter cannot be resolved.
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid an import cycle; not verifiable from here).
    from aleph.logic import resolver

    notification = unpack_result(notification)
    event = Events.get(notification.get("event"))
    if event is None:
        return

    # First pass: queue everything so the resolver can fetch in bulk.
    for name, clazz, value in _iter_params(notification, event):
        resolver.queue(stub, clazz, value)
    resolver.resolve(stub)

    plain = str(event.template)
    html = str(event.template)
    # Second pass: substitute the resolved objects into the template.
    for name, clazz, value in _iter_params(notification, event):
        data = resolver.get(stub, clazz, value)
        if data is None:
            return
        link, title = None, None
        if clazz == Role:
            title = data.get("label")
        elif clazz == Alert:
            title = data.get("query")
        elif clazz == Collection:
            title = data.get("label")
            link = collection_url(value)
        elif clazz == Entity:
            proxy = model.get_proxy(data)
            title = proxy.caption
            link = entity_url(value)
        elif clazz == EntitySet:
            title = data.label
            link = entityset_url(data.id)
        elif clazz == Export:
            title = data.get("label")
            # Exports link to the download endpoint. The archive URL that
            # used to be built here was overwritten before it was ever used
            # (and was given the file name as its MIME type).
            link = url_for("exports_api.download", export_id=data.get("id"))

        template = "{{%s}}" % name
        html = html.replace(template, html_link(title, link))
        plain = plain.replace(template, "'%s'" % title)
        if name == event.link_to:
            plain = "%s (%s)" % (plain, link)
    return {"plain": plain, "html": html}
Example #33
0
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan the entity index and yield every unpacked entity that matches.

    Each criterion that is supplied adds one clause to a ``bool`` filter:
    ``authz`` through ``authz_query``, ``collection_id`` as a ``term`` and
    ``schemata`` as a ``terms`` clause. ``includes``/``excludes`` are turned
    into the ``_source`` spec by ``_source_spec``. Results that unpack to
    ``None`` are not yielded.
    """
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    schema_names = ensure_list(schemata)
    if schema_names:
        filters.append({'terms': {'schemata': schema_names}})
    body = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    scanner = scan(es, index=index, query=body, scroll='1410m')
    for hit in scanner:
        entity = unpack_result(hit)
        if entity is None:
            continue
        yield entity
Example #34
0
File: xref.py Project: pudo/aleph
def xref_item(proxy, collection_ids=None):
    """Search for cross-reference candidates of ``proxy`` and score them.

    Nothing is yielded when ``match_query`` produces the ``none_query()``
    placeholder. Otherwise up to 100 hits are fetched from the read index of
    the proxy's matchable schemata, each hit is turned into a proxy and
    compared against ``proxy``. For every candidate scoring at least
    ``SCORE_CUTOFF`` a ``(score, collection_id, other)`` tuple is yielded.
    """
    query = match_query(proxy, collection_ids=collection_ids)
    if query == none_query():
        return

    body = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    schemata = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, data.get('collection_id'), other
Example #35
0
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Yield the unpacked entities found by an ``ids`` search.

    ``ids`` is normalised with ``ensure_list``; an empty list yields nothing
    and no search is issued. The search runs against the read index selected
    for ``schemata``, requests at most ``MAX_PAGE`` hits and restricts the
    returned source via ``_source_spec(includes, excludes)``. Hits that
    unpack to ``None`` are skipped. When ``cached`` is true, each entity is
    also written to the cache with a one hour expiry.
    """
    ids = ensure_list(ids)
    if len(ids) == 0:
        return
    index = entities_read_index(schema=schemata)
    body = {
        'query': {'ids': {'values': ids}},
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    response = es.search(index=index, body=body)
    hits = response.get('hits', {}).get('hits', [])
    for hit in hits:
        entity = unpack_result(hit)
        if entity is None:
            continue
        if cached:
            # Short expiry so that cached entities do not fill up the cache.
            cache_key = cache.object_key(Entity, entity.get('id'))
            cache.set_complex(cache_key, entity, expire=60 * 60)
        yield entity