Example #1
def get_instance_stats(authz):
    # Compute entity stats:
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter':
                [authz_query(authz), {
                    'term': {
                        'schemata': Entity.THING
                    }
                }]
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            }
        }
    }
    result = es.search(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    # Compute collection stats (should we return categories?)
    query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    result = es.search(index=collections_index(), body=query)
    data['collections'] = result.get('hits').get('total')
    return data
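
A hypothetical caller, assuming an Authz object is already at hand (names illustrative):

# Illustrative usage only: print the entity count per schema.
stats = get_instance_stats(authz)
print('total:', stats['count'])
for schema, count in stats['schemata'].items():
    print(schema, count)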
Example #2
def execute_tabular_query(document_id, table_id, args, query):
    """ Execute a query against records and return a set of results. """
    result = es.search(index=es_index, doc_type=TYPE_RECORD, body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query['from'],
        'limit': query['size'],
        'total': hits.get('total'),
        'next': None
    }
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('table.rows',
                                 document_id=document_id,
                                 table_id=table_id,
                                 **params)

    for rec in hits.get('hits', []):
        source = rec.get('_source', {})
        record = source.get('raw')
        record['_id'] = source.get('row_id')
        output['results'].append(record)
    return output
Example #3
def get_sitemap_entities(collection_id):
    filters = [
        {
            'term': {
                'collection_id': collection_id
            }
        },
        {
            'term': {
                'schemata': Entity.THING
            }
        },
    ]
    query = {
        'query': {
            'bool': {
                'filter': filters
            }
        },
        'size': MAX_PAGE,
        'sort': [{
            'updated_at': 'desc'
        }],
        '_source': {
            'includes': ['schema', 'updated_at']
        }
    }
    index = entities_read_index(Entity.THING)
    res = es.search(index=index, body=query)
    for hit in res.get('hits', {}).get('hits', []):
        source = hit.get('_source')
        source['id'] = hit.get('_id')
        yield source
Example #4
def _query_item(entity):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(entity)
    if query == none_query():
        return

    query = {
        "query": query,
        "size": 100,
        "_source": {
            "includes": PROXY_INCLUDES
        }
    }
    matchable = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    for result in result.get("hits").get("hits"):
        result = unpack_result(result)
        if result is None:
            continue
        match = model.get_proxy(result)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption, score,
                      match.caption)
            yield score, entity, result.get("collection_id"), match
Example #5
def entities_by_ids(ids,
                    schemata=None,
                    cached=False,
                    includes=None,
                    excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity
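
A hypothetical call site for entities_by_ids; the IDs and the 'Person' schema below are illustrative, not taken from the snippet:

# Illustrative usage only; IDs and schema name are made up.
for entity in entities_by_ids(['id-1', 'id-2'], schemata='Person',
                              includes=['schema', 'properties'],
                              cached=True):
    print(entity.get('id'), entity.get('schema'))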
Example #6
def get_notifications(role, since=None, parser=None):
    """Fetch a stream of notifications for the given role."""
    channels = get_role_channels(role)
    filters = [{'terms': {'channels': channels}}]
    if since is not None:
        filters.append({'range': {'created_at': {'gt': since}}})
    must_not = [{'term': {'actor_id': role.id}}]
    query = {
        'size': 30,
        'query': {
            'bool': {
                'filter': filters,
                'must_not': must_not
            }
        },
        'sort': [{
            'created_at': {
                'order': 'desc'
            }
        }]
    }
    if parser is not None:
        query['size'] = parser.limit
        query['from'] = parser.offset
    return es.search(index=notifications_index(), body=query)
Example #7
def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    try:
        query = alert_query(alert, authz)
        index = entities_read_index(schema=Entity.THING)
        result = es.search(index=index, body=query)
    except RequestError as re:
        log.error("Invalid query [%s]: %r", alert.query, re.error)
        alert.delete()
        db.session.commit()
        return

    for result in result.get("hits").get("hits", []):
        entity = unpack_result(result)
        if entity is None:
            continue
        log.info("Alert [%s]: %s", alert.query, entity.get("id"))
        params = {
            "alert": alert,
            "role": alert.role,
            "entity": entity.get("id"),
            "collection": entity.get("collection_id"),
        }
        channels = [alert.role]
        # channels.append(channel_tag(collection_id, Collection))
        publish(Events.MATCH_ALERT, params=params, channels=channels)

    alert.update()
    db.session.commit()
Example #8
 def search(self):
     """Execute the query as assmbled."""
     # log.info("Search index: %s", self.get_index())
     result = es.search(index=self.get_index(), body=self.get_body())
     log.info("Took: %sms", result.get('took'))
     # log.info("%s", pformat(result.get('profile')))
     return result
Example #9
def get_collection_facet(collection_id, facet, refresh=False):
    """Compute some statistics on the content of a collection."""
    key = cache.object_key(Collection, collection_id, facet)
    data = cache.get_complex(key)
    if not refresh and data is not None:
        return data

    query = {'term': {'collection_id': collection_id}}
    query = {
        'size': 0,
        'query': {'bool': {'filter': [query]}},
        'aggs': {
            'values': {'terms': {'field': facet, 'size': 300}},
            'total': {'cardinality': {'field': facet}}
        }
    }
    schemata = set()
    facet_type = registry.groups.get(facet)
    if facet_type is not None:
        schemata = model.get_type_schemata(facet_type)
    result = es.search(index=entities_read_index(schema=schemata),
                       body=query,
                       request_timeout=3600,
                       timeout='20m')
    aggregations = result.get('aggregations')
    values = {}
    for bucket in aggregations.get('values').get('buckets', []):
        values[bucket['key']] = bucket['doc_count']
    data = {
        'values': values,
        'total': aggregations.get('total').get('value', 0)
    }
    cache.set_complex(key, data, expires=cache.EXPIRE)
    return data
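
A hedged usage sketch, assuming 'countries' is one of the registered group facets:

# Illustrative only: top countries in a collection, forcing a cache refresh.
facet = get_collection_facet(collection_id, 'countries', refresh=True)
print(facet['total'], facet['values'])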
Example #10
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {'match_phrase_prefix': {'name': prefix.strip()}}
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})

        # TODO: is this correct? should we allow filter by dataset entities?
        q = add_filter(q, {'terms': {'collection_id': authz.collections_read}})

        q = {
            'size': size,
            'sort': [{
                'doc_count': 'desc'
            }, '_score'],
            'query': q,
            '_source': ['name', 'schema', 'fingerprints', 'doc_count']
        }
        ref = ascii_text(prefix)
        result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [ascii_text(t) for t in ent.pop('fingerprints', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'prefix': prefix, 'results': options}
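
A hypothetical invocation of the auto-complete logic (prefix and parameters made up):

# Illustrative usage only.
matches = suggest_entities('Vla', authz, min_count=2, size=5)
for option in matches['results']:
    print(option['name'], option['score'], option['match'])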
Example #11
def get_instance_stats(authz):
    query = {
        'size': 0,
        'query': {
            'terms': {
                'roles': list(authz.roles)
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            }
        }
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    return data
Example #12
 def search(self):
     """Execute the query as assmbled."""
     # pprint(self.get_body())
     return es.search(index=self.get_index(),
                      body=self.get_body(),
                      request_cache=True,
                      request_timeout=REQUEST_TIMEOUT)
Example #13
def entities_by_ids(ids,
                    schemata=None,
                    cached=False,
                    includes=None,
                    excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            if cached:
                _cache_entity(entity)
            yield entity
Example #14
def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    if not alert.role.is_alertable:
        return
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {
            'alert': alert,
            'role': alert.role,
            'entity': entity
        }
        publish(Events.MATCH_ALERT,
                actor_id=entity.get('uploader_id'),
                params=params)

    alert.update()
    db.session.commit()
    db.session.close()
Example #15
def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {
            'alert': alert,
            'role': alert.role,
            'entity': entity.get('id')
        }
        publish(Events.MATCH_ALERT, params=params, channels=[alert.role])
        db.session.flush()

    alert.update()
    db.session.commit()
    db.session.close()
Example #16
def get_instance_stats(authz):
    query = {
        'size': 0,
        'query': {
            'terms': {
                'roles': list(authz.roles)
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'types': {
                'terms': {
                    'field': '_type',
                    'size': len(TYPES)
                }
            }
        }
    }
    result = es.search(index=es_index, doc_type=TYPES.keys(), body=query)
    aggregations = result.get('aggregations')
    data = {'$total': result.get('hits').get('total'), '$schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['$schemata'][key] = schema.get('doc_count')

    for doc_type in aggregations.get('types').get('buckets'):
        key = TYPES.get(doc_type.get('key'))
        data[key] = doc_type.get('doc_count')
    return data
Example #17
 def search(self):
     """Execute the query as assmbled."""
     # log.info("Search index: %s", self.get_index())
     result = es.search(index=self.get_index(), body=self.get_body())
     log.info("[%s] took: %sms", self.to_text(), result.get("took"))
     # log.info("%s", pformat(self.get_body()))
     # log.info("%s", pformat(self.parser.filters))
     return result
Example #18
 def search(self):
     """Execute the query as assmbled."""
     # log.info("Search index: %s", self.get_index())
     result = es.search(index=self.get_index(),
                        body=self.get_body())
     log.info("Took: %sms", result.get('took'))
     # log.info("%s", pformat(result.get('profile')))
     return result
Example #19
def console():
    query = {
        'query': {
            'term': {  # 'collection': 'sec-edgar',
                'filed_at': '20160216'
            }
        }
    }
    res = es.search(body=query, index='aleph')
    import ipdb; ipdb.set_trace()
Example #20
def get_dataset_countries(dataset_name):
    """Create a list of the top 300 countries mentioned in a dataset."""
    q = {'term': {'dataset': dataset_name}}
    aggs = {'countries': {'terms': {'field': 'countries', 'size': 300}}}
    q = {'size': 0, 'query': q, 'aggregations': aggs}
    result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
    result = result.get('aggregations', {}).get('countries', {})
    return [b.get('key') for b in result.get('buckets', [])]
Example #21
def execute_documents_query(args, q):
    """ Execute the query and return a set of results. """
    result = es.search(index=es_index, doc_type=TYPE_DOCUMENT, body=q)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': q['from'],
        'limit': q['size'],
        'total': hits.get('total'),
        'next': None,
        'facets': {},
        'watchlists': {}
    }
    convert_aggregations(result, output, args)
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('search.query', **params)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], args)
        if sq is not None:
            # msearch requires a header line before each query body:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        document['api_url'] = url_for('document.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('document.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    if len(sub_queries):
        res = es.msearch(index=es_index, doc_type=TYPE_RECORD,
                         body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', []):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    record['text'] = hit.get('highlight', {}).get('text')
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)

    return output
Example #22
def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{"ids": {"values": [xref_id]}}]
    if collection_id is not None:
        filters.append({"term": {"collection_id": collection_id}})
    query = {"query": {"bool": {"filter": filters}}, "size": 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get("hits", {}).get("hits", []):
        return unpack_result(doc)
Example #23
 def search(self):
     """Execute the query as assmbled."""
     result = es.search(index=self.get_index(),
                        body=self.get_body(),
                        request_cache=True,
                        request_timeout=REQUEST_TIMEOUT)
     log.info("Took: %sms", result.get('took'))
     # log.info("%s", pformat(result))
     return result
Example #24
def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{'ids': {'values': [xref_id]}}]
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    query = {'query': {'bool': {'filter': filters}}, 'size': 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get('hits', {}).get('hits', []):
        return unpack_result(doc)
Example #25
def search_safe(*args, **kwargs):
    # Not meant for every search call site, only for backend searches
    # where backing off and retrying does not hurt the user experience.
    for attempt in count():
        try:
            return es.search(*args, **kwargs)
        except Exception as exc:
            log.warning("Search error: %s", exc)
        backoff_cluster(failures=attempt)
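
search_safe leans on a backoff_cluster helper that is not shown in these examples; a minimal sketch, assuming it only needs to sleep with exponential backoff between retries:

import time

def backoff_cluster(failures=0):
    # Hypothetical stand-in: sleep longer after each failure, capped at 60s.
    time.sleep(min(2 ** failures, 60))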
Example #26
def console():
    query = {
        'query': {
            'term': {  #'collection': 'sec-edgar',
                'filed_at': '20160216'
            }
        }
    }
    res = es.search(body=query, index='aleph')
    import ipdb
    ipdb.set_trace()
Example #27
 def result(self):
     if self._result is None:
         q = self.query.copy()
         q['from'] = self._offset
         q['size'] = self._limit
         # HACKY: skip aggregations on anything but the first page.
         if q['from'] > 0:
             del q['aggregations']
         self._result = es.search(index=es_index,
                                  doc_type=self.doc_type,
                                  body=q)
     return self._result
Example #28
 def result(self):
     if self._result is None:
         q = self.query.copy()
         q['from'] = self._offset
         q['size'] = self._limit
         # HACKY: skip aggregations on anything but the first page.
         if q['from'] > 0:
             del q['aggregations']
         self._result = es.search(index=es_index,
                                  doc_type=self.doc_type,
                                  body=q)
     return self._result
Example #29
 def search(self):
     """Execute the query as assmbled."""
     # log.info("Search index: %s", self.get_index())
     result = es.search(index=self.get_index(), body=self.get_body())
     # log.info(
     #     f"Elasticsearch query [{self.to_text()}] took {result.get('took')}ms",
     #     query=self.to_text(),
     #     took=result.get("took"),
     # )
     # log.info("%s", pformat(self.get_body()))
     # log.info("%s", pformat(self.parser.filters))
     return result
Example #30
def peek_query(state):
    """Peek into hidden collections.

    This allows users to retrieve an approximate result count of a given query
    against those collections which they are not authorised to view. It is a
    rudimentary collaboration mechanism.
    """
    filters = state.filters
    cq = Collection.all()
    cq = cq.filter(not_(Collection.id.in_(state.authz.collections_read)))
    cq = cq.filter(Collection.creator_id != None)  # noqa
    cq = cq.filter(Collection.private != True)  # noqa
    collections = {c.id: c for c in cq}
    filters['collection_id'] = collections.keys()

    q = text_query(state.text)
    q = {
        'query': filter_query(q, filters),
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 1000}
            }
        },
        '_source': False
    }
    result = es.search(index=es_index, body=q, doc_type=TYPE_DOCUMENT)
    roles = {}
    total = 0
    aggs = result.get('aggregations', {}).get('collections', {})
    for bucket in aggs.get('buckets', []):
        collection = collections.get(bucket.get('key'))
        if collection is None or collection.creator is None:
            continue
        total += bucket.get('doc_count')
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
Example #31
def execute_basic(doc_type, query):
    """Common part of running a particular query."""
    result = es.search(index=es_index, doc_type=doc_type, body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query.get('from', 0),
        'limit': query.get('size'),
        'total': hits.get('total'),
        'next': None
    }
    return result, hits, output
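
A sketch of how a caller might build on execute_basic; the match_all query is an assumption for illustration:

# Illustrative caller: fetch the first ten documents.
query = {'from': 0, 'size': 10, 'query': {'match_all': {}}}
result, hits, output = execute_basic(TYPE_DOCUMENT, query)
for doc in hits.get('hits', []):
    output['results'].append(doc.get('_source'))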
Example #32
def raw_iter(query, total=10000):
    for page in count(0):
        query['from'] = PAGE * page
        if query['from'] >= total:
            return
        query['size'] = PAGE
        result = es.search(index=es_index, doc_type=DOC_TYPE, body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc

        if hits.get('total', 0) <= PAGE * (page + 1):
            return
Example #33
def get_instance_stats(authz):
    # Compute entity stats:
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    # {'term': {'schemata': Entity.THING}}
                ]
            }
        }
    }
    entities = es.search(index=entities_index(), body=query)

    # Compute collection stats (should we return categories?)
    query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    collections = es.search(index=collections_index(), body=query)
    return {
        'entities': entities.get('hits').get('total'),
        'collections': collections.get('hits').get('total')
    }
Example #34
def raw_iter(query):
    for page in count(0):
        query['from'] = PAGE * page
        query['size'] = PAGE
        result = es.search(index=es_index,
                           doc_type=TYPE_DOCUMENT,
                           body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc

        if hits.get('total', 0) <= PAGE * (page + 1):
            return
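
raw_iter pages with from/size, which Elasticsearch caps by default; a rough scroll-based equivalent using the official client's helpers.scan, reusing the es and es_index names from the snippets:

from elasticsearch import helpers

def raw_iter_scan(query):
    # Streams every matching document via the scroll API instead of
    # from/size paging, so no result-window limit applies.
    for doc in helpers.scan(es, index=es_index, query=query):
        yield doc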
Example #35
def get_notifications(role, since=None):
    """Fetch a stream of notifications for the given role."""
    channels = get_role_channels(role)
    filters = [{"terms": {"channels": channels}}]
    if since is not None:
        filters.append({"range": {"created_at": {"gt": since}}})
    must_not = [{"term": {"actor_id": role.id}}]
    query = {
        "size": 30,
        "query": {"bool": {"filter": filters, "must_not": must_not}},
        "sort": [{"created_at": {"order": "desc"}}],
    }
    return es.search(index=notifications_index(), body=query)
Example #36
def search_safe(*args, **kwargs):
    # Not meant for every search call site, only for backend searches
    # where backing off and retrying does not hurt the user experience.
    for attempt in range(REQUEST_RETRIES):
        try:
            kwargs['doc_type'] = 'doc'
            return es.search(*args, **kwargs)
        except RequestError:
            raise
        except Exception as exc:
            log.warning("Search error: %s", exc)
        backoff_cluster(failures=attempt)
Example #37
def random_docs(howmany=100, offset=0):
    index = 'aleph'
    query = {
        "from": offset,
        "size": howmany,
        "query": {
            "function_score": {
                "functions": [{
                    "random_score": {"seed": 42}}]
            }
        }
    }
    results = es.search(index=index, body=query)
    return results['hits']['hits']
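
The function_score/random_score combination returns a stable shuffle for a fixed seed; a hypothetical call:

# Illustrative usage only: fetch ten random documents and print their IDs.
for hit in random_docs(howmany=10):
    print(hit['_id'], hit['_score'])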
Example #38
def raw_iter(query, total=10000):
    for page in count(0):
        query['from'] = PAGE * page
        if query['from'] >= total:
            return
        query['size'] = PAGE
        result = es.search(index=es_index,
                           doc_type=DOC_TYPE,
                           body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc

        if hits.get('total', 0) <= PAGE * (page + 1):
            return
Example #39
def replace_es(query, updatefunc, index='aleph_test', howmany=10):
    perpage = 50
    start = 522050
    for offset in range(start, howmany, perpage):
        print('# %s' % offset)
        results = es.search(
            index=index,
            body=query,
            from_=offset,
            size=min(perpage, howmany))
        for result in results['hits']['hits']:
            newbody = updatefunc(result['_source'])
            if not newbody:
                print('skipping item')
                continue
            updated = es.index(
                index=result['_index'],
                doc_type=result['_type'],
                id=result['_id'],
                body=newbody)
            assert updated['created'] is False
Example #40
def xref_item(proxy, collection_ids=None):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(proxy, collection_ids=collection_ids)
    if query == none_query():
        return

    query = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    matchable = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    results = result.get('hits', {}).get('hits', [])
    for doc in results:
        doc = unpack_result(doc)
        if doc is not None:
            other = model.get_proxy(doc)
            score = compare(model, proxy, other)
            if score >= SCORE_CUTOFF:
                yield score, doc.get('collection_id'), other
Example #41
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    log.info("Generating collection stats: %s", collection_id)
    query = {'term': {'collection_id': collection_id}}
    query = {
        'size': 0,
        'query': {'bool': {'filter': [query]}},
        'aggs': {
            'schemata': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 1000}},
            'languages': {'terms': {'field': 'languages', 'size': 1000}},
        }
    }
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    aggregations = result.get('aggregations', {})
    data = {'count': 0}
    for facet in ['schemata', 'countries', 'languages']:
        data[facet] = {}
        for bucket in aggregations.get(facet, {}).get('buckets', []):
            data[facet][bucket['key']] = bucket['doc_count']
    if len(data['schemata']):
        data['count'] = sum(data['schemata'].values())
    return data
Example #42
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity
Example #43
def available_attributes(args, sources=None, lists=None):
    q = attributes_query(args, sources=sources, lists=lists)
    result = es.search(index=es_index, doc_type=DOC_TYPE, body=q)
    result = result.get("aggregations", {}).get("attributes", {})
    result = {r.get("key"): False for r in result.get("buckets", [])}
    return {"fields": CORE_FIELDS, "attributes": result}
Example #44
def available_attributes(args, sources=None, lists=None):
    q = attributes_query(args, sources=sources, lists=lists)
    result = es.search(index=es_index, doc_type=DOC_TYPE, body=q)
    result = result.get('aggregations', {}).get('attributes', {})
    result = {r.get('key'): False for r in result.get('buckets', [])}
    return {'fields': CORE_FIELDS, 'attributes': result}