Beispiel #1
0
def statistics():
    """Return document, entity and readable-collection counts as JSON.

    Both queries are issued with ``limit=0`` and only their ``total`` value
    is read. The collection count is the length of the requesting user's
    ``collections_read``.
    """
    enable_cache(vary_user=True)
    authz = request.authz
    document_hits = documents_query(QueryState({}, authz, limit=0))
    entity_hits = entities_query(QueryState({}, authz, limit=0))
    payload = {
        'documents_count': document_hits.get('total'),
        'entities_count': entity_hits.get('total'),
        'collections_count': len(authz.collections_read)
    }
    return jsonify(payload)
Beispiel #2
0
def generate_leads(entity_id):
    """Index leads (likely duplicates) for the entity with the given ID.

    Any existing leads for the entity are deleted first. Nothing new is
    indexed when the entity cannot be loaded or has no ``collection_id``.
    """
    # Always clear old leads first, so deleted entities are cleaned up too.
    delete_entity_leads(entity_id)

    entity = load_entity(entity_id)
    # Skip entities that could not be loaded or are not in a collection.
    if entity is None or not entity.get('collection_id'):
        return

    log.debug("Generating leads for [%(id)s]: %(name)s", entity)
    authz = Authz(override=True)
    judgements = EntityIdentity.judgements_by_entity(entity_id)
    candidates = similar_entities(entity, QueryState({}, authz, limit=100))
    for match in candidates.get('results', []):
        score = entity_distance(entity, match)
        log.debug(" -[%.2f]-> %s", score, match.get('name'))
        match_id = match.get('id')
        # TODO: implement some cut-off
        lead = {
            'entity_id': entity.get('id'),
            'entity_collection_id': entity.get('collection_id'),
            'score': score,
            # Pairs without a recorded judgement default to 0.
            'judgement': judgements.get(match_id, 0),
            'match_id': match_id,
            'schema': match.get('schema'),
            'schemata': match.get('schemata'),
            'collection_id': match.get('collection_id'),
            'dataset': match.get('dataset'),
            'roles': match.get('roles')
        }
        index_lead(lead)
Beispiel #3
0
def reconcile_op(query):
    """Reconcile operation for a single query.

    Builds a stand-in entity from the query's name, type and properties,
    looks up similar entities and converts them into match records.

    :param query: dict read for the keys ``query`` (the name to match),
        ``limit`` (defaults to ``'5'``), ``type`` and ``properties`` (a list
        of dicts with ``pid`` and ``v``).
    :return: dict with ``result`` (the list of matches) and ``num`` (its
        length).
    """
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [make_fingerprint(name)],
        'schemata': ensure_list(query.get('type'))
    }

    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    suggested = similar_entities(entity, state)
    # A response without 'results' is treated as "no matches" instead of
    # failing on iteration over None.
    results = suggested.get('results') or []
    # The type list does not depend on the result, so fetch it once (and
    # only if there is something to match) rather than once per result.
    freebase_types = list(get_freebase_types()) if results else []

    matches = []
    for ent in results:
        types = [t for t in freebase_types if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            # Scores are scaled by 10 and capped at 100; a missing score
            # counts as 0 rather than raising a TypeError.
            'score': min(100, (ent.get('score') or 0) * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Beispiel #4
0
def check_role_alerts(authz):
    """Run all alerts of ``authz.role`` and notify the role about new hits.

    Each alert is turned into a document query that is passed
    ``since=alert.notified_at``. Alerts whose query reports a total of zero
    are skipped; the others are updated and an e-mail is rendered and sent.
    Errors while rendering or sending are logged and do not stop the loop.
    The database session is committed once after all alerts were handled.
    """
    role = authz.role
    alerts = Alert.by_role(role).all()
    if not alerts:
        return
    log.info('Alerting %r, %d alerts...', role, len(alerts))
    for alert in alerts:
        state = QueryState({
            'q': alert.query_text,
            'filter:entities.id': alert.entity_id,
            'limit': 50
        }, authz)
        results = documents_query(state, since=alert.notified_at)
        total = results['total']
        if total == 0:
            continue
        log.info('Found %d new results for: %r', total, alert.label)
        alert.update()
        try:
            subject = '%s (%s new results)' % (alert.label, total)
            template_args = {
                'alert': alert,
                'role': role,
                'total': total,
                'results': format_results(alert, results),
                'app_title': app_title,
                'app_url': app_url
            }
            html = render_template('email/alert.html', **template_args)
            notify_role(role, subject, html)
        except Exception as ex:
            log.exception(ex)
    db.session.commit()
Beispiel #5
0
def similar(id):
    """Return entities similar to the entity ``id`` as JSON.

    The entity is fetched with READ permission. If its schema is not flagged
    as ``fuzzy``, an empty result with status ``ignore`` is returned instead
    of running the similarity query.
    """
    entity, _ = get_entity(id, request.authz.READ)
    if not schemata.get(entity.get('schema')).fuzzy:
        empty = {'status': 'ignore', 'results': [], 'total': 0}
        return jsonify(empty)
    state = QueryState(request.args, request.authz)
    matches = similar_entities(combined_entity(entity), state)
    return jsonify(matches)
Beispiel #6
0
def export():
    """Send the results of the current query as an Excel attachment.

    The request is logged, results are fetched via ``get_results`` (called
    with 50000 as its second argument), and the generated workbook is sent
    as ``export.xlsx``.
    """
    state = QueryState(request.args, request.authz, limit=0)
    log_event(request)
    rows = get_results(state, 50000)
    workbook = make_excel(rows, FIELDS)
    return send_file(
        workbook,
        mimetype=XLSX_MIME,
        as_attachment=True,
        attachment_filename='export.xlsx'
    )
Beispiel #7
0
def index():
    """List the datasets the current user may see, with facets, as JSON.

    Datasets are limited to those whose roles pass ``authz.check_roles``. A
    single entity query (``limit`` 0, faceted by ``dataset``) supplies the
    per-dataset entity counts. The optional request arguments
    ``filter:countries`` and ``filter:category`` narrow the list; the
    ``countries`` and ``category`` facets count only the datasets that pass
    those filters.

    :return: JSON response with ``results`` (datasets ordered by entity
        count, largest first), ``facets``, ``total`` and
        ``total_entities_count``.
    """
    enable_cache(vary_user=True)
    results = [d for d in datasets if request.authz.check_roles(d.roles)]
    state = QueryState({
        'filter:dataset': [d.name for d in results],
        'facet': 'dataset',
        'limit': 0
    }, request.authz)
    res = entities_query(state)
    values = res.get('facets', {}).get('dataset', {}).get('values', [])
    counts = {v.get('id'): v.get('count') for v in values}

    countries_facet = defaultdict(int)
    category_facet = defaultdict(int)
    countries_filter = set(request.args.getlist('filter:countries'))
    category_filter = set(request.args.getlist('filter:category'))

    filtered = []
    for dataset in results:
        # The count is attached before filtering, so every visible dataset
        # carries it (None when the facet has no entry for the dataset).
        dataset.entities_count = counts.get(dataset.name)
        if category_filter and dataset.category not in category_filter:
            continue
        if countries_filter and \
           not countries_filter.intersection(dataset.countries):
            continue
        for country in dataset.countries:
            countries_facet[country] += 1
        category_facet[dataset.category] += 1
        filtered.append(dataset)

    # A dataset missing from the facet has a count of None; sort it as 0 so
    # the comparison also works on Python 3, where None and int do not order.
    filtered = sorted(filtered,
                      key=lambda d: d.entities_count or 0,
                      reverse=True)
    facets = {'countries': {'values': []}, 'category': {'values': []}}
    categories = get_config('COLLECTION_CATEGORIES', {})

    # Index-based sort keys replace tuple-unpacking lambdas, which are a
    # syntax error on Python 3. Sorting ascending and then walking the list
    # backwards keeps the original ordering of ties.
    countries_facet = sorted(countries_facet.items(), key=lambda kv: kv[1])
    for key, count in countries_facet[::-1]:
        facets['countries']['values'].append({
            'id': key,
            'count': count,
            'label': COUNTRY_NAMES.get(key, key)
        })

    category_facet = sorted(category_facet.items(), key=lambda kv: kv[1])
    for key, count in category_facet[::-1]:
        # Datasets without a category are counted but not listed.
        if key is None:
            continue
        facets['category']['values'].append({
            'id': key,
            'count': count,
            'label': categories.get(key, key)
        })

    return jsonify({
        'results': filtered,
        'facets': facets,
        'total': len(filtered),
        'total_entities_count': res.get('total')
    })
Beispiel #8
0
def query():
    """Run a document search for the request arguments and return JSON.

    The request is logged. When ``next_params`` yields parameters, a
    ``next`` URL pointing back at this endpoint is added to the result.
    """
    enable_cache(vary_user=True)
    result = documents_query(QueryState(request.args, request.authz))
    next_args = next_params(request.args, result)
    log_event(request)
    if next_args is None:
        return jsonify(result)
    result['next'] = url_for('search_api.query', **next_args)
    return jsonify(result)
Beispiel #9
0
def peek():
    """Return the result of a peek query as JSON.

    Returns ``{'active': False}`` when the ``ALLOW_PEEKING`` setting
    (default ``True``) is disabled. For users that are not logged in, the
    ``roles`` key is removed from the response.
    """
    peeking_allowed = get_config('ALLOW_PEEKING', True)
    if not peeking_allowed:
        return jsonify({'active': False})
    enable_cache(vary_user=True)
    authz = request.authz
    response = peek_query(QueryState(request.args, authz))
    if not authz.logged_in:
        # Anonymous users do not get to see the 'roles' entry.
        response.pop('roles', None)
    return jsonify(response)
Beispiel #10
0
def records(document_id):
    """Query the records of one document and return the result as JSON.

    When ``next_params`` yields parameters, a ``next`` URL for the same
    document is added to the result.
    """
    document = get_document(document_id)
    enable_cache(vary_user=True)
    state = QueryState(request.args, request.authz)
    record_query = records_query(document.id, state)
    result = execute_records_query(document.id, state, record_query)
    next_args = next_params(request.args, result)
    if next_args is None:
        return jsonify(result)
    next_url = url_for('documents_api.records',
                       document_id=document_id,
                       **next_args)
    result['next'] = next_url
    return jsonify(result)
Beispiel #11
0
def view(name):
    """Return one dataset with its entity facets and entity total as JSON.

    Raises ``NotFound`` when ``datasets.get`` raises ``NameError`` for the
    given name. Access is enforced through ``authz.require`` on the
    dataset's roles.
    """
    enable_cache(vary_user=True)
    try:
        dataset = datasets.get(name)
    except NameError:
        raise NotFound()
    authz = request.authz
    authz.require(authz.check_roles(dataset.roles))
    query_args = {
        'filter:dataset': dataset.name,
        'facet': ['schema', 'countries'],
        'limit': 0
    }
    res = entities_query(QueryState(query_args, authz))
    data = dataset.to_dict()
    data['facets'] = res.get('facets', {})
    data['doc_count'] = res.get('total')
    return jsonify(data)
Beispiel #12
0
def index():
    """Return a paged list of collections, with optional facets, as JSON.

    Request arguments read here: ``permission`` (anything other than READ
    or WRITE falls back to READ), ``label``, ``managed``, ``counts``, the
    ``countries`` / ``category`` filters, and ``facet`` to request the
    ``countries`` and/or ``category`` facets.
    """
    state = QueryState(request.args, request.authz)

    # The UI sometimes needs only writeable collections, so the permission
    # level used for the listing can be chosen by the caller.
    permission = request.args.get('permission')
    allowed = (request.authz.READ, request.authz.WRITE)
    if permission not in allowed:
        permission = request.authz.READ
    collections = request.authz.collections[permission]

    # Navigation filters.
    label = request.args.get('label')
    managed = state.getbool('managed', None)

    # Whether to include entity/document counts in each listed collection.
    counts = state.getbool('counts', False)

    requested = [f.lower().strip() for f in request.args.getlist('facet')]
    q = Collection.find(label=label,
                        countries=state.getfilter('countries'),
                        category=state.getfilter('category'),
                        collection_id=collections,
                        managed=managed)
    data = Pager(q).to_dict(
        results_converter=lambda colls: [c.to_dict(counts=counts)
                                         for c in colls]
    )

    facets = {}
    if 'countries' in requested:
        country_values = Collection.facet_by(q,
                                             Collection.countries,
                                             mapping=COUNTRY_NAMES)
        facets['countries'] = {'values': country_values}
    if 'category' in requested:
        category_names = get_config('COLLECTION_CATEGORIES', {})
        category_values = Collection.facet_by(q,
                                              Collection.category,
                                              mapping=category_names)
        facets['category'] = {'values': category_values}
    data['facets'] = facets
    return jsonify(data)
Beispiel #13
0
def documents(id):
    """Return the documents query result for entity ``id`` as JSON.

    The entity is fetched with READ permission and passed through
    ``combined_entity`` before querying.
    """
    entity, _ = get_entity(id, request.authz.READ)
    state = QueryState(request.args, request.authz)
    hits = entity_documents(combined_entity(entity), state)
    return jsonify(hits)
Beispiel #14
0
def links(id):
    """Return the links query result for entity ``id`` as JSON.

    The entity is fetched with READ permission; only the first element
    returned by ``get_entity`` is used.
    """
    entity, _ = get_entity(id, request.authz.READ)
    found = links_query(entity, QueryState(request.args, request.authz))
    return jsonify(found)
Beispiel #15
0
def index():
    """Run an entity query for the request arguments and return JSON.

    The ``doc_counts`` request flag is forwarded to ``entities_query``.
    """
    enable_cache(vary_user=True)
    state = QueryState(request.args, request.authz)
    with_doc_counts = state.getbool('doc_counts')
    return jsonify(entities_query(state, doc_counts=with_doc_counts))
Beispiel #16
0
def index(collection_id):
    """Return the leads query result for one collection as JSON.

    The collection is resolved through ``obj_or_404`` and the caller must
    pass the ``collection_read`` check on it.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz = request.authz
    authz.require(authz.collection_read(collection))
    state = QueryState(request.args, authz)
    return jsonify(leads_query(collection_id, state))