Beispiel #1
0
def match_query(proxy, source_collection_id=None, collection_ids=None,
                query=None):
    """Build a search query that finds entities resembling ``proxy``.

    The proxy itself (by ID) and anything in ``source_collection_id`` are
    excluded from the results. When ``collection_ids`` is given, results
    are limited to those collections. Property values with a positive
    specificity are turned into clauses via ``_make_queries``, most
    specific first: clauses for property types in ``REQUIRED`` form a
    group of which at least one has to match, all remaining clauses are
    added as optional ``should`` clauses.

    A ``query`` passed in by the caller is extended in place. If no
    required clause could be generated, ``none_query()`` is returned
    instead.
    """
    if query is None:
        query = bool_query()

    # Exclude the query entity and its source collection from the results.
    exclusions = []
    if proxy.id is not None:
        exclusions.append({"ids": {"values": [proxy.id]}})
    if source_collection_id is not None:
        exclusions.append({'term': {'collection_id': source_collection_id}})
    if exclusions:
        query['bool']['must_not'].extend(exclusions)

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        collection_filter = {'terms': {'collection_id': collection_ids}}
        query['bool']['filter'].append(collection_filter)

    # Keep only values that carry some specificity, most specific first.
    ranked = []
    for prop, value in proxy.itervalues():
        weight = prop.specificity(value)
        if weight > 0:
            ranked.append((prop, value, weight))
    ranked.sort(key=lambda entry: entry[2], reverse=True)

    # First pass: clauses for the required property types.
    required = []
    for prop, value, weight in ranked:
        if prop.type not in REQUIRED:
            continue
        if len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, weight))

    # Second pass: the remaining types, sharing the same clause budget.
    scoring = []
    for prop, value, weight in ranked:
        if prop.type in REQUIRED:
            continue
        if len(required) + len(scoring) <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, weight))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # At least one of the required clauses has to match.
    at_least_one = {
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    }
    query['bool']['must'].append(at_least_one)
    query['bool']['should'].extend(scoring)
    return query
Beispiel #2
0
def match_query(proxy, collection_ids=None, query=None):
    """Build a search query that finds entities resembling ``proxy``.

    Args:
        proxy: the entity to find matches for; its ``id`` and the
            ``(prop, value)`` pairs from ``itervalues()`` are read.
        collection_ids: optional restriction of the results to these
            collections (passed through ``ensure_list``).
        query: optional bool query to extend in place; a fresh
            ``bool_query()`` is used when omitted.

    Returns:
        The extended query. Clauses for property types in ``REQUIRED``
        are grouped so that at least one of them must match; all other
        clauses are added as optional ``should`` clauses. When no
        required clause could be generated, ``none_query()`` is returned.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    # Keep only values that carry some specificity.
    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))

    # Most specific values first, so they are used before the clause
    # budget (MAX_CLAUSES) runs out.
    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    # The optional clauses share the budget with the required ones.
    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # Make it mandatory to have at least one match. ``required`` is
    # already a flat list of clauses, so it is passed as-is; wrapping it
    # in another list would nest the clauses one array level too deep.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Beispiel #3
0
def match_query(proxy, collection_ids=None, query=None):
    """Build a search query that finds entities resembling ``proxy``.

    Args:
        proxy: the entity to find matches for; its ``id`` and the
            ``(prop, value)`` pairs from ``itervalues()`` are read.
        collection_ids: optional restriction of the results to these
            collections (passed through ``ensure_list``).
        query: optional bool query to extend in place; a fresh
            ``bool_query()`` is used when omitted.

    Returns:
        The extended query. Clauses for property types in ``REQUIRED``
        are grouped so that at least one of them must match; all other
        clauses are added as optional ``should`` clauses. When no
        required clause could be generated, ``none_query()`` is returned.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    # Sort the generated clauses into mandatory and optional ones,
    # depending on the type of the property they were derived from.
    required = []
    scoring = []
    for (prop, value) in proxy.itervalues():
        queries = list(_make_queries(prop, value))
        if prop.type in REQUIRED:
            required.extend(queries)
        else:
            scoring.extend(queries)

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # Make it mandatory to have at least one match. ``required`` is
    # already a flat list of clauses, so it is passed as-is; wrapping it
    # in another list would nest the clauses one array level too deep.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Beispiel #4
0
def iter_entities_by_ids(ids, authz=None):
    """Yield the unpacked entities found for the given entity IDs.

    The IDs are looked up in batches of at most ``MAX_PAGE``. When
    ``authz`` is given, ``authz_query(authz)`` is added as a further
    filter. Hits for which ``unpack_result`` returns ``None`` are
    skipped.
    """
    for offset in range(0, len(ids), MAX_PAGE):
        batch = ids[offset:offset + MAX_PAGE]
        if len(batch) == 0:
            return

        id_filter = bool_query()
        id_filter['bool']['filter'].append({'ids': {'values': batch}})
        if authz is not None:
            id_filter['bool']['filter'].append(authz_query(authz))

        # Only fetch the fields listed here from the stored documents.
        source_fields = [
            'schema',
            'properties',
            'collection_id',
            'created_at',
        ]
        body = {
            'query': id_filter,
            '_source': {'includes': source_fields},
            'size': min(MAX_PAGE, len(batch) * 2)
        }
        response = search_safe(index=entity_index(),
                               body=body,
                               request_cache=False)
        hits = response.get('hits', {}).get('hits', [])
        for hit in hits:
            entity = unpack_result(hit)
            if entity is None:
                continue
            yield entity
Beispiel #5
0
def entities_by_ids(ids, authz=None, schemata=None):
    """Yield the unpacked entities found for the given entity IDs.

    The IDs are looked up in batches of at most ``MAX_PAGE`` against the
    index returned by ``entities_read_index(schema=schemata)``; the
    ``text`` field is excluded from the returned source. When ``authz``
    is given, ``authz_query(authz)`` is added as a further filter. Hits
    for which ``unpack_result`` returns ``None`` are skipped.
    """
    for offset in range(0, len(ids), MAX_PAGE):
        batch = ids[offset:offset + MAX_PAGE]
        if len(batch) == 0:
            return

        id_filter = bool_query()
        id_filter['bool']['filter'].append({'ids': {'values': batch}})
        if authz is not None:
            id_filter['bool']['filter'].append(authz_query(authz))

        body = {
            'query': id_filter,
            '_source': {'excludes': ['text']},
            'size': min(MAX_PAGE, len(batch))
        }
        read_index = entities_read_index(schema=schemata)
        response = search_safe(index=read_index, body=body, ignore=[404])
        hits = response.get('hits', {}).get('hits', [])
        for hit in hits:
            entity = unpack_result(hit)
            if entity is None:
                continue
            yield entity
Beispiel #6
0
def match_query(proxy, collection_ids=None, query=None):
    """Build a search query that finds entities resembling ``proxy``.

    Candidates are restricted to the schemata the proxy's schema can be
    matched against and, optionally, to ``collection_ids``. A candidate
    has to match at least one clause derived from the proxy's names,
    their fingerprints, or the values of a strong, grouped property type.

    A ``query`` passed in by the caller is extended in place. When the
    schema has no matchable schemata, or no clause could be derived,
    ``none_query()`` is returned instead.
    """
    if query is None:
        query = bool_query()

    # Never suggest the query entity as a match for itself.
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})

    # Only look for matches within the "matchable" set of entity
    # schemata. For example, a Company can be matched to another Company
    # or a LegalEntity, but not to a Person. Real estate is
    # "unmatchable": even if two plots of land have almost the same name
    # and criteria, it makes no sense to suggest they are the same.
    if proxy.schema.name != Entity.THING:
        matchable = [s.name for s in proxy.schema.matchable_schemata]
        if not matchable:
            return none_query()
        schema_filter = {"terms": {"schema": matchable}}
        query['bool']['must'].append(schema_filter)

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        collection_filter = {'terms': {'collection_id': collection_ids}}
        query['bool']['must'].append(collection_filter)

    # Clauses derived from names: a text match on the name itself and,
    # where a fingerprint can be generated, a fuzzy fingerprint match.
    name_clauses = []
    for name in proxy.names:
        text_match = {
            'query': name,
            'operator': 'and',
            'minimum_should_match': '60%',
        }
        name_clauses.append({'match': {'names.text': text_match}})

        fp = fingerprints.generate(name)
        if fp is None:
            continue
        fingerprint_match = {
            'query': fp,
            'fuzziness': 1,
            'operator': 'and',
            'boost': 3.0
        }
        name_clauses.append({'match': {'fingerprints': fingerprint_match}})

    # Clauses derived from the values of strong types that have a group.
    value_clauses = []
    for type_ in registry.types:
        if type_.strong and type_.group is not None:
            for value in proxy.get_type_values(type_):
                boosted = {'value': value, 'boost': 3.0}
                value_clauses.append({'term': {type_.group: boosted}})

    required = name_clauses + value_clauses
    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # At least one of the collected clauses has to match.
    at_least_one = {
        "bool": {
            "should": required,
            "minimum_should_match": 1
        }
    }
    query['bool']['must'].append(at_least_one)
    return query