Example #1
0
def similar(id):
    """Return a JSON response listing entities similar to entity ``id``.

    The entity is fetched through ``get_entity`` with the ``READ`` action of
    the request's authz. When its schema is not flagged as fuzzy, no search
    is run and an empty result with status ``ignore`` is returned.
    """
    entity, _ = get_entity(id, request.authz.READ)
    if schemata.get(entity.get('schema')).fuzzy:
        state = QueryState(request.args, request.authz)
        matches = similar_entities(combined_entity(entity), state)
        return jsonify(matches)
    return jsonify({'status': 'ignore', 'results': [], 'total': 0})
Example #2
0
 def expand(self, keys):
     """Map every key in ``keys`` to a ``{'label': ...}`` dict.

     The label is the ``plural`` of the schema looked up under the key;
     when that lookup raises ``NameError`` the key itself is the label.
     """
     def label_for(name):
         try:
             return schemata.get(name).plural
         except NameError:
             return name

     return {name: {'label': label_for(name)} for name in keys}
Example #3
0
    def __init__(self, query, data):
        """Initialise the mapper from its ``query`` and configuration ``data``.

        Reads ``keys``/``key``, ``key_fingerprint``, ``schema`` and
        ``properties`` from ``data`` and builds one ``MapperProperty`` per
        configured property.

        Raises:
            TypeError: if ``data['schema']`` resolves to no schema, or to a
                schema whose ``section`` differs from ``self.section``.
        """
        self.query = query
        self.data = data
        self.keys = dict_list(data, 'keys', 'key')
        self.key_fingerprint = data.get('key_fingerprint', False)

        schema_name = data.get('schema')
        self.schema = schemata.get(schema_name)
        if self.schema is None or self.schema.section != self.section:
            raise TypeError("Invalid schema: %r" % schema_name)

        self.properties = []
        configured = data.get('properties', {})
        for prop_name, prop_config in configured.items():
            prop_schema = self.schema.get(prop_name)
            mapped = MapperProperty(self, prop_name, prop_config, prop_schema)
            self.properties.append(mapped)
Example #4
0
 def schema(self):
     """Return what ``schemata.get`` yields for this object's ``type``."""
     schema_name = self.type
     return schemata.get(schema_name)
Example #5
0
def entity_query(sample, collection_id=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria.

    Args:
        sample: mapping describing the document or entity. The keys read
            here are ``schema``, ``id``, ``fingerprints``, ``emails``,
            ``phones``, ``dates``, ``countries``, ``schemata``,
            ``identifiers`` and ``addresses``.
        collection_id: when not None, a ``term`` clause on
            ``collection_id`` is added to the mandatory clauses.
        query: optional query to extend. It must hold a ``bool`` mapping
            with ``should``, ``must`` and ``must_not`` lists and is
            modified in place. A fresh bool query is created when omitted.

    Returns:
        The bool query, or ``{'match_none': {}}`` when the sample's schema
        is neither ``Document.SCHEMA`` nor fuzzy, or when the sample has
        no fingerprints, emails or phones to match on.
    """
    # Do not attempt to find xrefs for entity types such as land, buildings,
    # etc.
    schema = schemata.get(sample.get('schema'))
    if sample.get('schema') != Document.SCHEMA and not schema.fuzzy:
        return {'match_none': {}}

    if query is None:
        query = {
            'bool': {
                'should': [],
                'filter': [],
                'must': [],
                'must_not': [],
            }
        }
    clauses = query['bool']

    if collection_id is not None:
        clauses['must'].append({'term': {'collection_id': collection_id}})

    # Clauses of which at least one has to match for a candidate to count.
    required = [
        {
            'fuzzy': {
                'fingerprints': {
                    'value': fp,
                    'fuzziness': 2,
                    'boost': 3.0,
                }
            }
        }
        for fp in sample.get('fingerprints', [])
    ]

    # TODO: put names in FIELDS_XREF and add them to ``required`` as well,
    # e.g. one clause per value in sample.get('names', []):
    #     {'match': {'names': {'query': value,
    #                          'operator': 'and',
    #                          'cutoff_frequency': 0.01}}}

    for index in ('emails', 'phones'):
        for value in sample.get(index, []):
            required.append({'term': {index: {'value': value, 'boost': 2}}})

    if not required:
        # e.g. a document from which no features have been extracted.
        return {'match_none': {}}

    # make it mandatory to match at least one fingerprint, email or phone
    clauses['must'].append(
        {'bool': {
            'should': required,
            'minimum_should_match': 1,
        }})

    # boost by "contributing criteria"
    for field in ('dates', 'countries', 'schemata', 'identifiers'):
        for val in sample.get(field, []):
            clauses['should'].append({'term': {field: val}})

    # The field is spelled out here: relying on the loop variable of the
    # block above would aim the address clauses at 'identifiers'.
    for val in sample.get('addresses', []):
        clauses['should'].append({'common': {'addresses': {'query': val}}})

    # Exclude the sample itself and filter types which cannot be resolved
    # via fuzzy matching. ``extend`` keeps ``must_not`` a flat list of
    # clauses; appending the list would nest it as a single element.
    clauses['must_not'].extend([
        {'ids': {'values': [sample.get('id')]}},
        {'terms': {'schema': [s.name for s in schemata if not s.fuzzy]}},
    ])
    return query
Example #6
0
 def update(self, result, key):
     """Set ``result['label']`` from the schema named by ``result['id']``.

     The incoming ``key`` argument is not used; ``result['id']`` takes its
     place. The label is the schema's ``plural``, or the id itself when the
     schema lookup raises ``NameError``.
     """
     schema_name = result.get('id')
     try:
         label = schemata.get(schema_name).plural
     except NameError:
         label = schema_name
     result['label'] = label
Example #7
0
 def _validate(self, value):
     """Check that ``value`` is accepted by ``schemata.get``.

     Raises:
         ValidationError: when the lookup raises ``TypeError``.
     """
     try:
         schemata.get(value)
     except TypeError:
         invalid = ValidationError('Invalid schema name.')
         raise invalid
Example #8
0
def generate_matches_sheet(workbook,
                           sheet,
                           collection,
                           match_collection,
                           authz,
                           links=True,
                           one_sheet=False,
                           offset=0,
                           limit=1000):
    """Fill ``sheet`` with the matches between two collections.

    Args:
        workbook: workbook owning ``sheet``; its ``header_format`` and
            ``link_format`` attributes are used for styling.
        sheet: worksheet to write to.
        collection: collection whose entities fill columns 1-4.
        match_collection: collection whose matching entities fill
            columns 5-7.
        authz: authorization object handed to ``QueryParser``.
        links: write names as hyperlinks built with ``entity_url`` when
            true, as plain strings otherwise.
        one_sheet: when true, the sheet is shared by several collections:
            the header label is generic and column 8 holds the label of
            ``match_collection``.
        offset: row at which the data rows start. Rows 0 and 1 are reserved
            for the header, so values below 2 are treated as 2. The header
            itself is only written when ``offset`` is below 3.
        limit: maximum number of matches, passed to ``QueryParser`` and
            shown in the header label.

    Returns:
        The ``sheet`` that was passed in.
    """
    # Imported here rather than at module level (presumably to avoid a
    # circular import -- TODO confirm).
    from aleph.views.serializers import MatchSchema

    if one_sheet:
        sheet_label = "All matches (top %s per collection)" % limit
    else:
        sheet_label = "%s (top %s)" % (match_collection.label, limit)

    sheet.set_zoom(125)
    parser = QueryParser({}, authz, limit=limit)
    q_match = Match.find_by_collection(collection.id, match_collection.id)
    matches = MatchQueryResult({}, q_match, parser=parser, schema=MatchSchema)

    if offset < 3:
        header = workbook.header_format
        sheet.write(0, 0, '', header)
        sheet.write(1, 0, 'Score', header)
        sheet.merge_range(0, 1, 0, 4, collection.label, header)
        sheet.write(1, 1, 'Name', header)
        sheet.write(1, 2, 'Type', header)
        sheet.write(1, 3, 'Country', header)
        sheet.write(1, 4, 'Source URL', header)
        sheet.merge_range(0, 5, 0, 8, sheet_label, header)
        sheet.write(1, 5, 'Name', header)
        sheet.write(1, 6, 'Type', header)
        sheet.write(1, 7, 'Country', header)
        if one_sheet:
            sheet.write(1, 8, 'Collection', header)

        sheet.freeze_panes(2, 0)
        sheet.autofilter(1, 1, 2 + len(matches.results), 8)

    # Never start inside the two header rows: with the default offset the
    # data would otherwise be written over the header cells.
    first_row = max(offset, 2)

    widths = {}
    for row, result in enumerate(matches.results, first_row):
        sheet.write_number(row, 0, int(result.score))

        url = entity_url(result.entity_id) if links else None
        _write_match_side(workbook, sheet, row, 1, result.entity, url, widths)

        ent_props = result.entity.get('properties', {})
        source_urls = ent_props.get('sourceUrl')
        source_url = ', '.join(source_urls) if source_urls is not None else ''
        sheet.write_string(row, 4, source_url)

        url = entity_url(result.match_id) if links else None
        _write_match_side(workbook, sheet, row, 5, result.match, url, widths)

        if one_sheet:
            sheet.write_string(row, 8, match_collection.label)

    # Size the name columns to their longest entry, within 7..70 characters.
    for idx, max_len in widths.items():
        max_len = min(70, max(7, max_len + 1))
        sheet.set_column(idx, idx, float(max_len))

    return sheet


def _write_match_side(workbook, sheet, row, first_col, record, url, widths):
    """Write the name, type and country cells of one side of a match row.

    ``record`` is the indexed entity; its cells go to ``first_col`` and the
    two columns after it. The name is written as a link to ``url`` when a
    URL is given, as a plain string otherwise. ``widths`` is updated with
    the longest name seen for ``first_col``.
    """
    name = record.get('name')
    # A record without a name counts as zero width instead of crashing.
    widths[first_col] = max(widths.get(first_col, 0), len(name or ''))
    if url is not None:
        sheet.write_url(row, first_col, url, workbook.link_format, name)
    else:
        sheet.write_string(row, first_col, name or '')

    schema = schemata.get(record['schema'])
    sheet.write_string(row, first_col + 1, schema.label)

    countries = ', '.join(sorted(record.get('countries', [])))
    sheet.write_string(row, first_col + 2, countries.upper())