Example #1
def reconcile_op(query):
    """Reconcile operation for a single query."""
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [make_fingerprint(name)],
        'schemata': ensure_list(query.get('type'))
    }

    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    suggested = similar_entities(entity, state)
    matches = []
    for ent in suggested.get('results'):
        types = [t for t in get_freebase_types() if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
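For orientation, this is the kind of query payload the operation expects, inferred from the keys read above ('query', 'type', 'limit' and the 'properties' entries with 'pid'/'v'); the concrete values below are purely illustrative:

# Illustrative payload only; field names come from the lookups above, values are made up.
sample_query = {
    'query': 'Siemens AG',                 # name to reconcile
    'type': 'Company',                     # candidate schema
    'limit': '5',                          # forwarded to QueryState
    'properties': [
        {'pid': 'country', 'v': 'de'},     # extra property hint
    ],
}
# reconcile_op(sample_query) returns {'result': [...], 'num': <match count>}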
Example #2
File: authz.py Project: wcyn/aleph
 def check_roles(self, roles):
     # if self.in_maintenance:
     #     return False
     if self.is_admin:
         return True
     isect = self.roles.intersection(ensure_list(roles))
     return len(isect) > 0
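ensure_list is used throughout these snippets; a minimal sketch of what it presumably does (None becomes an empty list, a scalar is wrapped, a sequence is copied), assuming behaviour equivalent to the helper in the aleph codebase:

def ensure_list(obj):
    """Sketch of the helper: normalise any input into a list."""
    if obj is None:
        return []
    if isinstance(obj, (list, tuple, set)):
        return list(obj)
    return [obj]

# With this, check_roles('editor') and check_roles(['editor']) behave the same way.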
Example #3
def index_entity(entity):
    """Index an entity."""
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    data = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        '$bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {
            'name': [entity.name]
        }
    }

    for k, v in entity.data.items():
        data['properties'][k] = ensure_list(v)

    # data['$documents'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=es_index, doc_type=TYPE_ENTITY, id=entity.id, body=data)
    data['id'] = entity.id
    data['$type'] = TYPE_ENTITY
    return data
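es.index is the standard elasticsearch-py call (the doc_type argument dates this to the Elasticsearch 5/6 client generation). The module-level objects the snippet relies on could be set up roughly as below; the names and values are assumptions, the real ones come from the aleph configuration:

from elasticsearch import Elasticsearch

# Assumed module-level setup; index name and type constant are illustrative only.
es = Elasticsearch(['http://localhost:9200'])
es_index = 'aleph-entities'
TYPE_ENTITY = 'entity'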
Example #4
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})

    texts = []
    for prop in schema.properties:
        if prop.name not in properties:
            continue
        if prop.type_name in ['date', 'url', 'uri', 'country']:
            continue
        texts.extend(ensure_list(properties[prop.name]))

    data['text'] = index_form(texts)
    data = schema.invert(data)
    index_names(data)
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')

    # pprint(data)
    return data
Example #5
 def to_index(self):
     entity = self.to_dict()
     entity['properties'] = {'name': [self.name]}
     for k, v in self.data.items():
         v = ensure_list(v)
         if len(v):
             entity['properties'][k] = v
     return entity
Example #6
 def dump(self, data, many=False):
     results = []
     for res in ensure_list(data):
         schema = self.SCHEMATA[res['$doc_type']]
         res = schema().dump(res)
         if not many:
             return res
         results.append(res.data)
     return results, []
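SCHEMATA is presumably a class-level registry mapping the stored $doc_type marker to a marshmallow schema class; a hypothetical, self-contained version is sketched below (the .data attribute on the dump result matches marshmallow 2.x):

from marshmallow import Schema, fields

# Hypothetical schemas and registry; the real ones live in the aleph serializers.
class EntitySchema(Schema):
    id = fields.String()
    name = fields.String()

class DocumentSchema(EntitySchema):
    title = fields.String()

SCHEMATA = {
    'entity': EntitySchema,
    'document': DocumentSchema,
}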
Example #7
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})

    texts = []
    for vs in properties.values():
        for v in ensure_list(vs):
            texts.append(v)

    data['text'] = index_form(texts)
    data['fingerprints'] = data.get('fingerprints', [])

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue

        # Find and set the name property
        if prop.is_label:
            data['name'] = values[0]

        # Generate key material
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))

        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)

    data['fingerprints'] = list(set(data['fingerprints']))

    # Add latinised names
    names = data.get('names', [])
    for name in list(names):
        names.append(ascii_text(name))
    data['names'] = list(set(names))

    # Get implied schemata (i.e. parents of the actual schema)
    data['schema'] = schema.name
    data['schemata'] = []
    for parent in schema.schemata:
        if not parent.hidden:
            data['schemata'].append(parent.name)

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
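The inverted-property step is the densest part of this function; the same accumulation pattern, stripped of the schema machinery and with stand-in data, looks like this:

# Stand-in sketch of the index_invert accumulation; the property-to-field
# mapping and the normaliser are made up for illustration.
properties = {'name': ['ACME Inc.'], 'email': ['Info@ACME.example']}
invert_map = {'name': 'names', 'email': 'emails'}   # plays prop.type.index_invert

def normalize(values):                              # plays prop.type.normalize
    return [v.lower() for v in values]

data = {}
for prop, values in properties.items():
    invert = invert_map.get(prop)
    if not invert:
        continue
    bucket = data.setdefault(invert, [])
    for norm in normalize(values):
        if norm not in bucket:
            bucket.append(norm)

# data == {'names': ['acme inc.'], 'emails': ['info@acme.example']}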
Example #8
 def dump(self, data, many=False):
     results = []
     for res in ensure_list(data):
         if res.get('schema') == Document.SCHEMA:
             res = DocumentSchema().dump(res)
         else:
             res = EntitySchema().dump(res)
         if not many:
             return res
         results.append(res.data)
     return results, []
Example #9
def reconcile_op(query):
    """Reconcile operation for a single query."""
    parser = SearchQueryParser({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }

    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    query = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in query.search().get('hits').get('hits'):
        source = doc.get('_source')
        match = {
            'id': doc.get('_id'),
            'name': source.get('name'),
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc.get('_id')),
            'match': source.get('name') == name
        }
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Example #10
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))

    query = {
        'query': {
            'bool': {
                'should': shoulds,
                'minimum_should_match': 1
            }
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for res in scan(es, query=query, index=es_index, doc_type=[TYPE_RECORD]):
        for text in ensure_list(res.get('_source').get('text')):
            yield (res.get('_source').get('document_id'), text)
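Assuming scan here is elasticsearch.helpers.scan (the scroll-based streaming helper), the generator yields (document_id, text) pairs and can be consumed like this:

# Hypothetical consumer; entity is any object exposing regex_terms as above.
for document_id, text in scan_entity_mentions(entity):
    print(document_id, text[:80])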
Example #11
def get_languages_iso3(codes):
    """Turn (pre-set) ISO2 language codes into ISO3 codes."""
    supported = []
    for lang in ensure_list(codes):
        if lang is None or len(lang.strip()) not in [2, 3]:
            continue
        lang = lang.lower().strip()
        if len(lang) == 2:
            try:
                c = languages.get(alpha_2=lang)
                lang = c.alpha_3
            except KeyError as ke:
                log.exception(ke)
                continue
        supported.append(lang)

    # if not len(supported):
    supported.append('eng')
    return '+'.join(sorted(set(supported)))
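If languages here is pycountry.languages (an assumption consistent with the alpha_2/alpha_3 attributes), the two-to-three letter mapping works as follows; the call on the last line also shows the unconditional 'eng' fallback:

import pycountry

lang = pycountry.languages.get(alpha_2='de')
print(lang.alpha_3)                         # 'deu'
print(get_languages_iso3(['de', 'EN ']))    # 'deu+eng'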
Example #12
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})

    texts = []
    for vs in properties.values():
        for v in ensure_list(vs):
            texts.append(v)

    data['text'] = index_form(texts)

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue

        # Find and set the name property
        if prop.is_label:
            data['name'] = values[0]

        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)

    index_names(data)

    # Get implied schemata (i.e. parents of the actual schema)
    data['schema'] = schema.name
    data['schemata'] = [p.name for p in schema.schemata if not p.hidden]

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')

    # pprint(data)
    return data
Example #13
    def validate(self, data):
        """Validate that the data should be stored.

        Since the types system doesn't really have validation, this currently
        tries to normalize the value to see if it passes strict parsing.
        """
        value, error = [], None
        for val in ensure_list(data):
            val = string_value(val)
            if val is None:
                continue
            val = val.strip()
            if self.type.normalize_value(val) is None:
                error = "Invalid value"
            value.append(val)
        if not self.is_multiple:
            value = value[0] if len(value) else None
        else:
            value = list(set(value))
        if self.is_label and (value is None or not len(value)):
            error = "Field is required."
        return value, error
Example #14
 def terms(self):
     terms = set([self.name])
     for alias in ensure_list(self.data.get('alias')):
         if alias is not None and len(alias):
             terms.add(alias)
     return terms
Example #15
 def check_roles(self, roles):
     if self.is_admin:
         return True
     isect = self.roles.intersection(ensure_list(roles))
     return len(isect) > 0
Example #16
 def normalize(self, values):
     results = set()
     for value in values:
         results.update(ensure_list(self.normalize_value(value)))
     return results
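The same flatten-into-a-set pattern, made self-contained with a stand-in normalize_value (the real one belongs to the property type and may return a list, which the original passes through ensure_list):

# Stand-in for the property type's normalize_value; returns None for empty input.
def normalize_value(value):
    value = value.strip().lower()
    return value or None

def normalize(values):
    results = set()
    for value in values:
        norm = normalize_value(value)
        if norm is not None:
            results.add(norm)
    return results

# normalize(['  ACME ', 'acme', '']) == {'acme'}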