def reconcile_op(query):
    """Reconcile operation for a single query."""
    query_state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)
    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [make_fingerprint(name)],
        'schemata': ensure_list(query.get('type'))
    }
    for prop in query.get('properties', []):
        entity[prop.get('pid')] = ensure_list(prop.get('v'))

    suggested = similar_entities(entity, query_state)
    matches = []
    for ent in suggested.get('results'):
        # Map the entity's schema onto the freebase type vocabulary.
        types = [t for t in get_freebase_types() if t['id'] == ent['schema']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
def check_roles(self, roles):
    """Check whether the user holds any of the given roles.

    Admins implicitly pass every role check; otherwise the user's
    role set must intersect the requested roles. (Removed dead,
    commented-out maintenance-mode check; this now matches the other
    check_roles implementation in this file.)
    """
    if self.is_admin:
        return True
    isect = self.roles.intersection(ensure_list(roles))
    return len(isect) > 0
def index_entity(entity):
    """Index an entity."""
    # Deleted entities are purged from the index instead of updated.
    if entity.deleted_at is not None:
        return delete_entity(entity.id)
    properties = {'name': [entity.name]}
    for key, value in entity.data.items():
        properties[key] = ensure_list(value)
    body = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        '$bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': properties
    }
    # body['$documents'] = get_count(entity)
    body = finalize_index(body, entity.schema)
    es.index(index=es_index, doc_type=TYPE_ENTITY, id=entity.id, body=body)
    body['id'] = entity.id
    body['$type'] = TYPE_ENTITY
    return body
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})
    # Structured value types carry no useful full-text signal.
    skipped = ('date', 'url', 'uri', 'country')
    texts = []
    for prop in schema.properties:
        if prop.name not in properties:
            continue
        if prop.type_name in skipped:
            continue
        texts.extend(ensure_list(properties[prop.name]))
    data['text'] = index_form(texts)
    data = schema.invert(data)
    index_names(data)
    data['schema'] = schema.name
    # Implied schemata, i.e. the parents of the actual schema.
    data['schemata'] = schema.names
    if 'name' in data:
        # Second name field for non-tokenised sorting.
        data['name_sort'] = data.get('name')
    return data
def to_index(self):
    """Serialise this object into its search-index representation."""
    entity = self.to_dict()
    properties = {'name': [self.name]}
    for key, value in self.data.items():
        values = ensure_list(value)
        # Skip properties whose normalised value list is empty.
        if len(values):
            properties[key] = values
    entity['properties'] = properties
    return entity
def dump(self, data, many=False):
    """Serialise one result or a list of results via the doc-type schema."""
    results = []
    for item in ensure_list(data):
        # Pick the marshalling schema based on the stored doc type.
        schema_cls = self.SCHEMATA[item['$doc_type']]
        marshalled = schema_cls().dump(item)
        if not many:
            # Single-object mode: hand back the first result directly.
            return marshalled
        results.append(marshalled.data)
    return results, []
def finalize_index(data, schema):
    """Apply final denormalisations to the index.

    Builds the full-text corpus, per-type inverted fields, fingerprint
    key material, latinised name variants and the implied schemata.
    """
    properties = data.get('properties', {})
    texts = []
    for vs in properties.values():
        for v in ensure_list(vs):
            texts.append(v)
    data['text'] = index_form(texts)
    data['fingerprints'] = data.get('fingerprints', [])

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue
        # Find and set the name property.
        if prop.is_label:
            data['name'] = values[0]
        # Generate key material.
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))
        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)
    data['fingerprints'] = list(set(data['fingerprints']))

    # Add latinised names. Guard against transliteration failures —
    # ascii_text presumably returns None/empty for input it cannot
    # transliterate (TODO confirm), which previously leaked into the
    # indexed name set.
    names = data.get('names', [])
    for name in list(names):
        latin = ascii_text(name)
        if latin is not None and len(latin):
            names.append(latin)
    data['names'] = list(set(names))

    # Get implied schemata (i.e. parents of the actual schema).
    data['schema'] = schema.name
    data['schemata'] = []
    for parent in schema.schemata:
        if not parent.hidden:
            data['schemata'].append(parent.name)

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
def dump(self, data, many=False):
    """Serialise results with the document or entity schema as appropriate."""
    results = []
    for item in ensure_list(data):
        # Documents get their own schema; everything else is an entity.
        is_doc = item.get('schema') == Document.SCHEMA
        schema_cls = DocumentSchema if is_doc else EntitySchema
        marshalled = schema_cls().dump(item)
        if not many:
            return marshalled
        results.append(marshalled.data)
    return results, []
def reconcile_op(query):
    """Reconcile operation for a single query."""
    parser = SearchQueryParser({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)
    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }
    for prop in query.get('properties', []):
        entity[prop.get('pid')] = ensure_list(prop.get('v'))

    search = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in search.search().get('hits').get('hits'):
        source = doc.get('_source')
        match = {
            'id': doc.get('_id'),
            'name': source.get('name'),
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc.get('_id')),
            'match': source.get('name') == name
        }
        # Attach the freebase type descriptor matching the hit's schema.
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    clauses = [text_query_string(term) for term in entity.regex_terms]
    query = {
        'query': {
            'bool': {
                'should': clauses,
                'minimum_should_match': 1
            }
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for hit in scan(es, query=query, index=es_index, doc_type=[TYPE_RECORD]):
        source = hit.get('_source')
        for text in ensure_list(source.get('text')):
            yield (source.get('document_id'), text)
def get_languages_iso3(codes):
    """Turn (pre-set) ISO2 language codes into ISO3 codes."""
    supported = []
    for code in ensure_list(codes):
        # Only two- and three-letter codes make sense here.
        if code is None or len(code.strip()) not in (2, 3):
            continue
        code = code.lower().strip()
        if len(code) == 2:
            try:
                code = languages.get(alpha_2=code).alpha_3
            except KeyError as ke:
                log.exception(ke)
                continue
        supported.append(code)
    # if not len(supported): supported.append('eng')
    return '+'.join(sorted(set(supported)))
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})
    # Build the full-text corpus from all property values.
    texts = []
    for values in properties.values():
        texts.extend(ensure_list(values))
    data['text'] = index_form(texts)

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue
        # The label property supplies the display name.
        if prop.is_label:
            data['name'] = values[0]
        # Inverted fields collect every value of one type (names,
        # dates, emails etc.) under a single key.
        invert = prop.type.index_invert
        if invert:
            inverted = data.setdefault(invert, [])
            for norm in prop.type.normalize(values):
                if norm not in inverted:
                    inverted.append(norm)
    index_names(data)

    # Implied schemata, i.e. the parents of the actual schema.
    data['schema'] = schema.name
    data['schemata'] = [p.name for p in schema.schemata if not p.hidden]

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
def validate(self, data):
    """Validate that the data should be stored.

    Since the types system doesn't really have validation, this
    currently tries to normalize the value to see if it passes
    strict parsing.
    """
    error = None
    cleaned = []
    for raw in ensure_list(data):
        text = string_value(raw)
        if text is None:
            continue
        text = text.strip()
        # Strict parse via normalization; failure marks the field invalid.
        if self.type.normalize_value(text) is None:
            error = "Invalid value"
        cleaned.append(text)
    if self.is_multiple:
        value = list(set(cleaned))
    else:
        value = cleaned[0] if len(cleaned) else None
    if self.is_label and (value is None or not len(value)):
        error = "Field is required."
    return value, error
def terms(self):
    """Return the search terms for this object: its name plus any non-empty aliases."""
    found = {self.name}
    for alias in ensure_list(self.data.get('alias')):
        if alias is not None and len(alias):
            found.add(alias)
    return found
def check_roles(self, roles):
    """True if the user is an admin or holds any of the given roles."""
    if self.is_admin:
        return True
    requested = ensure_list(roles)
    return len(self.roles.intersection(requested)) > 0
def normalize(self, values):
    """Return the set of normalised forms of the given values."""
    normalized = set()
    for value in values:
        for norm in ensure_list(self.normalize_value(value)):
            normalized.add(norm)
    return normalized