def __init__(self, request, query, parser=None, schema=None):
    super(MatchQueryResult, self).__init__(request, query,
                                           parser=parser,
                                           schema=schema)
    ids = set()
    for match in self.results:
        ids.add(match.match_id)
        ids.add(match.entity_id)
    ids = {'ids': list(ids)}
    result = es.mget(index=es_index, doc_type=TYPE_ENTITY, body=ids)
    for doc in result.get('docs', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        for match in self.results:
            if match.match_id == entity['id']:
                match.match = entity
            if match.entity_id == entity['id']:
                match.entity = entity

    # Do not return results if the entity has been removed in the mean
    # time. Not sure this is the ideal way of doing this, as it'll mess
    # with pagination counts etc.
    for match in list(self.results):
        if not hasattr(match, 'match') or not hasattr(match, 'entity'):
            self.results.remove(match)
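# A minimal, self-contained sketch of the multi-get handling above: es.mget
# returns one entry per requested id under 'docs', and entries that were not
# found carry no usable '_source'. unpack_result() is assumed to return None
# for such docs, which is why matches without a resolved entity get dropped.
# The sample data below is invented for illustration.
def unpack_found(docs):
    """Return {id: source} for only the documents that were found."""
    return {d['_id']: d['_source'] for d in docs if d.get('found')}


sample = {'docs': [
    {'_id': 'entity-a', 'found': True, '_source': {'name': 'ACME'}},
    {'_id': 'entity-b', 'found': False},
]}
assert unpack_found(sample['docs']) == {'entity-a': {'name': 'ACME'}}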
def mget_safe(*args, **kwargs):
    # This is not supposed to be used in every location where search is
    # run, but only where it's a backend search that we could back off of
    # without hurting UX.
    for attempt in count():
        try:
            return es.mget(*args, **kwargs)
        except Exception as exc:
            log.warning("Search error: %s", exc)
            backoff_cluster(failures=attempt)
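# The same retry pattern as mget_safe, generalized into a self-contained
# sketch: backoff_cluster() is not shown in the snippet above, so a capped
# exponential sleep stands in for it here. Like mget_safe, this retries
# indefinitely and is only meant for background work, not request handling.
import logging
import time
from itertools import count

log = logging.getLogger(__name__)


def retry_with_backoff(func, *args, **kwargs):
    """Call func, retrying on any exception with increasing delays."""
    for attempt in count():
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            log.warning("Search error: %s", exc)
            time.sleep(min(2 ** attempt, 60))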
def _index_updates(entities, links):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity
    or link gets indexed twice with different field values, it'll add up
    the different field values into a single record. This is to avoid
    overwriting the document and losing field values. An alternative
    solution would be to implement this in Groovy on the ES.
    """
    if not len(entities):
        return
    result = es.mget(index=es_index, doc_type=TYPE_ENTITY,
                     body={'ids': entities.keys()})
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = merge_docs(entity, existing)
        combined['schema'] = schemata.merge_entity_schema(
            entity['schema'], existing['schema'])
        combined['roles'] = entity.get('roles', [])
        entities[entity_id] = combined

    for link in links:
        doc_id = link.pop('id', None)
        if doc_id is None:
            continue
        entity = entities.get(link.pop('remote'))
        if entity is None:
            continue
        entity = dict(entity)
        link['text'].extend(entity.pop('text', []))
        link['text'] = list(set(link['text']))
        link['remote'] = entity
        yield {
            '_id': doc_id,
            '_type': TYPE_LINK,
            '_index': str(es_index),
            '_source': link
        }

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        # from pprint import pprint
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_type': TYPE_ENTITY,
            '_index': str(es_index),
            '_source': entity
        }
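# Hedged sketch of how the action dicts yielded by _index_updates are
# typically consumed: elasticsearch.helpers.bulk() accepts exactly this
# ('_id', '_type', '_index', '_source') shape. The wrapper name and the
# chunk size / timeout values are assumptions, not taken from the original.
from elasticsearch.helpers import bulk


def flush_updates(es, entities, links):
    """Send all pending entity and link updates to the index in one pass."""
    actions = _index_updates(entities, links)
    bulk(es, actions, chunk_size=1000, request_timeout=120)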
def _resolve_index(self, cache):
    """Fetch index documents for the cached (type, id) keys in a single
    multi-get; Role keys are skipped."""
    query = []
    for (type_, id_) in cache.keys():
        if type_ == Role:
            continue
        query.append({'_index': type_, '_id': id_})

    if not len(query):
        return

    results = es.mget(body={'docs': query},
                      _source_exclude=['text'])
    for doc in results['docs']:
        cache[(doc['_index'], doc['_id'])] = unpack_result(doc)
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity
    or link gets indexed twice with different field values, it'll add up
    the different field values into a single record. This is to avoid
    overwriting the document and losing field values. An alternative
    solution would be to implement this in Groovy on the ES.
    """
    common = {
        'collection_id': collection.id,
        '$bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }

    if not len(entities):
        return

    result = es.mget(index=es_index,
                     doc_type=TYPE_ENTITY,
                     body={'ids': entities.keys()},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_type': TYPE_ENTITY,
            '_index': str(es_index),
            '_source': entity
        }
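# Illustration of the "accumulative" merge both docstrings describe: when an
# entity is indexed twice, multi-valued fields should be unioned rather than
# overwritten. This is an assumption about the effect of merge_docs() /
# model.merge() inferred from the docstrings, not their real implementation.
def accumulate(existing, update):
    """Union list-valued fields; let scalar fields from the update win."""
    combined = dict(existing)
    for field, values in update.items():
        if isinstance(values, list):
            combined[field] = sorted(set(existing.get(field, [])) | set(values))
        else:
            combined[field] = values
    return combined


old = {'names': ['ACME Inc.'], 'schema': 'Company'}
new = {'names': ['ACME Incorporated'], 'schema': 'Company'}
assert accumulate(old, new)['names'] == ['ACME Inc.', 'ACME Incorporated']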
def _resolve_index(self, cache):
    """Resolve cached collection and entity ids against the index in a
    single multi-get, keeping queries and results aligned by key."""
    queries = OrderedDict()
    for (type_, id_) in cache.keys():
        if type_ in [Collection]:
            index = collections_index()
            queries[(type_, id_)] = {'_index': index, '_id': id_}
        elif type_ in [Document, Entity]:
            index = entities_index()
            queries[(type_, id_)] = {'_index': index, '_id': id_}

    if not len(queries):
        return

    results = es.mget(body={'docs': queries.values()},
                      _source_exclude=['text'])
    for key, doc in zip(queries.keys(), results['docs']):
        cache[key] = unpack_result(doc)
def _resolve_index(self, cache):
    """Resolve cached collection and entity ids, querying every entity
    index and keeping the first copy of each document that is found."""
    queries = []
    for (type_, id_) in cache.keys():
        if type_ in [Collection]:
            index = collections_index()
            query = {'_index': index, '_id': id_}
            queries.append(((type_, id_), query))
        elif type_ in [Document, Entity]:
            for index in entities_index_list():
                query = {'_index': index, '_id': id_}
                queries.append(((type_, id_), query))

    if not len(queries):
        return

    results = es.mget(body={'docs': [q[1] for q in queries]},
                      _source_exclude=['text'])
    for (key, _), doc in zip(queries, results['docs']):
        if cache.get(key) is None:
            cache[key] = unpack_result(doc)
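# Self-contained sketch of the alignment logic in the resolver above: mget
# returns one doc per query, in the same order, so zipping the (key, query)
# pairs against results['docs'] lines each document up with its cache key,
# and the "is None" guard keeps the first index that actually held the
# document. unpack_result() is assumed to return None for docs that were not
# found; the index names and data below are invented for illustration.
queries = [(('Entity', 'x1'), {'_index': 'entities-v1', '_id': 'x1'}),
           (('Entity', 'x1'), {'_index': 'entities-v2', '_id': 'x1'})]
docs = [{'_index': 'entities-v1', '_id': 'x1', 'found': False},
        {'_index': 'entities-v2', '_id': 'x1', 'found': True,
         '_source': {'name': 'ACME'}}]
cache = {('Entity', 'x1'): None}
for (key, _), doc in zip(queries, docs):
    if cache.get(key) is None and doc.get('found'):
        cache[key] = doc['_source']
assert cache[('Entity', 'x1')] == {'name': 'ACME'}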