Example #1
    def __init__(self, request, query, parser=None, schema=None):
        super(MatchQueryResult, self).__init__(request, query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)
        body = {'ids': list(ids)}

        result = es.mget(index=es_index, doc_type=TYPE_ENTITY, body=body)
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is None:
                continue
            for match in self.results:
                if match.match_id == entity['id']:
                    match.match = entity
                if match.entity_id == entity['id']:
                    match.entity = entity

        # Do not return results if the entity has been removed in the
        # meantime. This may not be the ideal approach, since it will
        # skew pagination counts, but it avoids returning dangling matches.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
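
The query result is hydrated with a single multi-get: Elasticsearch returns one entry per requested ID, in request order, and marks documents that no longer exist with found set to false rather than omitting them. A minimal sketch of that round-trip, with placeholder index name and IDs:

# Minimal sketch: hydrate a batch of IDs with one mget round-trip.
# The index name and IDs are placeholders, not values from the example.
ids = ['id-1', 'id-2', 'id-3']
result = es.mget(index='entities', body={'ids': ids})
docs = {}
for doc in result.get('docs', []):
    # Documents deleted in the meantime come back with found=False.
    if doc.get('found'):
        docs[doc['_id']] = doc['_source']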
Example #2
from itertools import count

def mget_safe(*args, **kwargs):
    # Not meant for every place where search runs: use this only for
    # backend searches that can back off without hurting the UX.
    for attempt in count():
        try:
            return es.mget(*args, **kwargs)
        except Exception as exc:
            log.warning("Search error: %s", exc)
        backoff_cluster(failures=attempt)
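
backoff_cluster is not shown in this snippet; a plausible reading is a capped exponential sleep keyed on the number of consecutive failures. A hypothetical sketch under that assumption:

import time

# Hypothetical stand-in for the backoff_cluster helper used above:
# sleep longer after each consecutive failure, capped at 30 seconds.
def backoff_cluster(failures=0):
    time.sleep(min(2 ** failures, 30))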
Example #3
def _index_updates(entities, links):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity or link
    gets indexed twice with different field values, it'll add up the different
    field values into a single record. This is to avoid overwriting the
    document and losing field values. An alternative solution would be
    to implement this merge in Groovy on the Elasticsearch side.
    """
    if not len(entities):
        return

    # list() so the ids are JSON-serializable under Python 3.
    result = es.mget(index=es_index,
                     doc_type=TYPE_ENTITY,
                     body={'ids': list(entities.keys())})
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = merge_docs(entity, existing)
        combined['schema'] = schemata.merge_entity_schema(
            entity['schema'], existing['schema'])
        combined['roles'] = entity.get('roles', [])
        entities[entity_id] = combined

    for link in links:
        doc_id = link.pop('id', None)
        if doc_id is None:
            continue
        entity = entities.get(link.pop('remote'))
        if entity is None:
            continue
        entity = dict(entity)
        link['text'].extend(entity.pop('text', []))
        link['text'] = list(set(link['text']))
        link['remote'] = entity
        yield {
            '_id': doc_id,
            '_type': TYPE_LINK,
            '_index': str(es_index),
            '_source': link
        }

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        yield {
            '_id': doc_id,
            '_type': TYPE_ENTITY,
            '_index': str(es_index),
            '_source': entity
        }
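
merge_docs is likewise not shown; the docstring implies an accumulative merge in which list-valued fields are unioned rather than overwritten. A rough sketch of that behaviour, not the project's actual helper:

# Rough sketch of an accumulative merge: list fields are unioned,
# scalar fields take the value from the newer document.
def merge_docs(new, old):
    combined = dict(old or {})
    for key, value in new.items():
        if isinstance(value, list):
            combined[key] = list(set(combined.get(key, []) + value))
        else:
            combined[key] = value
    return combined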
Example #4
    def _resolve_index(self, cache):
        query = []
        for (type_, id_) in cache.keys():
            if type_ == Role:
                continue
            query.append({'_index': type_, '_id': id_})

        if not len(query):
            return

        results = es.mget(body={'docs': query}, _source_exclude=['text'])
        for doc in results['docs']:
            cache[(doc['_index'], doc['_id'])] = unpack_result(doc)
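
Unlike the {'ids': [...]} body used earlier, the {'docs': [...]} form lets each entry target its own index. Note that _source_exclude is the parameter name in older elasticsearch-py releases; newer releases spell it _source_excludes. A minimal sketch of the request shape, with placeholder index names and IDs:

# The 'docs' form of an mget body names an index per entry;
# the index names and IDs here are placeholders.
body = {'docs': [
    {'_index': 'entities', '_id': 'id-1'},
    {'_index': 'collections', '_id': 'id-2'},
]}
results = es.mget(body=body, _source_exclude=['text'])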
Example #5
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity or link
    gets indexed twice with different field values, it'll add up the different
    field values into a single record. This is to avoid overwriting the
    document and losing field values. An alternative solution would be
    to implement this merge in Groovy on the Elasticsearch side.
    """
    common = {
        'collection_id': collection.id,
        '$bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    # list() so the ids are JSON-serializable under Python 3.
    result = es.mget(index=es_index,
                     doc_type=TYPE_ENTITY,
                     body={'ids': list(entities.keys())},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        yield {
            '_id': doc_id,
            '_type': TYPE_ENTITY,
            '_index': str(es_index),
            '_source': entity
        }
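
The dicts yielded above have the action shape expected by the bulk helpers in elasticsearch-py, so the generator can be streamed straight into an indexing call. A usage sketch, assuming collection and entities are prepared by the caller:

from elasticsearch.helpers import bulk

# Stream the generated update actions into batched bulk requests.
bulk(es, _index_updates(collection, entities), chunk_size=500)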
Example #6
    def _resolve_index(self, cache):
        queries = OrderedDict()
        for (type_, id_) in cache.keys():
            if type_ in [Collection]:
                index = collections_index()
                queries[(type_, id_)] = {'_index': index, '_id': id_}
            elif type_ in [Document, Entity]:
                index = entities_index()
                queries[(type_, id_)] = {'_index': index, '_id': id_}

        if not len(queries):
            return

        results = es.mget(body={'docs': list(queries.values())},
                          _source_exclude=['text'])
        for key, doc in zip(queries.keys(), results['docs']):
            cache[key] = unpack_result(doc)
Example #7
    def _resolve_index(self, cache):
        queries = []
        for (type_, id_) in cache.keys():
            if type_ in [Collection]:
                index = collections_index()
                query = {'_index': index, '_id': id_}
                queries.append(((type_, id_), query))
            elif type_ in [Document, Entity]:
                for index in entities_index_list():
                    query = {'_index': index, '_id': id_}
                    queries.append(((type_, id_), query))

        if not len(queries):
            return

        results = es.mget(body={'docs': [q[1] for q in queries]},
                          _source_exclude=['text'])
        for (key, _), doc in zip(queries, results['docs']):
            if cache.get(key) is None:
                cache[key] = unpack_result(doc)
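
Both of the last two variants rely on mget returning its docs array in the same order as the request, which is what makes zipping the queries against the results safe. A compact illustration of that invariant, with a placeholder index name:

# mget preserves request order, so responses can be zipped back onto
# the keys that produced them.
keys = [('entity', 'a'), ('entity', 'b')]
body = {'docs': [{'_index': 'entities', '_id': id_} for _, id_ in keys]}
results = es.mget(body=body)
for (type_, id_), doc in zip(keys, results['docs']):
    assert doc['_id'] == id_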