Example 1
    def get_tfidf_model(self):
        ''' Builds a TF-IDF model from the text of every node in the stored
        document trees. '''
        corpus = []
        for data in self.__data:
            # Breadth-first traversal of the document tree, collecting each
            # node's text into the corpus.
            queue = list(data['tree'].get('children', []))
            while queue:
                node = queue.pop(0)
                if 'text' in node:
                    corpus.append(node['text'])
                if 'children' in node:
                    queue += node['children']

        # Delegate to the module-level get_tfidf_model() helper.
        return get_tfidf_model(corpus)
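
The module-level get_tfidf_model() that Example 1 delegates to (and that Example 2 unpacks into a vectorizer, matrix and feature names) is not part of this excerpt. A minimal sketch of what it might look like, assuming scikit-learn's TfidfVectorizer; the real implementation may differ:

from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_model(corpus):
    # Fit a TF-IDF model on the corpus and return the triple that
    # Example 2 unpacks: (vectorizer, corpus_matrix, feature_names).
    vectorizer = TfidfVectorizer()
    corpus_matrix = vectorizer.fit_transform(corpus)
    # get_feature_names_out() requires scikit-learn >= 1.0; older
    # versions use get_feature_names().
    feature_names = vectorizer.get_feature_names_out()
    return vectorizer, corpus_matrix, feature_names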
Example 2
import logging

from sklearn.metrics.pairwise import cosine_similarity

# expand_query, factory, _get_corpus_text, _get_answer, _format_answer and
# _handle_not_found, along with the CHAR_LIMIT, MAX_ANSWERS and
# MULTIPLE_ANSWERS constants, are project-level helpers defined elsewhere.
# The cosine_similarity import assumes scikit-learn.

def _perform_search(query_text, url_style):
    ''' Takes a query string and finds the best matching document in the
    database. '''

    logging.info('Pre expansion: {}'.format(query_text))

    # Perform simple query expansion on the original query.
    query = expand_query(query_text)

    logging.info('Post expansion: {}'.format(query))

    # Retrieve a set of documents using MongoDB. We then attempt to filter
    # these further.
    docs = factory.get_document(query)

    # Prevent generating an empty corpus if no documents were found.
    if not docs:
        return _handle_not_found(query_text)

    # Create a corpus on the results from the MongoDB query.
    corpus = [_get_corpus_text(doc) for doc in docs]

    # Create a TF-IDF model on the corpus.
    vectorizer, corpus_matrix, feature_names = get_tfidf_model(corpus)

    try:
        # Compare the search query with all documents in our new model using
        # cosine similarity.
        scores = cosine_similarity(vectorizer.transform([query]),
                                   corpus_matrix)[0].tolist()

        # Rank document indices by score. Iterating over ranked indices
        # (rather than scores.index(), which always returns the first match)
        # keeps tied scores mapped to distinct documents.
        ranked = sorted(range(len(scores)), key=scores.__getitem__,
                        reverse=True)
        sorted_scores = [scores[i] for i in ranked]

        # This could be calculated using the mean of all scores and the
        # standard deviation.
        if sorted_scores[0] < 0.05:
            return _handle_not_found(query_text)

        # Allow returning multiple answers if they rank very similarly.
        answers = []

        for rank, score in zip(ranked, sorted_scores):
            # Tolerance for similarity between scores.
            if sorted_scores[0] - score > 0.1:
                break

            # Add this result to the list of answers.
            answers.append(_get_answer(docs[rank]))

        if len(answers) == 1:
            # Return the answer straight away if there is only one result.
            return _format_answer(answers[0], url_style)

        # Count how many answers fit within CHAR_LIMIT.
        i, n_chars = 0, 0
        while n_chars < CHAR_LIMIT and i < len(answers):
            n_chars += len(answers[i])
            i += 1

        # If only one answer fits within the character limit we don't want
        # to add the MULTIPLE_ANSWERS preamble to the response.
        if max(i, 1) == 1:
            return _format_answer(answers[0], url_style)

        # Join the results with a separator, still capping the number of
        # answers at MAX_ANSWERS.
        answers = answers[:min(max(i, 1), MAX_ANSWERS)]
        answers = [_format_answer(ans, url_style) for ans in answers]
        return '\n\n---\n\n'.join([MULTIPLE_ANSWERS] + answers)
    except KeyError as err:
        raise Exception('Document does not have content and texts.') from err
    except ValueError:
        return _handle_not_found(query_text)
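
For reference, the scoring and tolerance logic above can be exercised in isolation. A small sketch with a made-up corpus and query, assuming scikit-learn; the 0.1 tolerance mirrors the one used in _perform_search():

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = [
    'how to reset your password',
    'resetting a forgotten password',
    'office opening hours',
]
vectorizer = TfidfVectorizer()
corpus_matrix = vectorizer.fit_transform(corpus)

query = 'reset password'
scores = cosine_similarity(vectorizer.transform([query]),
                           corpus_matrix)[0].tolist()

# Rank indices by score and keep every document whose score is within the
# tolerance of the best one.
ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
best = scores[ranked[0]]
answers = [corpus[i] for i in ranked if best - scores[i] <= 0.1]
print(answers)  # documents whose scores tie within the tolerance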