Example no. 1
import logging
import operator
from collections import OrderedDict

# Assumption: Index, IndexNotLoadedException, io (file helpers), preprocessing
# and helpers come from project-local modules that are not shown here.
_logger = logging.getLogger(__name__)


class SearchEngine(object):
    """
    Create a search engine.

    - Build an index on the given knowledge base
    - Tokenize query
    - Get intersection of postings
    - Return search results accordingly
    """

    def __init__(self):
        """
        Initialize a search engine instance.
        """
        self.index = Index()

    def build_index(self, path_to_knowledge_base, path_to_index_dir):
        """
        Take in a file with articles and create an index instance.

        Args:
            path_to_knowledge_base (str)
            path_to_index_dir (str)
        """
        # if an index already exists on disk, load it instead of rebuilding
        if io.exists(path_to_index_dir):
            _logger.info('Index located at %s already exists', path_to_index_dir)
            self.load_index(path_to_index_dir)
            return

        _logger.info('Creating index from knowledge base %s', path_to_knowledge_base)

        # otherwise, create it
        raw_content = io.read(path_to_knowledge_base)

        _logger.debug('Creating postings')
        self.index.build_index(raw_content)
        _logger.debug('Calculating tfidf')
        self.index.calculate_tfidf()
        _logger.debug('Writing index')
        self.index.save(path_to_index_dir)
        _logger.debug('Done writing index')

    def load_index(self, path_to_index_dir):
        """
        Load index instance.
        """
        _logger.debug('Loading index from %s', path_to_index_dir)
        self.index = self.index.load(path_to_index_dir)
        _logger.debug('Done loading index')

    def search(self, query, num_of_results):
        """
        Run the search engine for a given query.

        Args:
            query (str)
            num_of_results (int): number of results to be returned
        Returns:
            list[tuple[str, str]]: results as (article id, title) pairs
        """
        if self.index is None:
            raise IndexNotLoadedException('You need to create or load index first')

        tokens = preprocessing.tokenize(query)

        frequencies = preprocessing.count_frequency(tokens)
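        # e.g. a query like "solar panel solar" might yield tokens such as
        # ['solar', 'panel', 'solar'] and frequencies such as {'solar': 2, 'panel': 1}
        # (illustrative values; the exact output depends on the preprocessing module)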
        articles = self._postings_intersections(frequencies.keys())

        if len(articles) == 0:
            return []

        ranked_scores = self._rank(frequencies, articles)

        article_ids = list(ranked_scores.keys())
        titles = [self.index.articles[article_id].title
                  for article_id in article_ids]

        results = list(zip(article_ids, titles))

        return results[:num_of_results]

    def _postings_intersections(self, tokens):
        """
        Return intersection of postings for given tokens.

        Args:
            tokens (list[str])
        Returns:
            dict{str, str}: article ids and their titles
        """
        # get article intersection for all tokens
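        # e.g. if postings['cat'] were {'a1', 'a2'} and postings['dog'] were
        # {'a2', 'a3'}, the intersection would be {'a2'} (illustrative ids;
        # postings values are sets of article ids, as required by set.intersection)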
        set_list = [self.index.postings[token] for token in tokens]
        intersection = set.intersection(*set_list)

        # get their titles
        result = {}
        for article_id in intersection:
            result[article_id] = self.index.articles[article_id].title

        return result

    def _rank(self, frequencies, articles):
        """
        Rank returned search results.

        Args:
            frequencies (dict{str:int})
            articles (dict{str, str}): article ids and their titles

        Returns:
            OrderedDict: article ids mapped to similarity scores, best match first
        """
        # lookup tfidf values of returned articles
        returned_articles_tfidf = {}
        for article_id in articles:
            returned_articles_tfidf[
                article_id] = self.index.articles[article_id].tfidf

        # calculate tfidf of current query
        query_tfidf = {}
        for token, frequency in frequencies.items():
            query_tfidf[token] = float(
                frequency) * float(self.index.token_idf[token])
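        # e.g. a token that appears twice in the query and has an idf of 1.5 gets
        # a query weight of 2 * 1.5 = 3.0 (illustrative numbers; raw count times idf)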

        # calculate cosine similarities
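        # e.g. for a query vector {'cat': 2.0, 'dog': 1.0} and an article vector
        # {'cat': 1.0, 'dog': 3.0}, the score would be
        # (2*1 + 1*3) / (sqrt(5) * sqrt(10)) ~= 0.71, assuming
        # helpers.cosine_similarity implements the standard cosine formula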
        similarity_scores = {}

        for article_id in returned_articles_tfidf:
            current_score = helpers.cosine_similarity(
                query_tfidf, returned_articles_tfidf[article_id])

            similarity_scores[article_id] = current_score

        # sort by descending score ~ rank search results, best match first
        sorted_scores = sorted(similarity_scores.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

        return OrderedDict(sorted_scores)
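

# Minimal usage sketch (not part of the class above): the paths and the query
# are hypothetical, and Index, io, preprocessing, helpers and
# IndexNotLoadedException are assumed to come from the project's own modules.
if __name__ == '__main__':
    engine = SearchEngine()
    engine.build_index('data/knowledge_base.txt', 'data/index')

    results = engine.search('solar panels', num_of_results=5)
    for article_id, title in results:
        print(article_id, title)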