Example no. 1
import logging
import operator
from collections import OrderedDict

# Assumption: Index, IndexNotLoadedException, io (file helpers), preprocessing
# and helpers come from project-local modules that are not shown here.
_logger = logging.getLogger(__name__)


class SearchEngine(object):
    """
    Create a search engine.

    - Build an index on the given knowledge base
    - Tokenize query
    - Get intersection of postings
    - Return search results accordingly
    """

    def __init__(self):
        """
        Initialize a search engine instance.
        """
        self.index = Index()

    def build_index(self, path_to_knowledge_base, path_to_index_dir):
        """
        Take in a file with articles and create an index instance.

        Args:
            path_to_knowledge_base (str)
            path_to_index_dir (str)
        """
        # if an index already exists on disk, load it instead of rebuilding
        if io.exists(path_to_index_dir):
            _logger.info('Index located at %s already exists', path_to_index_dir)
            self.load_index(path_to_index_dir)
            return

        _logger.info('Creating index from knowledge base %s', path_to_knowledge_base)

        # otherwise, create it
        raw_content = io.read(path_to_knowledge_base)

        _logger.debug('Creating postings')
        self.index.build_index(raw_content)
        _logger.debug('Calculating tfidf')
        self.index.calculate_tfidf()
        _logger.debug('Writing index')
        self.index.save(path_to_index_dir)
        _logger.debug('Done writing index')

    def load_index(self, path_to_index_dir):
        """
        Load index instance.
        """
        _logger.debug('Loading index from %s', path_to_index_dir)
        self.index = self.index.load(path_to_index_dir)
        _logger.debug('Done loading index')

    def search(self, query, num_of_results):
        """
        Run the search engine for a given query.

        Args:
            query (str)
            num_of_results (int): number of results to be returned
        Returns:
            list[tuple[str, str]]: results as (article id, title) pairs
        """
        if self.index is None:
            raise IndexNotLoadedException('You need to create or load index first')

        tokens = preprocessing.tokenize(query)

        frequencies = preprocessing.count_frequency(tokens)
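        # e.g. a query like "solar panel solar" might yield tokens such as
        # ['solar', 'panel', 'solar'] and frequencies such as {'solar': 2, 'panel': 1}
        # (illustrative values; the exact output depends on the preprocessing module)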
        articles = self._postings_intersections(frequencies.keys())

        if len(articles) == 0:
            return []

        ranked_scores = self._rank(frequencies, articles)

        article_ids = list(ranked_scores.keys())
        titles = [self.index.articles[article_id].title
                  for article_id in article_ids]

        results = list(zip(article_ids, titles))

        return results[:num_of_results]

    def _postings_intersections(self, tokens):
        """
        Return intersection of postings for given tokens.

        Args:
            tokens (list[str])
        Returns:
            dict{str, str}: article ids and their titles
        """
        # get article intersection for all tokens
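        # e.g. if postings['cat'] were {'a1', 'a2'} and postings['dog'] were
        # {'a2', 'a3'}, the intersection would be {'a2'} (illustrative ids;
        # postings values are sets of article ids, as required by set.intersection)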
        set_list = [self.index.postings[token] for token in tokens]
        intersection = set.intersection(*set_list)

        # get their titles
        result = {}
        for article_id in intersection:
            result[article_id] = self.index.articles[article_id].title

        return result

    def _rank(self, frequencies, articles):
        """
        Rank returned search results.

        Args:
            frequencies (dict{str:int})
            articles (dict{str, str}): article ids and their titles

        Returns:
            OrderedDict: article ids mapped to similarity scores, best match first
        """
        # lookup tfidf values of returned articles
        returned_articles_tfidf = {}
        for article_id in articles:
            returned_articles_tfidf[
                article_id] = self.index.articles[article_id].tfidf

        # calculate tfidf of current query
        query_tfidf = {}
        for token, frequency in frequencies.items():
            query_tfidf[token] = float(
                frequency) * float(self.index.token_idf[token])
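        # e.g. a token that appears twice in the query and has an idf of 1.5 gets
        # a query weight of 2 * 1.5 = 3.0 (illustrative numbers; raw count times idf)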

        # calculate cosine similarities
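        # e.g. for a query vector {'cat': 2.0, 'dog': 1.0} and an article vector
        # {'cat': 1.0, 'dog': 3.0}, the score would be
        # (2*1 + 1*3) / (sqrt(5) * sqrt(10)) ~= 0.71, assuming
        # helpers.cosine_similarity implements the standard cosine formula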
        similarity_scores = {}

        for article_id in returned_articles_tfidf:
            current_score = helpers.cosine_similarity(
                query_tfidf, returned_articles_tfidf[article_id])

            similarity_scores[article_id] = current_score

        # sort by descending score ~ rank search results, best match first
        sorted_scores = sorted(similarity_scores.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

        return OrderedDict(sorted_scores)
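

# Minimal usage sketch (not part of the class above): the paths and the query
# are hypothetical, and Index, io, preprocessing, helpers and
# IndexNotLoadedException are assumed to come from the project's own modules.
if __name__ == '__main__':
    engine = SearchEngine()
    engine.build_index('data/knowledge_base.txt', 'data/index')

    results = engine.search('solar panels', num_of_results=5)
    for article_id, title in results:
        print(article_id, title)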