Example #1
    def doc_vect(self, result_docs):
        """
        takes the result set of documents and creates a vector representation out of it.
        INPUT
            a set of documents, [doc1, doc32, ...]
        OUTPUT
            vector space representation of each [doc1 => [0.3, 0.11, 0.01, ...], doc2 => [0.001, 0.08, ...]]
        """
        start_time = begin_time(None)

        temp_dict = {}
        unique_words = self.inverted_index.keys()

        # since the tf-idf scores are all precomputed during index time,
        # this is just fetching the scores as needed.

        for doc in result_docs:
            # the vector has as many dimensions as number of unique words/tokens in the corpus
            vectorised_doc = [0] * len(unique_words)
            try:
                for ind, term in enumerate(unique_words):
                    vectorised_doc[ind] = self.built_index.get_tfidf_scores(
                        term, doc)

                # hash map it
                temp_dict[doc] = vectorised_doc

            except Exception as ex:
                raise Exception("Exception while vectorising", ex)

        # end_time("only docs ", start_time)

        return temp_dict
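
get_tfidf_scores itself is not shown in these examples; a minimal sketch of the lookup, assuming tf is keyed by (term, document) pairs and idf by term as in Example #6 (this storage layout is an assumption):

    def get_tfidf_scores(self, term, doc):
        # hypothetical layout: self.tf maps (term, doc) pairs to raw term
        # frequency, self.idf maps terms to inverse document frequency
        tf = self.tf.get((term, doc), 0)
        return tf * self.idf.get(term, 0.0)

Since every score is precomputed at index time, doc_vect reduces to one dictionary lookup per (term, document) pair.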
Example #2
    def rank_results(self, result_set, query_terms):
        """
        ranking algorithm. Basically matches two vectorised representation of the query and the resultant document list
        """
        start_time = begin_time(None)

        # Naive way: rank by frequency of occurrence in the document
        if Setup.fast_search:
            results = self.filtered_result_set(result_set, query_terms)

        # Vectorise by tf-idf and score document similarity by vector dot
        # product; this is slower than the naive path above
        else:
            # vectorize the result documents with tf-idf scores
            result_docs_vectorised = self.doc_vect(result_set)

            # vectorize the query terms with tf-idf again
            query_vectorised = self.query_vect(query_terms)

            # find the cosine similarity between result vectors and query vector
            results = [[
                self.dot_product(result_docs_vectorised[result],
                                 query_vectorised), result
            ] for result in result_set]

            # sort by descending similarity values
            results.sort(key=lambda x: x[0], reverse=True)

            # grab the document ids
            results = [x[1] for x in results]

        end_time("Ranking", start_time)

        return results
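
dot_product is not shown either; a minimal sketch, assuming a plain (unnormalised) dot product as the name suggests, plus a normalised cosine variant for reference:

    import math

    def dot_product(self, vec_a, vec_b):
        # plain dot product; matches cosine similarity only when both
        # vectors have been L2-normalised beforehand
        return sum(a * b for a, b in zip(vec_a, vec_b))

    def cosine_similarity(vec_a, vec_b):
        # normalised variant, guarding against zero-length vectors
        norm_a = math.sqrt(sum(a * a for a in vec_a))
        norm_b = math.sqrt(sum(b * b for b in vec_b))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return sum(a * b for a, b in zip(vec_a, vec_b)) / (norm_a * norm_b)

Note that the comment in the code speaks of cosine similarity while the call computes a dot product; the two produce the same ranking only when the document vectors share a common norm.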
Example #3
    def build_id_to_tokens_dict(self, file_df):
        """
        # INPUT
            pandas dataframe of size |total documents|  X 3 (id, title and body)
          OUTPUT
            doc1 => [w11, w12, w13,....],
            doc2 => [w21, w22, w23,....],
        """
        start_time = begin_time("Tokenising the documents")

        for index, row in file_df.iterrows():
            # key is the document id,
            # value is the list of words in that document
            content = re.sub(r"[^\w]", " ", row['doc_body'].lower())

            # stemmed and stop words removed
            clean_content = [
                self.stemmer.stem(word)
                for word in content.split()
                if word not in self.cached_stop_words
            ]
            self.id_tokens_map[row['doc_id']] = clean_content

            # simultaneously maintain an id to title mapping for results display
            self.id_titles_map[row['doc_id']] = row['doc_title']

            # stop once the configured percentage of the corpus is indexed
            perc_completed = 100 * index / float(len(file_df))
            if perc_completed > Setup.data_set_limit:
                print("indexed {} documents".format(index))
                break

        end_time("Tokenising the documents", start_time)
Example #4
    def create_inverted_index(self, input_file):
        """
        The main function for inverted indexing. It calls a set of sub-routines to achieve this
        """
        # create a data frame
        file_df = pd.read_csv(input_file,
                              sep='\t',
                              names=["doc_id", "doc_title", "doc_body"])

        start_time = begin_time("Inverted Index Building")

        # first create id to tokens dictionary
        self.build_id_to_tokens_dict(file_df)

        # use that to find which tokens occur in which documents
        self.make_indices(self.id_tokens_map)

        # keep in memory the tf, idf scores
        self.generate_all_tfidf()

        end_time("Inverted Index Building", start_time)
Example #5
    def search(self, phrase):
        """
        Generic search function. Splits query phrases and retrieves individual lists.
        """
        start_time = begin_time("Document search")

        query_terms = re.sub(r"[^\w]", " ", phrase).lower()
        result = []
        formatted_query = []

        for term in query_terms.split():
            # remove stopwords from query
            if term not in self.built_index.cached_stop_words:
                # stem words
                term = self.built_index.stemmer.stem(term)
                formatted_query.append(term)
                result += self.single_term_query(term)

        # keep documents returned by more than one query term;
        # for a 3-term query we want the duplicate count to be > 2
        limit = max(1, len(formatted_query) - 1)
        intersection = set(x for x in result if result.count(x) > limit)

        end_time("Document search", start_time)

        query_terms = ' '.join(formatted_query)

        if len(intersection) == 0:
            # single-term query, or no document matched more than one term:
            # fall back to ranking the raw result list
            self.results = self.rank_results(result, query_terms)
        else:
            self.results = self.rank_results(list(intersection), query_terms)

        # fancy printing
        print("Search Results:\n--------------")
        for result in self.results[:Setup.top_k_results]:
            print("{}\t{}".format(result, self.titles_map[result]))
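
single_term_query is not shown; with an inverted index of the shape sketched above it is a single posting-list lookup (the attribute names are assumptions):

    def single_term_query(self, term):
        # hypothetical lookup: the posting list (document ids) for one
        # stemmed term; unseen terms yield an empty list
        return list(self.built_index.complete_inverted_index.get(term, []))

Because search() concatenates these lists, a document's duplicate count in result equals the number of query terms that matched it, which is what the intersection step above exploits.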
Example #6
    def generate_all_tfidf(self):
        """
        generating a tf-idf score and pre-populating it
        for each unique word in each document.
        We use here the tf and df to compute the score
        """

        start_time = begin_time("tf-idf score computation")

        for term in self.complete_inverted_index:
            try:
                if term in self.df:
                    self.idf[term] = self.get_idf_score(
                        len(self.id_titles_map), self.df[term])
                else:
                    self.idf[term] = 0
            except Exception as ex:
                raise Exception("Exception in tf-idf", ex)

        end_time("tf-idf score computation", start_time)

        return self.df, self.tf, self.idf
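
get_idf_score is not shown; a minimal sketch using the common log(N / df) variant (the exact formula is an assumption):

    import math

    def get_idf_score(self, num_docs, doc_freq):
        # inverse document frequency: rare terms score higher;
        # doc_freq is assumed to be >= 1 for any indexed term
        return math.log(float(num_docs) / doc_freq)

With tf and idf held in memory, the tf-idf for any (term, document) pair is just tf * idf, which is what the get_tfidf_scores sketch in Example #1 fetches.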