def doc_vect(self, result_docs):
    """
    Takes the result set of documents and creates a vector representation out of it.
    INPUT:  a set of documents, [doc1, doc32, ...]
    OUTPUT: vector space representation of each document,
            [doc1 => [0.3, 0.11, 0.01, ...], doc2 => [0.001, 0.08, ...]]
    """
    start_time = begin_time(None)
    temp_dict = {}
    unique_words = self.inverted_index.keys()
    # since the tf-idf scores are all precomputed during index time,
    # this is just fetching the scores as needed.
    for doc in result_docs:
        # the vector has as many dimensions as the number of unique words/tokens in the corpus
        vectorised_doc = [0] * len(unique_words)
        try:
            ind = 0
            for term in unique_words:
                vectorised_doc[ind] = self.built_index.get_tfidf_scores(term, doc)
                ind += 1
            # hash map it: document id => its vector
            temp_dict[doc] = vectorised_doc
        except Exception as ex:
            raise Exception("Exception while vectorising", ex)
    # end_time("only docs ", start_time)
    return temp_dict
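# NOTE: doc_vect() above relies on built_index.get_tfidf_scores(term, doc), which is not shown
# in this section. The helper below is an illustrative sketch (an assumption, not the original
# implementation) of how that lookup could work if the tf counts and idf weights are kept in
# per-term dictionaries populated at index time.
def get_tfidf_scores(self, term, doc):
    # tf-idf = (term frequency in the document) * (inverse document frequency of the term);
    # returns 0 when the term never occurs in the document
    tf = self.tf.get(term, {}).get(doc, 0)
    return tf * self.idf.get(term, 0)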
def rank_results(self, result_set, query_terms):
    """
    Ranking algorithm. Matches the vectorised representations of the query
    and the resulting document list.
    """
    start_time = begin_time(None)
    # Naive way: rank by frequency of occurrence in the document
    if Setup.fast_search:
        results = self.filtered_result_set(result_set, query_terms)
    # Vectorised by tf-idf and document similarity by vector dot product;
    # this is slower than the naive approach
    else:
        # vectorise the result documents with tf-idf scores
        result_docs_vectorised = self.doc_vect(result_set)
        # vectorise the query terms with tf-idf as well
        query_vectorised = self.query_vect(query_terms)
        # find the similarity between each result vector and the query vector
        results = [[self.dot_product(result_docs_vectorised[result], query_vectorised), result]
                   for result in result_set]
        # sort by descending similarity values
        results.sort(key=lambda x: x[0], reverse=True)
        # grab the document ids
        results = [x[1] for x in results]
    end_time("Ranking", start_time)
    return results
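# rank_results() above uses two helpers that are not shown in this section: query_vect() and
# dot_product(). The sketches below are assumptions about their shape, kept deliberately
# minimal; the actual module may normalise the vectors or weight the query terms differently.
def query_vect(self, query_terms):
    # one dimension per unique corpus term, in the same order as doc_vect() uses;
    # a query term scores its idf weight times its frequency in the query
    terms = query_terms.split()
    return [self.idf.get(term, 0) * terms.count(term)
            for term in self.inverted_index.keys()]

def dot_product(self, doc_vector, query_vector):
    # plain inner product of two equal-length vectors
    return sum(d * q for d, q in zip(doc_vector, query_vector))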
def build_id_to_tokens_dict(self, file_df):
    """
    INPUT:  pandas dataframe of size |total documents| x 3 (id, title and body)
    OUTPUT: doc1 => [w11, w12, w13, ...], doc2 => [w21, w22, w23, ...]
    """
    start_time = begin_time("Tokenising the documents")
    for index, row in file_df.iterrows():
        # key is the document id, value is the list of words in that document
        content = re.sub(r"[^\w]", " ", row['doc_body'].lower())
        # stemmed and stop words removed
        clean_content = [
            self.stemmer.stem(word.strip())
            for word in content.split()
            if word not in self.cached_stop_words and len(word) > 0
        ]
        self.id_tokens_map[row['doc_id']] = clean_content
        # simultaneously maintain an id to title mapping for results display
        self.id_titles_map[row['doc_id']] = row['doc_title']
        perc_completed = 100 * index / float(len(file_df))
        if perc_completed > Setup.data_set_limit:
            print "indexed {} documents".format(index)
            break
    end_time("Tokenising the documents", start_time)
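# A standalone illustration of the cleaning step above. The stemmer and stop word list here
# are stand-ins (NLTK's PorterStemmer and a tiny hand-made set are assumed), not the module's
# own attributes, but the transformation has the same shape.
def _tokenise_example():
    import re
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    stop_words = {"the", "a", "is"}
    content = re.sub(r"[^\w]", " ", "The quick brown foxes, jumping!".lower())
    tokens = [stemmer.stem(w) for w in content.split() if w not in stop_words]
    return tokens  # -> ['quick', 'brown', 'fox', 'jump']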
def create_inverted_index(self, input_file):
    """
    The main function for inverted indexing. It calls a set of sub-routines to achieve this.
    """
    # create a data frame from the tab-separated input file
    file_df = pd.read_csv(input_file, sep='\t',
                          names=["doc_id", "doc_title", "doc_body"])
    start_time = begin_time("Inverted Index Building")
    # first create the id to tokens dictionary
    self.build_id_to_tokens_dict(file_df)
    # use that to find which tokens occur in which documents
    self.make_indices(self.id_tokens_map)
    # keep the tf and idf scores in memory
    self.generate_all_tfidf()
    end_time("Inverted Index Building", start_time)
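# make_indices() is called above but defined elsewhere in the module. The sketch below is an
# assumption about its core job: invert the id -> tokens map into a token -> document ids map,
# while counting term frequencies (tf) and document frequencies (df) for the tf-idf step.
def make_indices(self, id_tokens_map):
    for doc_id, tokens in id_tokens_map.items():
        seen_in_doc = set()
        for token in tokens:
            # posting list: every document in which the token occurs
            self.inverted_index.setdefault(token, set()).add(doc_id)
            # tf: how often the token occurs in this document
            self.tf.setdefault(token, {})
            self.tf[token][doc_id] = self.tf[token].get(doc_id, 0) + 1
            # df: in how many distinct documents the token occurs
            if token not in seen_in_doc:
                self.df[token] = self.df.get(token, 0) + 1
                seen_in_doc.add(token)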
def search(self, phrase):
    """
    Generic search function. Splits the query phrase and retrieves the
    result list for each individual term.
    """
    start_time = begin_time("Document search")
    query_terms = re.sub(r"[^\w]", " ", phrase).lower()
    result = []
    formatted_query = []
    for term in query_terms.split():
        # remove stopwords from the query
        if term not in self.built_index.cached_stop_words:
            # stem the query term
            term = self.built_index.stemmer.stem(term)
            formatted_query.append(term)
            result += self.single_term_query(term)
    # get the duplicate ones, meaning documents shared by multiple query terms;
    # for a 3 term query we want the count of duplicates to be > 2
    limit = max(1, len(formatted_query) - 1)
    intersection = set([x for x in result if result.count(x) > limit])
    end_time("Document search", start_time)
    query_terms = ' '.join(formatted_query)
    if len(intersection) == 0 or len(query_terms.split()) <= 1:
        # single term query, or no document shares enough of the query terms:
        # fall back to the union of the single-term results
        self.results = self.rank_results(result, query_terms)
    else:
        # phrase query: rank only the documents shared by the query terms
        self.results = self.rank_results(list(intersection), query_terms)
    # fancy printing
    print "Search Results:\n--------------"
    cnt = 0
    while cnt < min(Setup.top_k_results, len(self.results)):
        result = self.results[cnt]
        print "{}\t{}".format(result, self.titles_map[result])
        cnt += 1
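# single_term_query() used above is not shown in this section. A minimal sketch, assuming the
# built index keeps a posting list per term, would simply look the already-stemmed term up and
# return the matching document ids (unknown terms return an empty list).
def single_term_query(self, term):
    return list(self.built_index.inverted_index.get(term, []))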
def generate_all_tfidf(self):
    """
    Pre-populates the idf score for each unique term in the corpus, using the
    document frequency (df) gathered at index time; together with the stored tf
    counts this gives the tf-idf score for each term in each document.
    """
    start_time = begin_time("tf-idf score computation")
    for term in self.complete_inverted_index:
        try:
            if term in self.df:
                self.idf[term] = self.get_idf_score(len(self.id_titles_map), self.df[term])
            else:
                self.idf[term] = 0
        except Exception as ex:
            raise Exception("Exception in tf-idf", ex)
    end_time("tf-idf score computation", start_time)
    return self.df, self.tf, self.idf
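# get_idf_score() above is defined elsewhere in the module. A common choice, and the assumption
# made in this sketch, is the log-scaled ratio of the corpus size to the term's document
# frequency; the caller's `term in self.df` check guarantees doc_freq > 0.
import math

def get_idf_score(self, total_docs, doc_freq):
    # idf = log(N / df)
    return math.log(float(total_docs) / doc_freq)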