def _extract_document_summations(cls):
    """Accumulate the per-document sums used by Vector Space Cosine scoring.

    Two passes are made over ``cls.posting_list``, whose values are iterated
    as postings with the document id at index 0 and the term frequency at
    index 1:

    1. The squared tf-idf of every posting is added to
       ``cls.doc_list[doc_id]['tf_idf_sum']``.
    2. Each posting's tf-idf is divided by that document's ``'tf_idf_sum'``
       and the squared quotient is added to
       ``cls.doc_list[doc_id]['sum_weight']``.

    Values already stored under either key are added to, not reset.
    Progress is reported through ``util.update_progress`` when
    ``ENV.PROGRESS_BAR`` is enabled.
    """
    posting_list = cls.posting_list
    doc_list = cls.doc_list

    # Neither container gains or loses keys during the passes, so their sizes
    # are read once instead of once per term / once per posting.
    term_count = float(len(posting_list))
    collection_size = len(doc_list)

    def report_progress(fraction):
        # Compared with ``== True`` on purpose so the flag is interpreted
        # exactly as it was before.
        if ENV.PROGRESS_BAR == True:
            util.update_progress(fraction)

    def accumulate(doc_id, key, amount):
        # Start the running total on first sight of the key, add afterwards.
        entry = doc_list[doc_id]
        if key in entry:
            entry[key] += amount
        else:
            entry[key] = amount

    # Single-argument parenthesised print behaves the same as a print
    # statement and as the print function.
    print("\nExtracting document tf-idf summations for use in Vector Space Cosine...")
    report_progress(0)
    for idx, term in enumerate(posting_list):
        report_progress(float(idx) / term_count)
        # The document frequency depends only on the term, not on the posting.
        term_df = cls.get_df_by_term_id(term)
        for doc in posting_list[term]:
            tfidf = qp.calculate_tf_idf(doc[1], term_df, collection_size)
            accumulate(doc[0], 'tf_idf_sum', np.square(tfidf))
    report_progress(1)

    print("\nExtracting document weight summations for use in Vector Space Cosine...")
    report_progress(0)
    for idx, term in enumerate(posting_list):
        report_progress(float(idx) / term_count)
        term_df = cls.get_df_by_term_id(term)
        for doc in posting_list[term]:
            tfidf = qp.calculate_tf_idf(doc[1], term_df, collection_size)
            # NOTE(review): the divisor is the raw sum of squared tf-idf
            # values from the first pass (no square root) - confirm that this
            # is the intended normalisation.
            weight = float(tfidf) / float(doc_list[doc[0]]['tf_idf_sum'])
            accumulate(doc[0], 'sum_weight', np.square(weight))
    report_progress(1)
def extract_vector_space_cosine_scores(query, index):
    """Score documents against ``query`` with the vector space cosine model.

    Args:
        query: object providing ``extractValidPhrases(stop_terms)`` and
            ``extractTermInformation()``; the one matching
            ``ENV.QUERY_PROCESSING_INDEX`` is called and is expected to
            return a ``{term: tf}`` mapping.
        index: object providing the term-id, posting-list, document-frequency,
            collection-size and document-weight lookups used below.

    Returns:
        A list of ``[doc_id, score]`` pairs sorted by score, highest first.
        Only documents that share at least one query term with a non-zero
        query weight are included.
    """
    # Query terms and their frequencies, in the format {term: tf}.
    if ENV.QUERY_PROCESSING_INDEX == 'PHRASE':
        q_term_info_dict = query.extractValidPhrases(ENV.STOP_TERMS)
    else:
        q_term_info_dict = query.extractTermInformation()

    # Re-key the query frequencies by term id, dropping terms the index does
    # not know about.
    q_tid_info_dict = {}
    for term_name in q_term_info_dict:
        tid = index.get_term_id_by_term(term_name)
        if tid is not None:
            q_tid_info_dict[tid] = q_term_info_dict[term_name]

    # Posting list entries for the query terms, in the format
    # {termId: [[doc1, tf], [doc2, tf]]}.
    relevant_posting_entries = index.get_posting_entries_by_terms(q_term_info_dict.keys())

    # Looked up once rather than once per term and once per posting.
    collection_size = index.get_collection_size()

    # Aggregate squared tf-idf over the query terms; used to normalise each
    # query term weight.
    query_total_summation = 0.0
    for term_id in relevant_posting_entries:
        query_total_summation += np.square(
            qp.calculate_tf_idf(q_tid_info_dict[term_id],
                                index.get_df_by_term_id(term_id),
                                collection_size))

    # {doc_id: [[query_term_weight, document_term_weight], ...]}
    document_weights = {}
    for term_id in relevant_posting_entries:
        term_df = index.get_df_by_term_id(term_id)
        query_term_weight = calculate_term_weight(
            q_tid_info_dict[term_id], term_df, collection_size,
            query_total_summation)

        # A zero query weight contributes nothing for any document, so the
        # whole posting list for this term is skipped up front.
        if query_term_weight == 0:
            continue

        for doc in relevant_posting_entries[term_id]:
            doc_id = doc[0]
            doc_tf = doc[1]
            document_term_weight = calculate_term_weight(
                doc_tf, term_df, collection_size,
                index.get_document_weight_summation(doc_id))
            document_weights.setdefault(doc_id, []).append(
                [query_term_weight, document_term_weight])

    # One cosine score per document, computed from its collected weight pairs.
    final_scores = [
        [doc_id,
         calculate_vector_space_cosine(
             document_weights[doc_id],
             index.get_document_weight_summation2(doc_id))]
        for doc_id in document_weights
    ]
    final_scores.sort(key=operator.itemgetter(1), reverse=True)
    return final_scores
def calculate_term_weight(tf, df, collection_size, document_tf_idf_summation):
    """Return a term's tf-idf divided by ``document_tf_idf_summation``.

    The tf-idf value comes from ``qp.calculate_tf_idf(tf, df,
    collection_size)``; no guard is applied against a zero divisor.
    """
    raw_tf_idf = qp.calculate_tf_idf(tf, df, collection_size)
    return raw_tf_idf / document_tf_idf_summation