def binary_search_test(tokens, doc_ids):
    # Generate the inverted index for the collection
    print("Generating inverted index")
    start_time = time()
    inverted_index = InvertedIndex.invert_index(tokens)
    end_time = time()
    print("Generation finished, took: %.2f seconds" % (end_time - start_time))

    print("Binary search AND test starting")
    start_time = time()
    res = binarySearch("network AND computer", inverted_index, doc_ids)
    end_time = time()
    print("AND test finished, returned: {}, took: {} seconds".format(
        res, end_time - start_time))

    print("Binary search OR test starting")
    start_time = time()
    res = binarySearch("network OR computer", inverted_index, doc_ids)
    end_time = time()
    # res[:10] rather than res[10]: show the first ten matches, not the 11th
    print("OR test finished, returned: {}, took: {} seconds".format(
        res[:10], end_time - start_time))

    print("Binary search NOT test starting")
    start_time = time()
    res = binarySearch("NOT computer", inverted_index, doc_ids)
    end_time = time()
    print("NOT test finished, returned: {}, took: {} seconds".format(
        res[:10], end_time - start_time))
    return True
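# binarySearch is used as a black box above; for reference, a minimal sketch
# of the AND merge it presumably performs over two sorted posting lists.
# _intersect_postings_sketch is a hypothetical helper, not part of the
# project's API.
def _intersect_postings_sketch(postings_a, postings_b):
    """Linear-time intersection of two sorted lists of document ids."""
    i, j, out = 0, 0, []
    while i < len(postings_a) and j < len(postings_b):
        if postings_a[i] == postings_b[j]:
            out.append(postings_a[i])
            i += 1
            j += 1
        elif postings_a[i] < postings_b[j]:
            i += 1
        else:
            j += 1
    return out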
def test_accuracy_recall(doc_list, tokens, return_doc_count):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))
    for key in tqdm(list(test_rels.keys())):
        cleaned_query = VectorialModel.parse_query(test_queries[key])
        postings = VectorialModel.posting_union(cleaned_query, inverted_index)
        vectors = VectorialModel.doc_vectors_ponderation(
            postings, cleaned_query, inverted_index, docs)
        query_vector = VectorialModel.generate_query_vector(
            cleaned_query, inverted_index, len(doc_list))
        cosines = VectorialModel.cosinus(query_vector, vectors)
        res = VectorialModel.search_result(cosines, postings)
        res = res[:return_doc_count]
        related_docs = test_rels[key]
        # tp: relevant documents retrieved; fp: retrieved but not relevant
        tp = 0
        fp = 0
        for doc in res:
            if str(doc) in related_docs:
                tp += 1
            else:
                fp += 1
        # "accuracy" here is precision: relevant retrieved / retrieved
        accuracy = tp / len(res)
        # recall: relevant retrieved / total relevant for the query
        recall = tp / len(related_docs)
        accuracies.append(accuracy)
        recalls.append(recall)
    accuracies = np.asarray(accuracies)
    recalls = np.asarray(recalls)
    mean_accuracy = np.mean(accuracies)
    min_accuracy = np.min(accuracies)
    max_accuracy = np.max(accuracies)
    print("Accuracy, recall tests for %i documents returned" %
          return_doc_count)
    print("Accuracy results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_accuracy, max_accuracy, min_accuracy))
    mean_recall = np.mean(recalls)
    min_recall = np.min(recalls)
    max_recall = np.max(recalls)
    print("Recall results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_recall, max_recall, min_recall))
    beta = mean_accuracy / mean_recall
    alpha = 1 / (beta**2 + 1)
    e_mesure = 1 - 1 / (alpha / mean_accuracy + (1 - alpha) / mean_recall)
    f_mesure = 1 - e_mesure
    print("E-mesure: %.2f, F-Mesure: %.2f" % (e_mesure, f_mesure))
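# For reference, the E- and F-measures printed above follow van Rijsbergen's
# definitions; a standalone sketch under the same convention (hypothetical
# helper, not part of the project's API; assumes precision and recall > 0):
def _e_f_measures_sketch(precision, recall):
    """E = 1 - 1 / (alpha/P + (1-alpha)/R) with alpha = 1/(beta**2 + 1)
    and beta = P/R, as in the function above; F is its complement."""
    beta = precision / recall
    alpha = 1 / (beta**2 + 1)
    e = 1 - 1 / (alpha / precision + (1 - alpha) / recall)
    return e, 1 - e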
def vectorial_search_test(doc_list, tokens):
    # Generate the inverted index for the collection
    print("Generating inverted index")
    start_time = time()
    inverted_index = InvertedIndex.invert_index(tokens)
    end_time = time()
    print("Generation finished, took: %.2f seconds" % (end_time - start_time))

    cleaned_query = VectorialModel.parse_query(
        "computer science applied to networks")
    postings = VectorialModel.posting_union(cleaned_query, inverted_index)

    print("Generating document vectors")
    start_time = time()
    vectors = VectorialModel.doc_vectors(postings, cleaned_query,
                                         inverted_index)
    end_time = time()
    print("Generation finished, took %.2f seconds" % (end_time - start_time))

    print("Searching using cosine measurement")
    start_time = time()
    # keep the parsed query and its vector in separate variables: the parsed
    # form is still needed for the ponderated vectors below
    query_vector = VectorialModel.generate_query_vector(
        cleaned_query, inverted_index, len(doc_list))
    cosines = VectorialModel.cosinus(query_vector, vectors)
    res = VectorialModel.search_result(cosines, postings)
    end_time = time()
    print("Search finished, took %.2f seconds" % (end_time - start_time))
    print("First ten results: {}".format(res[:10]))

    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))

    print("Generating ponderated vectors")
    start_time = time()
    ponderated_vectors = VectorialModel.doc_vectors_ponderation(
        postings, cleaned_query, inverted_index, docs)
    end_time = time()
    print("Generation finished, took %.2f seconds" % (end_time - start_time))

    print("Searching using cosine measurement")
    start_time = time()
    p_cosines = VectorialModel.cosinus(query_vector, ponderated_vectors)
    res = VectorialModel.search_result(p_cosines, postings)
    end_time = time()
    print("Search finished, took %.2f seconds" % (end_time - start_time))
    print("First ten results: {}".format(res[:10]))
    return True
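# VectorialModel.cosinus is used as a black box above; a minimal sketch of
# the cosine similarity it presumably computes between the query vector and
# one document vector (hypothetical helper, not the project's implementation):
def _cosine_sketch(query_vec, doc_vec):
    """Cosine of the angle between two equal-length weight vectors."""
    dot = sum(q * d for q, d in zip(query_vec, doc_vec))
    norm = (sum(q * q for q in query_vec) ** 0.5 *
            sum(d * d for d in doc_vec) ** 0.5)
    return dot / norm if norm else 0.0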
def test_accuracy_recall(documents, tokens, return_doc_count=10):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    docs = {}
    for doc_id in documents.keys():
        docs[doc_id] = len(documents[doc_id].post_tokens)
    for key in tqdm(list(test_rels.keys())):
        cleaned_query = VectorialModel.parse_query(test_queries[key])
        postings = VectorialModel.posting_union(cleaned_query, inverted_index)
        vectors = VectorialModel.doc_vectors_ponderation(
            postings, cleaned_query, inverted_index, docs)
        query_vector = VectorialModel.generate_query_vector(
            cleaned_query, inverted_index, len(documents.keys()))
        cosines = VectorialModel.cosinus(query_vector, vectors)
        res = VectorialModel.search_result(cosines, postings)
        res = res[:return_doc_count]
        related_docs = test_rels[key]
        # tp: relevant documents retrieved; fp: retrieved but not relevant
        tp = 0
        fp = 0
        for doc in res:
            if str(doc) in related_docs:
                tp += 1
            else:
                fp += 1
        # "accuracy" here is precision: relevant retrieved / retrieved
        accuracy = tp / len(res)
        # recall: relevant retrieved / total relevant for the query
        recall = tp / len(related_docs)
        accuracies.append(accuracy)
        recalls.append(recall)
        # The following lines were used to spot-check that the returned
        # documents are indeed relevant; they stay commented out because
        # the `break` would end the evaluation after the first query.
        # print(test_queries[key])
        # for doc_id in res[:10]:
        #     print(documents[str(doc_id)].title)
        # break
    accuracies = np.asarray(accuracies)
    recalls = np.asarray(recalls)
    mean_accuracy = np.mean(accuracies)
    min_accuracy = np.min(accuracies)
    max_accuracy = np.max(accuracies)
    print("Accuracy, recall tests for %i documents returned" %
          return_doc_count)
    print("Accuracy results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_accuracy, max_accuracy, min_accuracy))
    mean_recall = np.mean(recalls)
    min_recall = np.min(recalls)
    max_recall = np.max(recalls)
    print("Recall results: Mean %.2f, Max %.2f, Min %.2f" %
          (mean_recall, max_recall, min_recall))
    # E- and F-measures computed from the mean precision and recall, as in
    # the variant of this function above
    beta = mean_accuracy / mean_recall
    alpha = 1 / (beta**2 + 1)
    e_mesure = 1 - 1 / (alpha / mean_accuracy + (1 - alpha) / mean_recall)
    f_mesure = 1 - e_mesure
    print("E-mesure: %.2f, F-Mesure: %.2f" % (e_mesure, f_mesure))
def accuracy_recall_graph(doc_list, tokens):
    inverted_index = InvertedIndex.invert_index(tokens)
    test_queries = parse_query_text()
    test_rels = parse_qrels()
    accuracies = []
    recalls = []
    keys = list(test_rels.keys())
    query = test_queries[keys[0]]
    rels = test_rels[keys[0]]
    cleaned_query = VectorialModel.parse_query(query)
    postings = VectorialModel.posting_union(cleaned_query, inverted_index)
    vectors = VectorialModel.doc_vectors(postings, cleaned_query,
                                         inverted_index)
    query_vector = VectorialModel.generate_query_vector(
        cleaned_query, inverted_index, len(doc_list))
    cosines = VectorialModel.cosinus(query_vector, vectors)
    res = VectorialModel.search_result(cosines, postings)
    # precision and recall at each cutoff from 1 to 50 returned documents
    for i in range(1, 51):
        temp_res = res[:i]
        tp = 0
        for doc in temp_res:
            if str(doc) in rels:
                tp += 1
        # precision at cutoff i: relevant retrieved / retrieved
        accuracy = tp / len(temp_res)
        # recall at cutoff i: relevant retrieved / total relevant
        recall = tp / len(rels)
        accuracies.append(accuracy)
        recalls.append(recall)
    print(accuracies)
    print(recalls)
    x = np.linspace(1, 50, 50)
    plt.plot(x, accuracies, color="r")  # precision in red
    plt.plot(x, recalls, color="b")     # recall in blue
    plt.show()
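# The cutoff loop above computes precision@k and recall@k for a single query;
# equivalent standalone definitions (hypothetical helpers, not part of the
# project's API):
def _precision_at_k_sketch(results, relevant, k):
    """Fraction of the k first results that are relevant."""
    return sum(1 for d in results[:k] if str(d) in relevant) / k

def _recall_at_k_sketch(results, relevant, k):
    """Fraction of all relevant documents found in the k first results."""
    return sum(1 for d in results[:k] if str(d) in relevant) / len(relevant)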
if __name__ == "__main__":
    common_words = read_stop_words(common_words_file)
    docs = parse_entry_file(file, common_words)
    tokens, vocab = calculate_tokens_and_vocab(docs)
    print("Number of tokens: %i, Vocabulary size: %i" %
          (len(tokens), len(vocab)))
    htokens, hvocab = half_way_tokens_and_vocab(docs)
    print("Halfway tokens: %i, Halfway vocab: %i" %
          (len(htokens), len(hvocab)))
    # Heaps' law parameters, estimated from the full and half collections
    b = log(len(vocab) / len(hvocab)) / log(len(tokens) / len(htokens))
    k = len(vocab) / pow(len(tokens), b)
    print("k: %.2f, b: %.2f" % (k, b))
    voc_million = k * (10**6)**b
    print("Vocabulary for 1 million tokens: %.2f" % voc_million)
    tokens = get_tokens_dict(docs)
    inverted_index = InvertedIndex.invert_index(tokens)
    frequencies = calculate_term_frequencies(inverted_index)
    log_freqs = [log(freq) for freq in frequencies]
    ranks = [i for i in range(1, len(frequencies) + 1)]
    log_ranks = [log(rank) for rank in ranks]
    frequencies = frequencies[::-1]
    # Zipf's law plots: rank vs. frequency, then in log-log scale
    # plt.plot(ranks[:200], frequencies[:200])
    # plt.show()
    # plt.plot(log_ranks[:200], log_freqs[:200])
    # plt.show()
    test_accuracy_recall(docs, tokens, 50)
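# The k and b computed above are the parameters of Heaps' law, V = k * T**b,
# solved from two (token count, vocabulary size) measurements; a standalone
# sketch of the same estimation (hypothetical helper, not the project's API):
from math import log as _log

def _heaps_parameters_sketch(t_full, v_full, t_half, v_half):
    """Solve V = k * T**b from the full and half-collection counts."""
    b = _log(v_full / v_half) / _log(t_full / t_half)
    k = v_full / (t_full ** b)
    return k, b

# e.g. the predicted vocabulary for one million tokens is k * (10**6)**b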
def main():
    try:
        # "-f"/"--file" takes an argument, hence "f:" and "file="
        opts, args = getopt.getopt(sys.argv[1:], "hf:", ["help", "file="])
    except getopt.GetoptError:
        print("For help, type: --help")
        sys.exit(2)

    HELP = """
    This program tokenizes an input file respecting the following format,
    then calculates its vocabulary:
        - .I : document's id
        - .T : indicates the start of the document's title
        - .W : indicates the start of the document's summary
        - .K : indicates the start of the document's keywords
    options:
        -h --help : shows this help message
        -f --file : the path to the input file
    """

    filename = None
    for o, a in opts:
        if o in ("-h", "--help"):
            print(HELP)
            sys.exit(0)
        elif o in ("-f", "--file"):
            # getopt places the option's argument in `a`
            filename = a
    if filename is None:
        print("Please provide a filename")
        sys.exit(1)

    doc_list = DocumentParser.parse_entry_file(filename)
    tokens, token_counts = DocumentParser.tokenize(doc_list,
                                                   "cacm/common_words")
    vocab, vocab_lengths = DocumentParser.calculate_vocabulary(tokens)
    for i in range(len(token_counts)):
        token_counts[i] = math.log10(token_counts[i])
    for i in range(len(vocab_lengths)):
        vocab_lengths[i] = math.log10(vocab_lengths[i])
    token_counts = np.asarray(token_counts)
    vocab_lengths = np.asarray(vocab_lengths)
    # Heaps' law estimation by linear regression on the log-log data
    # coefs = estimate_coef(token_counts, vocab_lengths)
    # print(coefs)
    # k = 10**coefs[0]
    # b = coefs[1]
    # print("k= " + str(k) + " b= " + str(b))
    # voc_million = k * (10**6)**b
    # print("Vocabulary for 1 million tokens: " + str(voc_million))
    # plot_regression_line(token_counts, vocab_lengths, coefs)

    doc_ids = []
    for doc in doc_list:
        doc_ids.append(doc.id)
    inverted_index = InvertedIndex.invert_index(tokens)
    # res = binarySearch("network AND computer", inverted_index, doc_ids)
    # print(res)

    cleaned_q = VectorialModel.parse_query(
        "computer science applied to networks")
    posting = VectorialModel.posting_union(cleaned_q, inverted_index)
    vectors = VectorialModel.doc_vectors(posting, cleaned_q, inverted_index)
    # build the query vector before the cosine step, as in the test functions
    query_vector = VectorialModel.generate_query_vector(
        cleaned_q, inverted_index, len(doc_list))
    cosines = VectorialModel.cosinus(query_vector, vectors)
    vecmod_result = VectorialModel.search_result(cosines, posting)
    print(vecmod_result[:10])

    docs = {}
    common_words = DocumentParser.read_common_words()
    for doc in doc_list:
        docs[doc.id] = len(
            DocumentParser.remove_common_words(doc.tokenize(), common_words))
    ponderated_vectors = VectorialModel.doc_vectors_ponderation(
        posting, cleaned_q, inverted_index, docs)
    p_cosines = VectorialModel.cosinus(query_vector, ponderated_vectors)
    p_vecmod_result = VectorialModel.search_result(p_cosines, posting)
    print(p_vecmod_result[:10])
    print(ponderated_vectors[posting.index(2900)])

    # release the large intermediate structures before exiting
    tokens = None
    token_counts = None
    vocab_lengths = None
    vocab = None
    gc.collect()
    sys.exit(0)
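# Typical invocation (the collection filename below is an assumption; the
# CACM collection file is commonly named cacm.all):
#   python main.py -f cacm/cacm.all
if __name__ == "__main__":
    main()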