            # Per-directory metrics (inside the loop over resource directories).
            self.directory_results[resources] = {}
            self.directory_results[resources]['mean_reciprocal_rank'] = mean_reciprocal_rank(directory_results_mrr)
            self.directory_results[resources]['mean_average_precision'] = mean_average_precision(directory_results_mrr)
            all_results_mrr += directory_results_mrr

        # Corpus-wide metrics over the results of every directory combined.
        return self.inner_results, self.directory_results, {
            "mean_reciprocal_rank": mean_reciprocal_rank(all_results_mrr),
            "mean_average_precision": mean_average_precision(all_results_mrr)
        }


finalEvaluator = FinalEvaluator("output/results.data")
inner_results, directory_results, outer_results = finalEvaluator.final_evaluate()
print_model_into_file(inner_results, "output/basic_model/0_inner_results.json")
print_model_into_file(directory_results, "output/basic_model/1_directory_results.json")
print_model_into_file(outer_results, "output/basic_model/2_outer_results.json")

finalEvaluator = FinalEvaluator("output/results_bm25.data")
inner_results, directory_results, outer_results = finalEvaluator.final_evaluate()
print_model_into_file(inner_results, "output/bm25_model/0_inner_results.json")
print_model_into_file(directory_results, "output/bm25_model/1_directory_results.json")
print_model_into_file(outer_results, "output/bm25_model/2_outer_results.json")
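# mean_reciprocal_rank and mean_average_precision are called above but defined
# elsewhere in the project. A minimal sketch of compatible implementations,
# assuming each entry of the input is a list of binary relevance judgments in
# rank order (1 = relevant), e.g. [0, 0, 1, 0] for a hit at rank 3:
import numpy as np

def mean_reciprocal_rank(rs):
    # Reciprocal rank of the first relevant result, averaged over all queries;
    # a query with no relevant result contributes 0.
    first_hits = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1.0 / (hits[0] + 1) if hits.size else 0.0 for hits in first_hits])

def mean_average_precision(rs):
    # Precision at each relevant rank, averaged per query, then over queries.
    def average_precision(r):
        r = np.asarray(r)
        hits = r.nonzero()[0]
        if hits.size == 0:
            return 0.0
        return float(np.mean([r[:k + 1].mean() for k in hits]))
    return np.mean([average_precision(r) for r in rs])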
import os
import json
from datetime import datetime

from bson import json_util  # provides object_hook for MongoDB-style JSON

from utils import print_model_into_pickle_file, print_model_into_file
# ModelQueryBM25 is defined elsewhere in the project and imported from there.


class EvaluatorBM25:
    def __init__(self):
        self.modelQuery = ModelQueryBM25(modelfile="model_files/output_bm25.data")
        self.results = {}

    def evaluate(self, path):
        for root, dirs, files in os.walk(path):
            self.results[root] = {}
            if root != 'news_resources/':  # skip the top-level directory itself
                print('files in ' + root + ': ' + ', '.join(files))
                for file in files:
                    if file.endswith('.json'):
                        fullpath = root + '/' + file
                        with open(fullpath) as f:  # load the article
                            article = json.load(f, object_hook=json_util.object_hook)
                        results = self.modelQuery.search_for_query(article['title'])
                        self.results[root][article['article_id']] = {"results": results, "article": article}
        return self.results


start = datetime.now()
evaluator = EvaluatorBM25()
results = evaluator.evaluate("news_resources/")
print_model_into_pickle_file(results, "output/results_bm25.data")
end = datetime.now()
print_model_into_file({"time_start": start, "time_end": end, "diff": str(end - start)},
                      "output/timings/evaluation_bm25.json")
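# print_model_into_file and print_model_into_pickle_file come from the project's
# utils module, which is not shown in this section. A minimal sketch of what
# they are assumed to do, given how they are called above (the bodies are guesses):
import json
import pickle
from bson import json_util

def print_model_into_pickle_file(model, filepath):
    # Serialize an arbitrary Python object (e.g. the results dict) with pickle.
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)

def print_model_into_file(model, filepath):
    # Dump a dict as JSON; json_util.default serializes the datetime values
    # that the timing dicts above contain.
    with open(filepath, 'w') as f:
        json.dump(model, f, default=json_util.default, indent=4)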
            # (The section opens mid-statement; the surrounding loop presumably
            # records the per-directory document count along these lines.)
            self.statistics[root]['number_of_documents'] = len(self.json_files[root])
            self.total_statistics['total_number_of_documents'] += len(self.json_files[root])
            self.total_statistics['total_average_query_length'] += self.statistics[root]['average_query_length']
            self.total_statistics['total_average_text_length'] += self.statistics[root]['average_text_length']

        # Unweighted mean over directories (see the worked example below).
        self.total_statistics['total_average_query_length'] /= len(self.json_files)
        self.total_statistics['total_average_text_length'] /= len(self.json_files)
        return self.json_files, self.statistics, self.total_statistics


INPUT_DIRECTORY = "news_resources/"
OUTPUT_FILEPATH = "output/"

datasetAnalyzer = DatasetAnalyzer()
json_files, statistics, total_statistics = datasetAnalyzer.analyze(INPUT_DIRECTORY)
print_model_into_file(statistics, OUTPUT_FILEPATH + "dataset_statistics/statistics.json")
print_model_into_file(total_statistics, OUTPUT_FILEPATH + "dataset_statistics/total_statistics.json")
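# Worked example of the averaging above. The totals are unweighted means over
# directories: each directory's average counts equally, regardless of how many
# articles it holds. Hypothetical numbers:
per_dir_avg_query_length = {'dir_a': 6.0, 'dir_b': 8.0}  # dir_a: 10 docs, dir_b: 90 docs
unweighted = sum(per_dir_avg_query_length.values()) / len(per_dir_avg_query_length)
print(unweighted)  # 7.0 -- what total_average_query_length reports
# A document-weighted mean would instead give (10 * 6.0 + 90 * 8.0) / 100 = 7.8.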
    def append_inverse_document_frequency(self):
        # Reweight each term column by its inverse document frequency:
        # idf = log(N / n_w), where n_w is the number of documents containing term w.
        docs, words = self.bag_of_words.shape
        for i in range(words):
            frequency_per_doc = self.bag_of_words[:, i]
            nw = 0
            for j in range(len(frequency_per_doc)):
                if frequency_per_doc[j] != 0:
                    nw += 1
            # Every indexed term occurs in at least one document, so nw >= 1 here.
            idf = np.log(docs / nw)
            self.bag_of_words[:, i] *= idf


start = datetime.now()
indexer = Indexer()
indexer.fill_terms("news_resources/")
indexer.fill_bags_of_words()
indexer.append_inverse_document_frequency()
print_model_into_pickle_file(
    {
        'matrix': scipy.sparse.csr_matrix(indexer.bag_of_words),
        'articles': indexer.article_list,
        'terms': indexer.terms
    }, "model_files/output.data")
end = datetime.now()
print_model_into_file(
    {
        "time_start": start,
        "time_end": end,
        "diff": str(end - start)
    }, "output/timings/index_creation.json")
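# The IDF loop above is O(docs * words) in pure Python. An equivalent vectorized
# form, assuming self.bag_of_words is a dense numpy array of raw term frequencies
# in which every term occurs in at least one document (function name hypothetical):
import numpy as np

def append_inverse_document_frequency_vectorized(bag_of_words):
    docs = bag_of_words.shape[0]
    nw = np.count_nonzero(bag_of_words, axis=0)  # documents containing each term
    idf = np.log(docs / nw)                      # same idf = log(N / n_w) as the loop
    return bag_of_words * idf                    # broadcasts across the term columns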