def __calculate_top_n_dice_neighbours_for_index(temporary_keyword_cache_id, n, index_id, index_directory,
                                                posting_lists, trainingdata_handler, training_data_filters):
    # Initialise the result structure: category -> reference word group -> top-n neighbours.
    top_dice_neighbours = {}
    for category in posting_lists:
        top_dice_neighbours[category] = {}
        for id_reference_word_group in posting_lists[category]:
            top_dice_neighbours[category][id_reference_word_group] = []

    index = cache.load(index_directory, index_id)
    print("Index id " + index_id)

    n_categories_left = len(posting_lists)
    for category in posting_lists:
        posting_lists_category = posting_lists[category]
        # TODO remove debug output
        print("Calculate keywords for category " + category)
        n_posts = sum([len(posting_lists_category[ref_group_id]) for ref_group_id in posting_lists_category])
        print("Number of posts for category: " + str(n_posts))
        print("Number of categories left: " + str(n_categories_left))
        n_categories_left = n_categories_left - 1
        for id_reference_word_group in posting_lists_category:
            posting_list_reference_word_group = posting_lists_category[id_reference_word_group]
            all_dice_neighbours = __get_all_dice_neighbours(posting_list_reference_word_group, index,
                                                            trainingdata_handler, training_data_filters)
            # Keep only the n highest-ranked Dice neighbours per reference word group.
            top_dice_neighbours[category][id_reference_word_group] = all_dice_neighbours[:n]

    cache.write(__RAW_KEYWORD_BY_INDEX_CACHE, temporary_keyword_cache_id, top_dice_neighbours)
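# The ranking above relies on __get_all_dice_neighbours, which is defined elsewhere.
# A minimal sketch of the underlying measure, assuming the standard Dice coefficient
# over posting lists (sets of document ids); this helper is illustrative only, not
# the module's actual implementation:
def _dice_coefficient_sketch(posting_list_a, posting_list_b):
    """Dice coefficient: 2 * |A intersect B| / (|A| + |B|), in [0, 1]."""
    set_a, set_b = set(posting_list_a), set(posting_list_b)
    if not set_a and not set_b:
        return 0.0
    return 2 * len(set_a & set_b) / (len(set_a) + len(set_b))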
def get_dice_keywords(keyword_specification):
    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Try to load the raw keywords from the cache; otherwise calculate them.
    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)
    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)
    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply the reference word filters and strip the Dice scores.
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    index_spec = {"index_directory": index_directory, "index_id": indices_id}
    reference_words_filter = keyword_specification["reference_word_filter"]
    new_reference_words = keyword_filters.apply_reference_word_filters(reference_words_filter, given_reference_words,
                                                                       raw_reference_words, index_spec)
    new_reference_words = __remove_dice_coefficients(new_reference_words)
    new_context_words = __remove_dice_coefficients(raw_context_words)
    reference_words = __merge_new_and_given_reference_words(given_reference_words, new_reference_words)
    return {"reference_words": reference_words, "context_words": new_context_words}
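# The keys accessed above and in __calculate_raw_dice_keywords imply the following
# shape for keyword_specification. The values and the nested word-group layout are
# illustrative placeholders inferred from this module, not ids from the project:
_EXAMPLE_KEYWORD_SPECIFICATION = {
    "seed_id": "example_seed",
    "given_reference_words": {"example_category": {"0": ["example term"]}},  # shape inferred from the fix-up helpers
    "reference_word_filter": ["example_filter"],
    "keyword_generate_algorithm": "dice",
    "parameters": {"weight_limit": "0.1", "max_number_of_key_words": "100"},
    "training_dataset": {
        "id": "example_training_set",
        "filters": ["example_preprocessing_filter"],
        "index_directory": "example_index_cache",  # set by prepare_keywords
        "index_id": "example_indices_id",          # set by prepare_keywords via get_all_index_indices
    },
}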
def prepare_gold_standard_categorization(experiment_spec, gold_standard_categorization_directory):
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    if cache.in_cache(gold_standard_categorization_directory, test_data_id):
        print("Gold standard categorization in cache: " + test_data_id)
        return
    test_dataset_handler = TestDatasetHandler(experiment_spec["test_dataset"]["id"])
    gold_standard_categorization = test_dataset_handler.get_gold_standard_categorization()
    pprint.pprint(gold_standard_categorization)
    cache.write(gold_standard_categorization_directory, test_data_id, gold_standard_categorization)
def prepare_index(experiment_spec, index_cache_directory):
    # Check that all indices are created, or initiate their creation.
    training_data_spec = experiment_spec["training_dataset"]
    index_specs = __get_all_index_specs(training_data_spec)
    for index_id in index_specs:
        index_spec = index_specs[index_id]
        if not cache.in_cache(index_cache_directory, index_id):
            index = index_factory.create_index(index_spec)
            cache.write(index_cache_directory, index_id, index)
            print("Created index " + index_id)
        else:
            print("Index present in cache " + index_id)
def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
    if cache.in_cache(freq_dists_cache_directory, freq_dist_map_id):
        print("FREQDISTS stored in cache: " + freq_dist_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    # Build a frequency distribution per document for each index type.
    index_types = ["word", "bigram", "trigram"]
    freq_dist_map = document_vectorization.get_freq_dists_map(test_document_term_map, index_types)
    pprint.pprint(freq_dist_map)
    cache.write(freq_dists_cache_directory, freq_dist_map_id, freq_dist_map)
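# get_freq_dists_map is defined in document_vectorization. A minimal sketch of what a
# per-document frequency distribution over the three index types could look like,
# assuming documents arrive as token lists; this is illustrative, not the module's code:
from collections import Counter

def _freq_dists_sketch(tokens):
    def ngrams(seq, size):
        return [" ".join(seq[i:i + size]) for i in range(len(seq) - size + 1)]
    return {
        "word": Counter(tokens),
        "bigram": Counter(ngrams(tokens, 2)),
        "trigram": Counter(ngrams(tokens, 3)),
    }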
def fix_olympic_games_2(keywords_no_filter_id):
    if cache.in_cache(__RAW_KEYWORD_CACHE, keywords_no_filter_id):
        keywords = cache.load(__RAW_KEYWORD_CACHE, keywords_no_filter_id)
        keyword_types = list(keywords.keys())
        for keyword_type in keyword_types:
            if "olympic games" in keywords[keyword_type]:
                print("Fix olympic games in keyword directory " + keywords_no_filter_id)
                # Rename the category key "olympic games" to "olympic_games".
                olympic_keywords = keywords[keyword_type].pop("olympic games")
                keywords[keyword_type]["olympic_games"] = olympic_keywords
                # Drop multiword keywords from reference word group '0' in every category.
                for category in keywords[keyword_type]:
                    keywords[keyword_type][category]['0'] = [keyword for keyword
                                                             in keywords[keyword_type][category]['0']
                                                             if " " not in keyword]
        cache.write(__RAW_KEYWORD_CACHE, keywords_no_filter_id, keywords)
        return keywords
def fix_olympic_games_1(keywords_id):
    # Same repair as fix_olympic_games_2, but for the filtered "keywords" cache.
    if cache.in_cache("keywords", keywords_id):
        keywords = cache.load("keywords", keywords_id)
        keyword_types = ["reference_words", "context_words"]
        for keyword_type in keyword_types:
            if "olympic games" in keywords[keyword_type]:
                print("Fix olympic games in keyword directory " + keywords_id)
                # Rename the category key "olympic games" to "olympic_games".
                olympic_keywords = keywords[keyword_type].pop("olympic games")
                keywords[keyword_type]["olympic_games"] = olympic_keywords
                # Drop multiword keywords from reference word group '0' in every category.
                for category in keywords[keyword_type]:
                    keywords[keyword_type][category]['0'] = [keyword for keyword
                                                             in keywords[keyword_type][category]['0']
                                                             if " " not in keyword]
        cache.write("keywords", keywords_id, keywords)
        return keywords
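# Both fix-up helpers above perform the same in-place repair on different caches.
# A small before/after illustration, using made-up keyword data:
#
#   before: {"reference_words": {"olympic games": {'0': ["usain bolt", "medal"]}}}
#   after:  {"reference_words": {"olympic_games": {'0': ["medal"]}}}
#
# The category key "olympic games" is renamed to "olympic_games", and keywords
# containing spaces are dropped from reference word group '0'.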
def prepare_tf_idf_vectors(experiment_spec, tf_idf_cache_directory, index_cache_directory):
    tf_idf_vector_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
    if cache.in_cache(tf_idf_cache_directory, tf_idf_vector_map_id):
        print("TF_IDF_VECTORS stored in cache: " + tf_idf_vector_map_id)
        return
    # Preprocess the test data into a document -> terms map.
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    print("Test data preprocessed")
    index_id_index_type_map = __get_index_id_index_type(experiment_spec["training_dataset"])
    index_types = ["word", "bigram", "trigram"]
    max_freq_map = index_factory.create_max_freq_term_by_index_types(test_document_term_map, index_types)
    print("max_freq_map calculated")
    tf_idf_vector_map = document_vectorization.get_docs_id_tf_idf_map(test_document_term_map, index_id_index_type_map,
                                                                      index_cache_directory, max_freq_map)
    pprint.pprint(tf_idf_vector_map)
    cache.write(tf_idf_cache_directory, tf_idf_vector_map_id, tf_idf_vector_map)
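# The max_freq_map above suggests maximum-frequency (augmented) term frequency
# normalisation. A minimal sketch of that weighting, assuming tf is normalised by the
# document's most frequent term and idf is the standard log ratio; whether
# get_docs_id_tf_idf_map uses exactly this variant is an assumption:
import math

def _tf_idf_sketch(term_freq, max_freq_in_doc, n_docs, doc_freq):
    tf = 0.5 + 0.5 * (term_freq / max_freq_in_doc)  # augmented term frequency
    idf = math.log(n_docs / doc_freq)               # inverse document frequency
    return tf * idf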
def prepare_keywords(experiment_spec, keyword_cache_directory, index_cache_directory):
    keyword_spec = experiment_spec["keywords"]
    keyword_method = keyword_spec["keyword_generate_algorithm"]

    # Manual keyword setups are read from their specification and cached directly.
    if keyword_method == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_spec["setup_id"])
        keyword_setup_id = keyword_spec["setup_id"]
        keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id,
                                                                     experiment_spec["training_dataset"])
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keywords now stored in cache")
        return

    keyword_setup_id = keyword_spec["setup_id"]
    keyword_seed_id = keyword_spec["seed_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id,
                                                                 experiment_spec["training_dataset"])
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keywords stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    # Build the specification for generating a new keyword setup.
    create_new_keywords_spec = {}
    training_data_spec = experiment_spec["training_dataset"]
    create_new_keywords_spec["seed_id"] = keyword_spec["seed_id"]
    create_new_keywords_spec["training_dataset"] = training_data_spec
    create_new_keywords_spec["training_dataset"]["index_directory"] = index_cache_directory
    create_new_keywords_spec["training_dataset"]["index_id"] = get_all_index_indices(training_data_spec)
    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION,
                                                                    keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)
    create_new_keywords_spec["given_reference_words"] = given_reference_words
    create_new_keywords_spec["keyword_generate_algorithm"] = keyword_spec["keyword_generate_algorithm"]
    create_new_keywords_spec["parameters"] = keyword_spec["parameters"]
    create_new_keywords_spec["reference_word_filter"] = keyword_spec["reference_word_filter"]
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
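# The branches above imply this shape for experiment_spec["keywords"]; the values are
# illustrative placeholders:
_EXAMPLE_KEYWORDS_SPEC = {
    "keyword_generate_algorithm": "dice",  # "manual" setups are loaded from keyword_setups instead
    "setup_id": "example_setup",
    "seed_id": "example_seed",
    "parameters": {"weight_limit": "0.1", "max_number_of_key_words": "100"},
    "reference_word_filter": ["example_filter"],
}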
def __calculate_raw_dice_keywords(keyword_specification):
    # Load parameters.
    training_dataset_spec = keyword_specification["training_dataset"]
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    training_data_filters = training_dataset_spec["filters"]
    trainingdata_handler = TrainingDatasetHandler(training_dataset_spec["id"])
    given_reference_words = keyword_specification["given_reference_words"]
    weight_limit = float(keyword_specification["parameters"]["weight_limit"])
    max_number_of_key_words = int(keyword_specification["parameters"]["max_number_of_key_words"])
    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    # Calculate the posting lists for the reference word groups.
    posting_lists = posting_list_handler.get_seed_words_posting_lists(given_reference_words, index_directory,
                                                                      indices_id)
    posting_lists_id = raw_keywords_id
    cache.write("posting_lists", posting_lists_id, posting_lists)
    print("Merged posting lists calculated")

    # Rank the Dice neighbours and split them into reference and context words.
    top_dice_neighbours = __get_top_n_dice_neighbours(max_number_of_key_words, posting_lists, index_directory,
                                                      indices_id, trainingdata_handler, training_data_filters,
                                                      raw_keywords_id)
    new_reference_words, new_context_words = __seperate_reference_and_context_words(top_dice_neighbours, weight_limit)
    return {"raw_reference_words": new_reference_words, "raw_context_words": new_context_words}
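# __seperate_reference_and_context_words is defined elsewhere. A plausible sketch of the
# split, assuming neighbours arrive as (term, dice_score) pairs and weight_limit is the
# score threshold separating reference words from context words; this is an assumption,
# not the confirmed implementation:
def _separate_by_weight_limit_sketch(scored_neighbours, weight_limit):
    reference_words = [(term, score) for term, score in scored_neighbours if score >= weight_limit]
    context_words = [(term, score) for term, score in scored_neighbours if score < weight_limit]
    return reference_words, context_words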