def start_loading(self, project):
    """Begin loading *project* asynchronously on a low-priority worker thread.

    Warms the cache from ``self.filename``, wires the loader's Qt signals to
    the corresponding slots on ``self``, then starts the thread.
    """
    cache.load(self.filename)
    loader = ProjectLoader(project)
    loader.started.connect(self.started)
    loader.progress.connect(self.progress)
    loader.loaded.connect(self.loaded)
    self.project_loader = loader
    loader.start(QtCore.QThread.IdlePriority)
def __calculate_top_n_dice_neighbours_for_index(temporary_keyword_cache_id, n, index_id, index_directory, posting_lists, trainingdata_handler, training_data_filters):
    """Compute the top-*n* dice neighbours for every (category, reference word
    group) pair against one index, and persist the result under
    *temporary_keyword_cache_id* in the per-index keyword cache.

    posting_lists maps category -> reference_word_group_id -> posting list.
    """
    # Result skeleton: an empty neighbour list per reference word group.
    top_dice_neighbours = {
        category: {group_id: [] for group_id in groups}
        for category, groups in posting_lists.items()
    }
    index = cache.load(index_directory, index_id)
    print("Index id " + index_id)
    n_categories_left = len(posting_lists)
    for category, posting_lists_category in posting_lists.items():
        # TODO remove progress prints
        print("Calculate keyword for category " + category)
        # Generator instead of a throwaway list inside sum().
        n_posts = sum(len(posts) for posts in posting_lists_category.values())
        print("Number of posts for category:" + str(n_posts))
        print("Number of categories left " + str(n_categories_left))
        n_categories_left -= 1
        for group_id, posting_list in posting_lists_category.items():
            all_dice_neighbours = __get_all_dice_neighbours(posting_list, index, trainingdata_handler, training_data_filters)
            # Keep only the n best-scoring neighbours for this group.
            top_dice_neighbours[category][group_id] = all_dice_neighbours[:n]
    cache.write(__RAW_KEYWORD_BY_INDEX_CACHE, temporary_keyword_cache_id, top_dice_neighbours)
def get_dice_keywords(keyword_specification):
    """Build dice-based keywords for *keyword_specification*.

    Raw keywords are taken from the cache when present, otherwise calculated
    and cached. The configured reference-word filters are then applied, dice
    scores are stripped, and the surviving reference words are merged with the
    given seed reference words.
    """
    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Try load keywords from cache, else calculate and cache them.
    seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(seed_id, training_dataset_spec)
    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)

    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply reference-word filters, then clean the dice scores away.
    index_spec = {
        "index_directory": training_dataset_spec["index_directory"],
        "index_id": training_dataset_spec["index_id"],
    }
    word_filters = keyword_specification["reference_word_filter"]
    filtered_reference_words = keyword_filters.apply_reference_word_filters(word_filters, given_reference_words, raw_reference_words, index_spec)
    filtered_reference_words = __remove_dice_coefficients(filtered_reference_words)
    cleaned_context_words = __remove_dice_coefficients(raw_context_words)

    reference_words = __merge_new_and_given_reference_words(given_reference_words, filtered_reference_words)
    return {"reference_words": reference_words, "context_words": cleaned_context_words}
def freqent_term_filter(new_reference_words, index_directory, indices_id, frequence_limit):
    """Filter reference words on document frequency across every listed index.

    Each index in *indices_id* is loaded from *index_directory* and applied in
    turn, so a word must pass the frequency limit against all indices.
    """
    filtered_words = new_reference_words
    for index_id in indices_id:
        index = cache.load(index_directory, index_id)
        # Each cached index carries its own document count for normalisation.
        filtered_words = __frequent_term_filtering(filtered_words, index["index"], index["n_documents"], frequence_limit)
    return filtered_words
def fix_olympic_games_2(keywords_no_filter_id):
    """Normalise the "olympic games" category in a cached raw keyword setup.

    Renames the category key "olympic games" to "olympic_games" and drops any
    multi-word keyword from every category, persisting the repaired setup back
    to the raw keyword cache.

    Returns the (possibly repaired) keyword setup, or None when nothing is
    cached under *keywords_no_filter_id*.
    """
    if not cache.in_cache(__RAW_KEYWORD_CACHE, keywords_no_filter_id):
        # Bug fix: the original fell through with `keywords` unbound and the
        # final return raised NameError when the id was not cached.
        return None
    keywords = cache.load(__RAW_KEYWORD_CACHE, keywords_no_filter_id)
    for keyword_type in list(keywords.keys()):
        if "olympic games" in keywords[keyword_type]:
            print("Fix olympic game in keyword directory " + keywords_no_filter_id)
            keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
            # Multi-word keywords are artefacts of the same issue: drop them.
            for category in keywords[keyword_type]:
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
            # NOTE(review): write placement inferred from mangled source —
            # persisting only when a fix was applied; final cache content is
            # identical either way. Confirm against VCS history.
            cache.write(__RAW_KEYWORD_CACHE, keywords_no_filter_id, keywords)
    return keywords
def fix_olympic_games_1(keywords_id):
    """Normalise the "olympic games" category in a cached final keyword setup.

    Renames the category key "olympic games" to "olympic_games" and drops any
    multi-word keyword from every category of the reference and context words,
    persisting the repaired setup back to the "keywords" cache.

    Returns the (possibly repaired) keyword setup, or None when nothing is
    cached under *keywords_id*.
    """
    if not cache.in_cache("keywords", keywords_id):
        # Bug fix: the original fell through with `keywords` unbound and the
        # final return raised NameError when the id was not cached.
        return None
    keywords = cache.load("keywords", keywords_id)
    for keyword_type in ["reference_words", "context_words"]:
        if "olympic games" in keywords[keyword_type]:
            print("Fix olympic game in keyword directory " + keywords_id)
            keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
            # Multi-word keywords are artefacts of the same issue: drop them.
            for category in keywords[keyword_type]:
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
            # NOTE(review): write placement inferred from mangled source —
            # persisting only when a fix was applied; final cache content is
            # identical either way. Confirm against VCS history.
            cache.write("keywords", keywords_id, keywords)
    return keywords
def prepare_keywords(experiment_spec, keyword_cache_directory, index_cache_directory):
    """Ensure the keyword setup for *experiment_spec* exists in the keyword cache.

    Manual setups are copied straight from the specification store. Generated
    setups are loaded from the cache when present; otherwise a generation spec
    is assembled and the keywords are produced and cached.

    Returns None in all cases; the result is communicated via the cache.
    """
    keyword_spec = experiment_spec["keywords"]
    keyword_setup_id = keyword_spec["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])

    # Manually curated keyword setups are stored in the cache as-is.
    if keyword_spec["keyword_generate_algorithm"] == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keyword now stored in cache")
        return

    keyword_seed_id = keyword_spec["seed_id"]
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keyword stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    # Bug fix: shallow-copy the training dataset spec so the index fields
    # added below do not leak into the caller's experiment_spec (the original
    # mutated experiment_spec["training_dataset"] in place).
    training_data_spec = dict(experiment_spec["training_dataset"])
    training_data_spec["index_directory"] = index_cache_directory
    training_data_spec["index_id"] = get_all_index_indices(training_data_spec)

    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)

    create_new_keywords_spec = {
        "seed_id": keyword_spec["seed_id"],
        "training_dataset": training_data_spec,
        "given_reference_words": given_reference_words,
        "keyword_generate_algorithm": keyword_spec["keyword_generate_algorithm"],
        "parameters": keyword_spec["parameters"],
        "reference_word_filter": keyword_spec["reference_word_filter"],
    }
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
def __get_top_n_dice_neighbours(n, posting_lists, index_directory, indices_id, trainingdata_handler, training_data_filters, raw_keyword_id):
    """Return a map of the top-*n* dice neighbours for each group of reference
    words, merged across every index in *indices_id*.

    Per-index results are cached under a temporary id so an interrupted run
    can resume without recomputation.
    """
    def _index_cache_id(index_id):
        # DRY fix: the original built this id with two copy-pasted blocks.
        return "temporary_cached_keywords_" + raw_keyword_id + "_" + index_id

    # Phase 1: make sure every index's neighbours are computed and cached.
    for index_id in indices_id:
        cache_id = _index_cache_id(index_id)
        if not cache.in_cache(__RAW_KEYWORD_BY_INDEX_CACHE, cache_id):
            __calculate_top_n_dice_neighbours_for_index(cache_id, n, index_id, index_directory, posting_lists, trainingdata_handler, training_data_filters)
            print("All keyword for index calculated " + index_id)

    # Phase 2: merge the per-index neighbour lists into one structure.
    top_dice_neighbours = {
        category: {group_id: [] for group_id in groups}
        for category, groups in posting_lists.items()
    }
    for index_id in indices_id:
        by_index = cache.load(__RAW_KEYWORD_BY_INDEX_CACHE, _index_cache_id(index_id))
        for category in by_index:
            for group_id in by_index[category]:
                top_dice_neighbours[category][group_id] = top_dice_neighbours[category][group_id] + by_index[category][group_id]

    # Phase 3: sort by dice coefficient (descending) and truncate to n.
    for category in top_dice_neighbours:
        for group_id in top_dice_neighbours[category]:
            top_dice_neighbours[category][group_id].sort(key=itemgetter(1), reverse=True)
            top_dice_neighbours[category][group_id] = top_dice_neighbours[category][group_id][:n]
    return top_dice_neighbours
def get_seed_words_posting_lists(seed_words_spec, index_directory, indices_id):
    """Return a map of merged posting lists per (category, reference word group id).

    Every index in *indices_id* is loaded in turn and the postings of each
    reference word (converted to an index term) are merged into the group's
    running posting list.
    """
    posting_lists = {category: {} for category in seed_words_spec}
    for index_id in indices_id:
        index = cache.load(index_directory, index_id)
        print("merging postining_list for index:")
        print(index["index_type"])
        index = index["index"]
        for category, reference_word_groups in seed_words_spec.items():
            for group_id, words in reference_word_groups.items():
                # A group gets an (initially empty) posting list even when no
                # reference term is found in this index.
                group_postings = posting_lists[category].setdefault(group_id, [])
                for reference_word in words:
                    reference_term = n_gram_handler.string_to_index_term(reference_word)
                    if reference_term in index:
                        group_postings = index_operations.get_merged_posting_lists(group_postings, index[reference_term])
                posting_lists[category][group_id] = group_postings
    return posting_lists