def get_dice_keywords(keyword_specification):
    """Return dice-based keywords for *keyword_specification*.

    Loads the unfiltered ("raw") keywords from cache when available,
    otherwise computes and caches them; then applies the configured
    reference-word filters, strips the dice coefficients, and merges
    the given reference words back in.

    Returns a dict with keys "reference_words" and "context_words".
    """
    dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Load the raw (unfiltered) keywords from cache, or compute and cache them.
    seed_id = keyword_specification["seed_id"]
    raw_id = keyword_setup_id_generator.get_no_filtered_keywords_id(seed_id, dataset_spec)
    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_id, raw_keywords)

    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply the reference-word filters and strip the dice scores.
    index_spec = {
        "index_directory": dataset_spec["index_directory"],
        "index_id": dataset_spec["index_id"],
    }
    word_filter = keyword_specification["reference_word_filter"]
    filtered_reference_words = keyword_filters.apply_reference_word_filters(
        word_filter, given_reference_words, raw_reference_words, index_spec)

    new_reference_words = __remove_dice_coefficients(filtered_reference_words)
    new_context_words = __remove_dice_coefficients(raw_context_words)
    reference_words = __merge_new_and_given_reference_words(
        given_reference_words, new_reference_words)

    return {"reference_words": reference_words, "context_words": new_context_words}
def __calculate_raw_dice_keywords(keyword_specification):
    """Compute the unfiltered dice keywords for *keyword_specification*.

    Builds the merged posting lists for the given reference words,
    caches them, finds the top dice neighbours, and separates them into
    reference and context words using the configured weight limit.

    Returns a dict with keys "raw_reference_words" and "raw_context_words".
    """
    # Unpack the specification.
    dataset_spec = keyword_specification["training_dataset"]
    index_directory = dataset_spec["index_directory"]
    indices_id = dataset_spec["index_id"]
    training_data_filters = dataset_spec["filters"]
    trainingdata_handler = TrainingDatasetHandler(dataset_spec["id"])
    given_reference_words = keyword_specification["given_reference_words"]
    parameters = keyword_specification["parameters"]
    weight_limit = float(parameters["weight_limit"])
    max_number_of_key_words = int(parameters["max_number_of_key_words"])
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(
        keyword_specification["seed_id"], dataset_spec)

    # Merged posting lists for the reference word groups, cached under the
    # same id as the raw keywords.
    posting_lists = posting_list_handler.get_seed_words_posting_lists(
        given_reference_words, index_directory, indices_id)
    cache.write("posting_lists", raw_keywords_id, posting_lists)
    print("Merged posting lists calculated")

    top_dice_neighbours = __get_top_n_dice_neighbours(
        max_number_of_key_words, posting_lists, index_directory, indices_id,
        trainingdata_handler, training_data_filters, raw_keywords_id)
    new_reference_words, new_context_words = __seperate_reference_and_context_words(
        top_dice_neighbours, weight_limit)

    return {"raw_reference_words": new_reference_words,
            "raw_context_words": new_context_words}