def get_dice_keywords(keyword_specification):

    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Try to load the raw keywords from the cache; otherwise calculate and cache them
    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)

    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply the reference word filters, then strip the Dice scores from the results
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    index_spec = {"index_directory": index_directory, "index_id": indices_id}

    reference_words_filter = keyword_specification["reference_word_filter"]
    new_reference_words = keyword_filters.apply_reference_word_filters(reference_words_filter, given_reference_words, raw_reference_words, index_spec)

    new_reference_words = __remove_dice_coefficients(new_reference_words)
    new_context_words = __remove_dice_coefficients(raw_context_words)

    reference_words = __merge_new_and_given_reference_words(given_reference_words, new_reference_words)

    return {"reference_words": reference_words, "context_words": new_context_words}
def prepare_gold_standard_categorization(experiment_spec, gold_standard_categorization_directory):
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    if cache.in_cache(gold_standard_categorization_directory, test_data_id):
        print("Gold standard categorization in cache: " + test_data_id)
        return
    test_dataset_handler = TestDatasetHandler(experiment_spec["test_dataset"]["id"])
    gold_standard_categorization = test_dataset_handler.get_gold_standard_categorization()
    pprint.pprint(gold_standard_categorization)
    cache.write(gold_standard_categorization_directory, test_data_id, gold_standard_categorization)
def prepare_index(experiment_spec, index_cache_directory):
    # Ensure every index for the training dataset exists in the cache, creating any that are missing
    training_data_spec = experiment_spec["training_dataset"]
    index_specs = __get_all_index_specs(training_data_spec)
    for index_id, index_spec in index_specs.items():
        if not cache.in_cache(index_cache_directory, index_id):
            index = index_factory.create_index(index_spec)
            cache.write(index_cache_directory, index_id, index)
            print("Created index " + index_id)
        else:
            print("Index present in cache " + index_id)
def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
    if cache.in_cache(freq_dists_cache_directory, freq_dist_map_id):
        print("FREQDISTS stored in cache: " + freq_dist_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    index_types = ["word", "bigram", "trigram"]
    freq_dist_map = document_vectorization.get_freq_dists_map(test_document_term_map, index_types)
    pprint.pprint(freq_dist_map)
    cache.write(freq_dists_cache_directory, freq_dist_map_id, freq_dist_map)
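

def _example_freq_dists(tokens):
    # Minimal, self-contained sketch of per-document frequency distributions
    # for the three index types above ("word", "bigram", "trigram"), using only
    # the standard library. It makes no claim about document_vectorization's
    # actual internal representation; the n-gram-as-tuple encoding is assumed.
    from collections import Counter
    return {
        "word": Counter(tokens),
        "bigram": Counter(zip(tokens, tokens[1:])),
        "trigram": Counter(zip(tokens, tokens[1:], tokens[2:])),
    }
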
def fix_olympic_games_2(keywords_no_filter_id):
    # One-off repair of a cached raw keyword setup: rename the multiword
    # category key "olympic games" to "olympic_games" and drop keywords that
    # contain spaces, then write the fixed setup back to the cache.
    keywords = None
    if cache.in_cache(__RAW_KEYWORD_CACHE, keywords_no_filter_id):
        keywords = cache.load(__RAW_KEYWORD_CACHE, keywords_no_filter_id)
        for keyword_type in list(keywords.keys()):
            if "olympic games" in keywords[keyword_type]:
                print("Fixing 'olympic games' in keyword setup " + keywords_no_filter_id)
                keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
            for category in keywords[keyword_type]:
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
        cache.write(__RAW_KEYWORD_CACHE, keywords_no_filter_id, keywords)
    return keywords
def fix_olympic_games_1(keywords_id):
    # Same repair as fix_olympic_games_2, applied to the filtered "keywords" cache.
    keywords = None
    if cache.in_cache("keywords", keywords_id):
        keywords = cache.load("keywords", keywords_id)
        for keyword_type in ["reference_words", "context_words"]:
            if "olympic games" in keywords[keyword_type]:
                print("Fixing 'olympic games' in keyword setup " + keywords_id)
                keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
            for category in keywords[keyword_type]:
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
        cache.write("keywords", keywords_id, keywords)
    return keywords
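

def _example_olympic_games_fix():
    # Hypothetical before/after illustration of the repair performed by
    # fix_olympic_games_1 and fix_olympic_games_2: the multiword category key
    # "olympic games" is renamed to "olympic_games", and keywords containing
    # spaces (here the keyword "olympic games" itself) are dropped.
    before = {"reference_words": {"olympic games": {"0": ["olympic games", "medal"]}}}
    after = {"reference_words": {"olympic_games": {"0": ["medal"]}}}
    return before, after
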
def prepare_tf_idf_vectors(experiment_spec, tf_idf_cache_directory, index_cache_directory):
    tf_idf_vector_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
    if cache.in_cache(tf_idf_cache_directory, tf_idf_vector_map_id):
        print("TF_IDF_VECTORS stored in cache: " + tf_idf_vector_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    print("Test data preprocessed")
    index_id_index_type_map = __get_index_id_index_type(experiment_spec["training_dataset"])
    index_types = ["word", "bigram", "trigram"]
    max_freq_map = index_factory.create_max_freq_term_by_index_types(test_document_term_map, index_types)
    print("Max frequency map calculated")
    tf_idf_vector_map = document_vectorization.get_docs_id_tf_idf_map(test_document_term_map, index_id_index_type_map, index_cache_directory, max_freq_map)
    pprint.pprint(tf_idf_vector_map)
    cache.write(tf_idf_cache_directory, tf_idf_vector_map_id, tf_idf_vector_map)
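

def _example_tf_idf_weight(term_freq, max_freq, doc_count, docs_with_term):
    # Hedged sketch of one common augmented tf-idf weighting, normalizing term
    # frequency by the document's maximum term frequency the way max_freq_map
    # above suggests. The exact formula document_vectorization uses is not
    # shown in this module, so this particular variant is an assumption.
    import math
    tf = 0.5 + 0.5 * (term_freq / max_freq)
    idf = math.log(doc_count / (1 + docs_with_term))
    return tf * idf
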
def prepare_keywords(experiment_spec, keyword_cache_directory, index_cache_directory):
    keyword_spec = experiment_spec["keywords"]
    keyword_setup_id = keyword_spec["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])

    # Manual keyword setups are read from their specification and cached directly
    keyword_method = keyword_spec["keyword_generate_algorithm"]
    if keyword_method == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keywords now stored in cache")
        return

    keyword_seed_id = keyword_spec["seed_id"]
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keywords stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    create_new_keywords_spec = {}

    training_data_spec = experiment_spec["training_dataset"]
    create_new_keywords_spec["seed_id"] = keyword_spec["seed_id"]
    # Copy the training dataset spec so the index fields added below do not
    # mutate the caller's experiment_spec
    create_new_keywords_spec["training_dataset"] = dict(training_data_spec)
    create_new_keywords_spec["training_dataset"]["index_directory"] = index_cache_directory
    create_new_keywords_spec["training_dataset"]["index_id"] = get_all_index_indices(training_data_spec)

    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)

    create_new_keywords_spec["given_reference_words"] = given_reference_words
    create_new_keywords_spec["keyword_generate_algorithm"] = keyword_spec["keyword_generate_algorithm"]
    create_new_keywords_spec["parameters"] = keyword_spec["parameters"]
    create_new_keywords_spec["reference_word_filter"] = keyword_spec["reference_word_filter"]
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
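

def _example_prepare_experiment(experiment_spec):
    # Hedged usage sketch: one plausible order for running the prepare_* steps
    # above for a single experiment, with indexes built first so the tf-idf and
    # keyword steps can read them. All cache directory names are hypothetical
    # placeholders, not the project's real configuration.
    prepare_index(experiment_spec, "index_cache")
    prepare_gold_standard_categorization(experiment_spec, "gold_standard_cache")
    prepare_freq_dists(experiment_spec, "freq_dist_cache")
    prepare_tf_idf_vectors(experiment_spec, "tf_idf_cache", "index_cache")
    prepare_keywords(experiment_spec, "keyword_cache", "index_cache")
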
def __get_top_n_dice_neighbours(n, posting_lists, index_directory, indices_id, trainingdata_handler, training_data_filters, raw_keyword_id):
    # Return a map of the top n Dice neighbours for each group of reference words

    # Initialize the result structure: category -> reference word group -> empty list
    top_dice_neighbours = {}
    for category in posting_lists:
        top_dice_neighbours[category] = {}
        for id_reference_word_group in posting_lists[category]:
            top_dice_neighbours[category][id_reference_word_group] = []

    for index_id in indices_id:
        temporary_keyword_cache_id = "temporary_cached_keywords_" + raw_keyword_id + "_" + index_id
        if not cache.in_cache(__RAW_KEYWORD_BY_INDEX_CACHE, temporary_keyword_cache_id):
            __calculate_top_n_dice_neighbours_for_index(temporary_keyword_cache_id, n,
                                                        index_id, index_directory,
                                                        posting_lists, trainingdata_handler,
                                                        training_data_filters)
        print("All keywords for index calculated " + index_id)

    top_dice_neighbours_by_index = {}
    for index_id in indices_id:
        temporary_keyword_cache_id = "temporary_cached_keywords_" + raw_keyword_id + "_" + index_id
        top_dice_neighbours_by_index[index_id] = cache.load(__RAW_KEYWORD_BY_INDEX_CACHE, temporary_keyword_cache_id)

    # Merge the per-index neighbour lists into the combined structure
    for index_id in indices_id:
        present_top_dice_neighbours_by_index = top_dice_neighbours_by_index[index_id]
        for category in present_top_dice_neighbours_by_index:
            for id_reference_word_group in present_top_dice_neighbours_by_index[category]:
                top_dice_neighbours[category][id_reference_word_group] += present_top_dice_neighbours_by_index[category][id_reference_word_group]

    # Sort the neighbours by Dice coefficient (descending) and truncate each list to n entries
    for category in top_dice_neighbours:
        for id_reference_word_group in top_dice_neighbours[category]:
            top_dice_neighbours[category][id_reference_word_group].sort(key=itemgetter(1), reverse=True)
            top_dice_neighbours[category][id_reference_word_group] = top_dice_neighbours[category][id_reference_word_group][:n]
    return top_dice_neighbours
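

def _example_top_n_merge(per_index_neighbour_lists, n):
    # Minimal self-contained illustration of the merge-sort-truncate step in
    # __get_top_n_dice_neighbours: concatenate (word, dice_coefficient) pairs
    # from several indexes, sort by the coefficient, and keep the n best.
    # The (word, score) pair shape is inferred from the itemgetter(1) sort key.
    from operator import itemgetter
    merged = [pair for pairs in per_index_neighbour_lists for pair in pairs]
    merged.sort(key=itemgetter(1), reverse=True)
    return merged[:n]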