def __calculate_top_n_dice_neighbours_for_index(temporary_keyword_cache_id,
                                                n, index_id, index_directory,
                                                posting_lists, trainingdata_handler,
                                                training_data_filters):
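    """Compute the top n dice neighbours for every reference word group in
    every category of posting_lists, using the cached index identified by
    index_id, and store the result in the raw keyword-by-index cache under
    temporary_keyword_cache_id."""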
    # Pre-build the result structure: one empty list per reference word group and category.
    top_dice_neighbours = {category: {group_id: [] for group_id in posting_lists[category]}
                           for category in posting_lists}
    index = cache.load(index_directory, index_id)
    print("Index id " + index_id)
    n_categories_left = len(posting_lists)
    for category in posting_lists:
        posting_lists_category = posting_lists[category]
        # TODO: remove these debug prints
        print("Calculating keywords for category " + category)
        n_posts = sum(len(posting_list) for posting_list in posting_lists_category.values())
        print("Number of posts for category: " + str(n_posts))
        print("Number of categories left: " + str(n_categories_left))
        n_categories_left -= 1
        for id_reference_word_group in posting_lists_category:
            posting_list_reference_word_group = posting_lists_category[id_reference_word_group]
            all_dice_neighbours = __get_all_dice_neighbours(posting_list_reference_word_group, index, trainingdata_handler, training_data_filters)
            top_dice_neighbours[category][id_reference_word_group] = all_dice_neighbours[:n]
    cache.write(__RAW_KEYWORD_BY_INDEX_CACHE, temporary_keyword_cache_id, top_dice_neighbours)
def get_dice_keywords(keyword_specification):
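    """Return dice-based keywords for the given specification as
    {"reference_words": ..., "context_words": ...}, loading the raw keywords
    from cache when available and calculating them otherwise, then applying
    the reference word filters and stripping the dice scores."""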

    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Try to load the raw keywords from cache; otherwise calculate them.
    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)

    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply the reference word filters and strip the dice scores.
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    index_spec = {"index_directory": index_directory, "index_id": indices_id}

    reference_words_filter = keyword_specification["reference_word_filter"]
    new_reference_words = keyword_filters.apply_reference_word_filters(reference_words_filter, given_reference_words, raw_reference_words, index_spec)

    new_reference_words = __remove_dice_coefficients(new_reference_words)
    new_context_words = __remove_dice_coefficients(raw_context_words)

    reference_words = __merge_new_and_given_reference_words(given_reference_words, new_reference_words)

    return {"reference_words": reference_words, "context_words": new_context_words}
def prepare_gold_standard_categorization(experiment_spec, gold_standard_categorization_directory):
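    """Ensure the gold standard categorization for the experiment's test
    dataset is stored in gold_standard_categorization_directory, creating
    it from the test dataset handler if necessary."""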
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    if cache.in_cache(gold_standard_categorization_directory, test_data_id):
        print("Gold standard categorization in cache: " + test_data_id)
        return
    test_dataset_handler = TestDatasetHandler(experiment_spec["test_dataset"]["id"])
    gold_standard_categorization = test_dataset_handler.get_gold_standard_categorization()
    pprint.pprint(gold_standard_categorization)
    cache.write(gold_standard_categorization_directory, test_data_id, gold_standard_categorization)
def prepare_index(experiment_spec, index_cache_directory):
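    """Ensure every index required by the experiment's training dataset exists
    in index_cache_directory, creating any that are missing."""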
    # Check that all indexes exist in the cache; create any that are missing.
    training_data_spec = experiment_spec["training_dataset"]
    index_specs = __get_all_index_specs(training_data_spec)
    for index_id in index_specs:
        index_spec = index_specs[index_id]
        if not cache.in_cache(index_cache_directory, index_id):
            index = index_factory.create_index(index_spec)
            cache.write(index_cache_directory, index_id, index)
            print("Created index " + index_id)
        else:
            print("Index present in cache " + index_id)
def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
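    """Ensure the frequency distribution map for the experiment's test
    documents (word, bigram and trigram index types) is stored in
    freq_dists_cache_directory."""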
    freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
    if cache.in_cache(freq_dists_cache_directory, freq_dist_map_id):
        print("FREQDISTS stored in cache: " + freq_dist_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    index_types = ["word", "bigram", "trigram"]
    freq_dist_map = document_vectorization.get_freq_dists_map(test_document_term_map,index_types)
    pprint.pprint(freq_dist_map)
    cache.write(freq_dists_cache_directory, freq_dist_map_id, freq_dist_map)
def fix_olympic_games_2(keywords_no_filter_id):
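    """Rename the raw keyword category "olympic games" to "olympic_games" and
    drop keywords containing spaces, rewriting the cached entry in place.
    Returns None if keywords_no_filter_id is not in the cache."""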
    keywords = None
    if cache.in_cache(__RAW_KEYWORD_CACHE, keywords_no_filter_id):
        keywords = cache.load(__RAW_KEYWORD_CACHE, keywords_no_filter_id)
        keyword_types = list(keywords.keys())
        for keyword_type in keyword_types:
            if "olympic games" in keywords[keyword_type]:
                print("Fixing 'olympic games' in keyword directory " + keywords_no_filter_id)
                olympic_keywords = keywords[keyword_type].pop("olympic games")
                keywords[keyword_type]["olympic_games"] = olympic_keywords
            for category in keywords[keyword_type]:
                # Drop keywords that contain spaces.
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
        cache.write(__RAW_KEYWORD_CACHE, keywords_no_filter_id, keywords)
    return keywords
def fix_olympic_games_1(keywords_id):
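    """Same fix as fix_olympic_games_2, applied to the filtered "keywords"
    cache: rename the category "olympic games" to "olympic_games" and drop
    keywords containing spaces. Returns None if keywords_id is not in the
    cache."""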
    keywords = None
    if cache.in_cache("keywords", keywords_id):
        keywords = cache.load("keywords", keywords_id)
        keyword_types = ["reference_words","context_words"]
        for keyword_type in keyword_types:
            if "olympic games" in keywords[keyword_type]:
                print("Fixing 'olympic games' in keyword directory " + keywords_id)
                olympic_keywords = keywords[keyword_type].pop("olympic games")
                keywords[keyword_type]["olympic_games"] = olympic_keywords
            for category in keywords[keyword_type]:
                # Drop keywords that contain spaces.
                keywords[keyword_type][category]['0'] = [keyword for keyword in keywords[keyword_type][category]['0'] if " " not in keyword]
        cache.write("keywords", keywords_id, keywords)
    return keywords
def prepare_tf_idf_vectors(experiment_spec, tf_idf_cache_directory, index_cache_directory):
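    """Ensure the tf-idf vector map for the experiment's test documents is
    stored in tf_idf_cache_directory, computing it from the cached indexes
    if necessary."""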
    tf_idf_vector_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
    if cache.in_cache(tf_idf_cache_directory, tf_idf_vector_map_id):
        print("TF_IDF_VECTORS stored in cache: " + tf_idf_vector_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    print("Test data preprocessed")
    index_id_index_type_map = __get_index_id_index_type(experiment_spec["training_dataset"])
    index_types = ["word", "bigram", "trigram"]
    max_freq_map = index_factory.create_max_freq_term_by_index_types(test_document_term_map, index_types)
    print("max_freq_map calculated")
    tf_idf_vector_map = document_vectorization.get_docs_id_tf_idf_map(test_document_term_map, index_id_index_type_map, index_cache_directory, max_freq_map)
    pprint.pprint(tf_idf_vector_map)
    cache.write(tf_idf_cache_directory, tf_idf_vector_map_id, tf_idf_vector_map)
def prepare_keywords(experiment_spec,keyword_cache_directory, index_cache_directory):
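    """Ensure the keyword setup for the experiment is stored in
    keyword_cache_directory: manual setups are loaded from their
    specification, other setups are generated via keyword_factory."""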
    keyword_spec = experiment_spec["keywords"]
    # If keywords are specified manually, load the setup and store it in the cache.
    keyword_method = keyword_spec["keyword_generate_algorithm"]
    if keyword_method == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_spec["setup_id"])
        keyword_setup_id = keyword_spec["setup_id"]
        keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keywords now stored in cache")
        return
    keyword_setup_id = keyword_spec["setup_id"]
    keyword_seed_id = keyword_spec["seed_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(keyword_setup_id, experiment_spec["training_dataset"])
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keyword stored in cache: " + keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        # keywords = keyword_factory.check_for_constructed_keyword_setup(keyword_id)
        # print(keywords)
        return

    create_new_keywords_spec = {}

    training_data_spec = experiment_spec["training_dataset"]
    create_new_keywords_spec["seed_id"] = keyword_spec["seed_id"]
    create_new_keywords_spec["training_dataset"] = training_data_spec
    create_new_keywords_spec["training_dataset"]["index_directory"] = index_cache_directory
    create_new_keywords_spec["training_dataset"]["index_id"] = get_all_index_indices(training_data_spec)

    given_reference_words = specification_handler.get_specification(__KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)

    create_new_keywords_spec["given_reference_words"] = given_reference_words
    create_new_keywords_spec["keyword_generate_algorithm"] = keyword_spec["keyword_generate_algorithm"]
    create_new_keywords_spec["parameters"] = keyword_spec["parameters"]
    create_new_keywords_spec["reference_word_filter"] = keyword_spec["reference_word_filter"]
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
def __calculate_raw_dice_keywords(keyword_specification):
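    """Calculate unfiltered dice-based keywords for the specification: build
    posting lists for the seed word groups, find their top dice neighbours,
    and split the neighbours into reference and context words by
    weight_limit."""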
    # Load parameters from the specification.
    training_dataset_spec = keyword_specification["training_dataset"]
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    training_data_filters = training_dataset_spec["filters"]

    trainingdata_handler = TrainingDatasetHandler(training_dataset_spec["id"])

    given_reference_words = keyword_specification["given_reference_words"]
    weight_limit = float(keyword_specification["parameters"]["weight_limit"])
    max_number_of_key_words = int(keyword_specification["parameters"]["max_number_of_key_words"])

    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    # Calculate keywords.

    # Calculate posting lists for the reference word groups.
    posting_lists = posting_list_handler.get_seed_words_posting_lists(given_reference_words, index_directory, indices_id)
    posting_lists_id = raw_keywords_id
    cache.write("posting_lists", posting_lists_id, posting_lists)
    print("Merged posting lists calculated")

    top_dice_neighbours = __get_top_n_dice_neighbours(max_number_of_key_words,
                                                      posting_lists,
                                                      index_directory,
                                                      indices_id,
                                                      trainingdata_handler,
                                                      training_data_filters,
                                                      raw_keywords_id)

    new_reference_words, new_context_words = __seperate_reference_and_context_words(top_dice_neighbours, weight_limit)

    return {"raw_reference_words": new_reference_words, "raw_context_words": new_context_words}