def get_dice_keywords(keyword_specification):
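    """Return Dice-based keywords for the given keyword specification.

    Raw (unfiltered) keywords are loaded from the cache when available and
    calculated otherwise. Reference-word filters are then applied, the Dice
    coefficients are stripped, and the newly found reference words are merged
    with the given ones. Returns a dict with "reference_words" and
    "context_words".
    """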

    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]

    # Try to load the raw keywords from the cache; otherwise calculate and cache them
    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)

    raw_reference_words = raw_keywords["raw_reference_words"]
    raw_context_words = raw_keywords["raw_context_words"]

    # Apply the reference-word filters and strip the Dice scores
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    index_spec = {"index_directory": index_directory, "index_id": indices_id}

    reference_words_filter = keyword_specification["reference_word_filter"]
    new_reference_words = keyword_filters.apply_reference_word_filters(reference_words_filter, given_reference_words, raw_reference_words, index_spec)

    new_reference_words = __remove_dice_coefficients(new_reference_words)
    new_context_words = __remove_dice_coefficients(raw_context_words)

    reference_words = __merge_new_and_given_reference_words(given_reference_words, new_reference_words)

    return {"reference_words": reference_words, "context_words": new_context_words}


def __calculate_raw_dice_keywords(keyword_specification):
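    """Calculate the raw (unfiltered) Dice keywords for the given specification.

    Builds merged posting lists for the given reference words, finds their
    top-N Dice neighbours, and separates the neighbours into reference and
    context words using the configured weight limit.
    """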
    # Load parameters from the keyword specification
    training_dataset_spec = keyword_specification["training_dataset"]
    index_directory = training_dataset_spec["index_directory"]
    indices_id = training_dataset_spec["index_id"]
    training_data_filters = training_dataset_spec["filters"]

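    # Handler for the training dataset; passed through to the Dice neighbour computation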
    trainingdata_handler = TrainingDatasetHandler(training_dataset_spec["id"])

    given_reference_words = keyword_specification["given_reference_words"]
    weight_limit = float(keyword_specification["parameters"]["weight_limit"])
    max_number_of_key_words = int(keyword_specification["parameters"]["max_number_of_key_words"])

    keyword_seed_id = keyword_specification["seed_id"]
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(keyword_seed_id, training_dataset_spec)

    # Calculate keywords

    # Calculate merged posting lists for the given reference word groups
    posting_lists = posting_list_handler.get_seed_words_posting_lists(given_reference_words, index_directory, indices_id)
    posting_lists_id = raw_keywords_id
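    # Cache the merged posting lists under the same id as the raw keywords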
    cache.write("posting_lists",posting_lists_id,posting_lists)
    print("Merged posting lists calculated")

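    # Find the top-N Dice neighbours of the merged posting lists; the training-data
    # handler and filters are passed through to the neighbour computation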
    top_dice_neighbours = __get_top_n_dice_neighbours(
        max_number_of_key_words,
        posting_lists,
        index_directory,
        indices_id,
        trainingdata_handler,
        training_data_filters,
        raw_keywords_id)

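    # Split the neighbours into reference and context words using the configured weight limit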
    new_reference_words, new_context_words = __seperate_reference_and_context_words(top_dice_neighbours, weight_limit)

    return {"raw_reference_words": new_reference_words, "raw_context_words":new_context_words}