def prepare_gold_standard_categorization(experiment_spec, gold_standard_categorization_directory):
    """Fetch the gold standard categorization for the test dataset and cache it."""
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    if cache.in_cache(gold_standard_categorization_directory, test_data_id):
        print("Gold standard categorization in cache: " + test_data_id)
        return
    test_dataset_handler = TestDatasetHandler(experiment_spec["test_dataset"]["id"])
    gold_standard_categorization = test_dataset_handler.get_gold_standard_categorization()
    pprint.pprint(gold_standard_categorization)
    cache.write(gold_standard_categorization_directory, test_data_id, gold_standard_categorization)


def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    """Compute term frequency distributions for the preprocessed test documents and cache them."""
    freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
    if cache.in_cache(freq_dists_cache_directory, freq_dist_map_id):
        print("FREQDISTS stored in cache: " + freq_dist_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    index_types = ["word", "bigram", "trigram"]
    freq_dist_map = document_vectorization.get_freq_dists_map(test_document_term_map, index_types)
    pprint.pprint(freq_dist_map)
    cache.write(freq_dists_cache_directory, freq_dist_map_id, freq_dist_map)


def prepare_tf_idf_vectors(experiment_spec, tf_idf_cache_directory, index_cache_directory):
    """Compute TF-IDF vectors for the test documents against the cached indexes and cache them."""
    tf_idf_vector_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
    if cache.in_cache(tf_idf_cache_directory, tf_idf_vector_map_id):
        print("TF_IDF_VECTORS stored in cache: " + tf_idf_vector_map_id)
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(test_data_id, preprocessing_filter_names)
    print("test data preprocessed")
    index_id_index_type_map = __get_index_id_index_type(experiment_spec["training_dataset"])
    index_types = ["word", "bigram", "trigram"]
    max_freq_map = index_factory.create_max_freq_term_by_index_types(test_document_term_map, index_types)
    print("max_freq_map calculated")
    tf_idf_vector_map = document_vectorization.get_docs_id_tf_idf_map(test_document_term_map, index_id_index_type_map, index_cache_directory, max_freq_map)
    pprint.pprint(tf_idf_vector_map)
    cache.write(tf_idf_cache_directory, tf_idf_vector_map_id, tf_idf_vector_map)
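

# A minimal driver sketch for the three preparation steps above. The spec layout
# mirrors the keys read in this module, but the dataset ids, filter names, and
# cache directories are illustrative assumptions, not values from the project;
# the "training_dataset" entry must also contain whatever __get_index_id_index_type
# expects for a real run.
if __name__ == "__main__":
    example_spec = {
        "test_dataset": {"id": "example_test_set"},
        "training_dataset": {"id": "example_training_set",
                             "filters": ["lowercase", "stopwords"]},
    }
    prepare_gold_standard_categorization(example_spec, "cache/gold_standard_categorization")
    prepare_freq_dists(example_spec, "cache/freq_dists")
    prepare_tf_idf_vectors(example_spec, "cache/tf_idf_vectors", "cache/index")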