def prepare_freq_dists(experiment_spec, freq_dists_cache_directory):
    """Build the frequency-distribution map for an experiment and cache it.

    Skips all work if an entry for this experiment's freq-dist map id is
    already present in the cache directory. Otherwise tokenizes the test
    documents (applying the spec's training-dataset filters), computes
    word/bigram/trigram frequency distributions per document, and writes
    the resulting map to the cache.

    Args:
        experiment_spec: experiment configuration dict; must contain
            ``["training_dataset"]["filters"]`` — TODO confirm full schema.
        freq_dists_cache_directory: directory used by the ``cache`` helper.

    Returns:
        None. The result is persisted via ``cache.write`` as a side effect.
    """
    freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
    if cache.in_cache(freq_dists_cache_directory, freq_dist_map_id):
        print(f"FREQDISTS stored in cache: {freq_dist_map_id}")
        return
    test_data_id = dataset_id_handler.get_test_data_id(experiment_spec)
    preprocessing_filter_names = experiment_spec["training_dataset"]["filters"]
    test_document_term_map = document_vectorization.get_test_document_term_map(
        test_data_id, preprocessing_filter_names
    )
    index_types = ["word", "bigram", "trigram"]
    freq_dist_map = document_vectorization.get_freq_dists_map(
        test_document_term_map, index_types
    )
    # NOTE(review): removed leftover debug pprint of the whole map — it could
    # flood stdout with one entry per document on every cache miss.
    cache.write(freq_dists_cache_directory, freq_dist_map_id, freq_dist_map)
 def test_get_freq_dists_map(self):
     """get_freq_dists_map yields word/bigram/trigram counts per document."""
     documents = {1: "test document test dogs", 2: "test document test test cats"}
     terms_by_doc = {doc_id: nltk.word_tokenize(text) for doc_id, text in documents.items()}
     result = document_vectorization.get_freq_dists_map(terms_by_doc, ["word", "bigram", "trigram"])
     # Spot-check one sub-map before comparing against the full expected structure.
     self.assertEqual(result[2]["word"], {"test": 3, "document": 1, "cats": 1})
     expected = {
         1: {
             "bigram": {"document_test": 1, "test_document": 1, "test_dogs": 1},
             "trigram": {"document_test_dogs": 1, "test_document_test": 1},
             "word": {"document": 1, "dogs": 1, "test": 2},
         },
         2: {
             "bigram": {"document_test": 1, "test_cats": 1, "test_document": 1, "test_test": 1},
             "trigram": {"document_test_test": 1, "test_document_test": 1, "test_test_cats": 1},
             "word": {"cats": 1, "document": 1, "test": 3},
         },
     }
     self.assertEqual(result, expected)