コード例 #1
0
    def test_get_no_reference_words_in_context_words(self):
        """Check the "no reference word in context words" keyword setup.

        The fixture lists "cat" both as a reference word and as a context
        word of the same category/group; the expected result keeps the
        reference words untouched and has "cat" removed from the context
        words.
        """
        ref_key = "reference_words"
        ctx_key = "context_words"
        group = "0"

        setup = {
            ref_key: {"cats": {group: ["cat", "cats"]}},
            ctx_key: {"cats": {group: ["cat", "mjao"]}},
        }
        expected = {
            ref_key: {"cats": {group: ["cat", "cats"]}},
            ctx_key: {"cats": {group: ["mjao"]}},
        }

        actual = keyword_setup_handler.get_no_reference_word_in_context_words_setup(setup)

        # Sort every word list in place so the comparison below does not
        # depend on the order in which the words come back.
        to_normalise = (
            (expected, ctx_key),
            (expected, ref_key),
            (actual, ref_key),
            (actual, ctx_key),
        )
        for mapping, key in to_normalise:
            mapping[key]["cats"][group].sort()

        self.assertEqual(expected, actual)
コード例 #2
0
def do_categorization(experiment_spec):
    """Compute, cache and print the categorization for one experiment.

    When the categorization cache already holds an entry for the
    experiment id, that entry is printed with ``print_to_json`` and
    nothing is recomputed.  Otherwise the keyword setup is loaded from the
    keyword cache and the configured categorization method is run; its
    result is written to the categorization cache and pretty-printed.

    Supported methods:

    * ``"grep"``    -- uses the "no reference word in context words"
      keyword setup together with the cached frequency distributions.
    * ``"cosinus"`` -- uses the "all reference word in context word"
      keyword setup together with the cached tf-idf map.

    Args:
        experiment_spec: Mapping that is read for the keys ``"id"``,
            ``"catgorization_method"`` (spelled exactly like that),
            ``"keywords"`` (which must contain ``"setup_id"``) and
            ``"training_dataset"``.  It is also handed as a whole to the
            ``document_vectorization`` id helpers.

    Returns:
        None.  Results are delivered through the cache and stdout.

    Raises:
        NotImplementedError: If the configured method is neither
            ``"grep"`` nor ``"cosinus"``.
    """
    experiment_id = experiment_spec["id"]

    # Fast path: a previous run already produced this categorization.
    if cache.in_cache(__CATEGORIZATIONS_CACHE, experiment_id):
        print("categorization stored in cache for exeperiment " + experiment_id)
        print_to_json(cache.load(__CATEGORIZATIONS_CACHE, experiment_id))
        return

    # NOTE: the spec key really is spelled "catgorization_method"; it is
    # part of the experiment spec format and must not be "corrected" here.
    categorization_method_name = experiment_spec["catgorization_method"]
    keyword_setup_id = experiment_spec["keywords"]["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(
        keyword_setup_id, experiment_spec["training_dataset"]
    )
    keywords = cache.load(__KEYWORD_DIRECTORY_CACHE, keyword_id)

    if categorization_method_name == "grep":
        keywords = keyword_setup_handler.get_no_reference_word_in_context_words_setup(keywords)
        reference_words = keywords["reference_words"]
        context_words = keywords["context_words"]
        freq_dist_map_id = document_vectorization.get_freq_dist_map_id(experiment_spec)
        freq_dists = cache.load(__FREQ_DIST_CACHE, freq_dist_map_id)
        categorization = text_categorizer.get_categorization(
            categorization_method_name, freq_dists, reference_words, context_words
        )
        cache.write(__CATEGORIZATIONS_CACHE, experiment_id, categorization)
        pprint.pprint(categorization)
        return

    if categorization_method_name == "cosinus":
        keywords = keyword_setup_handler.get_all_reference_word_in_context_word_setup(keywords)
        reference_words = keywords["reference_words"]
        context_words = keywords["context_words"]
        tf_idf_map_id = document_vectorization.get_tf_idf_map_id(experiment_spec)
        tf_idf_map = cache.load(__TF_IDF_DIRECTORY_CACHE, tf_idf_map_id)
        categorization = text_categorizer.get_cosinus_categorization(
            tf_idf_map, reference_words, context_words
        )
        pprint.pprint(categorization)
        cache.write(__CATEGORIZATIONS_CACHE, experiment_id, categorization)
        return

    # ``NotImplemented`` is the binary-operator sentinel and is not callable;
    # ``NotImplemented()`` would raise an unrelated TypeError.  The exception
    # class for "not supported" is NotImplementedError.
    raise NotImplementedError(
        "unsupported categorization method: {!r}".format(categorization_method_name)
    )