def get_top_distinguishing(input_file,
                           other_file_list,
                           data_dir,
                           output_file,
                           vocab_size=100):
    """Write the top-scoring distinguishing words of ``input_file``.

    Token counts for ``input_file`` are compared against the summed token
    counts of every entry in ``other_file_list``. The per-word scores from
    ``log_odds_normalized_diff`` are handed to ``wc.get_word_dict`` and the
    resulting vocabulary is written with ``utils.write_word_dict``.

    Args:
        input_file: Target corpus, passed to ``wc.get_word_count``.
        other_file_list: Iterable of background corpora whose counts are
            summed together.
        data_dir: Directory that holds (or receives) ``bigram_phrases.txt``.
        output_file: Destination passed to ``utils.write_word_dict``.
        vocab_size: Forwarded as ``top`` to ``wc.get_word_dict``.
    """
    bigram_file = "%s/bigram_phrases.txt" % data_dir
    # The bigram file is only generated when it does not exist yet; an
    # existing file in data_dir is reused as-is.
    if not os.path.exists(bigram_file):
        wc.find_bigrams(input_file, bigram_file)
    bigram_dict = wc.load_bigrams(bigram_file)

    word_cnts = wc.get_word_count(input_file,
                                  bigram_dict=bigram_dict,
                                  words_func=wc.get_mixed_tokens)

    # Accumulate the counts of all background files into one table.
    other_cnts = collections.defaultdict(int)
    for filename in other_file_list:
        tmp_cnts = wc.get_word_count(filename,
                                     bigram_dict=bigram_dict,
                                     words_func=wc.get_mixed_tokens)
        for w in tmp_cnts:
            other_cnts[w] += tmp_cnts[w]

    alphas = get_informative_alpha(word_cnts, other_cnts)
    word_score = log_odds_normalized_diff(word_cnts, other_cnts, alphas)
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    vocab_dict = wc.get_word_dict(word_score,
                                  top=vocab_size,
                                  filter_regex=r"\w\w+")
    utils.write_word_dict(vocab_dict, word_cnts, output_file)
Beispiel #2
0
def get_mallet_input_from_words(input_file, data_dir, vocab_size=10000):
    """Build the word-id dictionary and MALLET input file for ``input_file``.

    Writes ``data.word_id.dict`` and ``data.input`` into ``data_dir``, and
    creates ``bigram_phrases.txt`` there first if it does not exist yet.

    Args:
        input_file: Corpus passed to ``wc.get_word_count`` and
            ``convert_word_count_mallet``.
        data_dir: Directory for the bigram file and both output files.
        vocab_size: Forwarded as ``top`` to ``wc.get_word_dict``.
    """
    bigram_file = "%s/bigram_phrases.txt" % data_dir
    word_dict_file = "%s/data.word_id.dict" % data_dir
    mallet_input_file = "%s/data.input" % data_dir

    # Reuse an existing bigram file; only generate it when missing.
    if not os.path.exists(bigram_file):
        wc.find_bigrams(input_file, bigram_file)
    bigram_dict = wc.load_bigrams(bigram_file)

    word_cnts = wc.get_word_count(input_file,
                                  bigram_dict=bigram_dict,
                                  words_func=wc.get_mixed_tokens)
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    vocab_dict = wc.get_word_dict(word_cnts,
                                  top=vocab_size,
                                  filter_regex=r"\w\w+")
    utils.write_word_dict(vocab_dict, word_cnts, word_dict_file)

    # Tokenise with the same bigram dictionary that produced the counts.
    tokenize = functools.partial(wc.get_mixed_tokens,
                                 bigram_dict=bigram_dict)
    convert_word_count_mallet(vocab_dict,
                              input_file,
                              mallet_input_file,
                              words_func=tokenize)
def get_mallet_input_from_words(input_file, data_dir, vocab_size=10000):
    """Build the word-id dictionary and MALLET input file for ``input_file``.

    Creates ``bigram_phrases.txt`` in ``data_dir`` if it is missing, then
    writes ``data.word_id.dict`` and ``data.input`` there. If both output
    files already exist, the function prints a notice and returns without
    recomputing them.

    Args:
        input_file: Corpus passed to ``wc.get_word_count`` and
            ``convert_word_count_mallet``.
        data_dir: Directory for the bigram file and both output files.
        vocab_size: Forwarded as ``top`` to ``wc.get_word_dict``.
    """
    bigram_file = "%s/bigram_phrases.txt" % data_dir
    word_dict_file = "%s/data.word_id.dict" % data_dir
    mallet_input_file = "%s/data.input" % data_dir

    if not os.path.exists(bigram_file):
        wc.find_bigrams(input_file, bigram_file)
    else:
        print("get_mallet_input_from_words: bigram file found at: {}, skipping".format(bigram_file))

    # Both outputs present: nothing left to do.
    if os.path.exists(word_dict_file) and os.path.exists(mallet_input_file):
        print("get_mallet_input_from_words: both data.word_id.dict and data.input found, skipping")
        return

    bigram_dict = wc.load_bigrams(bigram_file)
    word_cnts = wc.get_word_count(input_file, bigram_dict=bigram_dict,
                                  words_func=wc.get_mixed_tokens)
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    vocab_dict = wc.get_word_dict(word_cnts,
                                  top=vocab_size,
                                  filter_regex=r"\w\w+")
    utils.write_word_dict(vocab_dict, word_cnts, word_dict_file)
    # Tokenise with the same bigram dictionary that produced the counts.
    convert_word_count_mallet(vocab_dict, input_file,
                              mallet_input_file,
                              words_func=functools.partial(
                                  wc.get_mixed_tokens,
                                  bigram_dict=bigram_dict))
Beispiel #4
0
 def test2(self):
     """A single word is expected to produce a count of 1."""
     counted = word_count.get_word_count('Word')
     self.assertEqual(counted, 1)
Beispiel #5
0
 def test1(self):
     """A four-word sentence is expected to produce a count of 4."""
     counted = word_count.get_word_count('This is a sentence')
     self.assertEqual(counted, 4)
Beispiel #6
0
 def test_three(self):
     """A string of spaces only is expected to produce a count of 0."""
     counted = word_count.get_word_count('     ')
     assert counted == 0
Beispiel #7
0
 def test_two(self):
     """A single word is expected to produce a count of 1."""
     counted = word_count.get_word_count('Word')
     assert counted == 1
Beispiel #8
0
 def test_one(self):
     """A four-word sentence is expected to produce a count of 4."""
     counted = word_count.get_word_count('This is a sentence')
     assert counted == 4
Beispiel #9
0
 def test3(self):
     """A string of spaces only is expected to produce a count of 0."""
     counted = word_count.get_word_count('     ')
     self.assertEqual(counted, 0)