def test_get_tftf(self):
    """Exercise the TFTF pipeline end-to-end on POS-tagged fixture data.

    Builds n-gram strings from the tagged training fixture, fits a TFTF
    instance on them, then runs a small hand-written test corpus through
    the same tokenize -> POS-tag -> n-gram path and prints each stage.
    """
    tagged = self.test_pos_tag()
    train_grams = ngram(tagged)
    train_docs = [' '.join(g) for g in train_grams]
    print(train_docs)

    tftf = TFTF(train_docs)
    print(tftf.get_word())
    print(tftf.get_train_tfTF())
    print(tftf.get_train_TF())

    test = [
        'first first document',
        'This is the first document.',
        'second Is one'
    ]
    test = tokenizer(test)
    test = pos_tagging(test)
    test = ngram(test)
    test = [' '.join(g) for g in test]
    print(test)
    print(tftf.get_test_tfTF(test))
    print(tftf.get_train_TF())
def bag_of_words_train(train_data):
    """Build unigram (bag-of-words) tf-TF training features.

    input: tokens

    Returns the (train_tfTF, TF, tf_vocab) triple produced by tfTF_train.
    """
    print("bag_of_words_train")
    unigrams = ngram(train_data, 1)
    docs = [' '.join(tokens) for tokens in unigrams]
    # tfTF_train already returns (train_tfTF, TF, tf_vocab); forward as-is.
    return tfTF_train(docs, word_ngram=True)
def word_trigram_train(train_data):
    """Build word-trigram tf-TF training features.

    input: tokens

    Returns the (train_tfTF, TF, tf_vocab) triple produced by tfTF_train.
    """
    print("word_trigram_train")
    trigrams = ngram(train_data, 3)
    docs = [' '.join(tokens) for tokens in trigrams]
    # tfTF_train already returns (train_tfTF, TF, tf_vocab); forward as-is.
    return tfTF_train(docs, word_ngram=True, gram_num=3)
def bag_of_words_test(test_data, TF, tf_vocab):
    """Build unigram (bag-of-words) tf-TF features for test data.

    input: tokens (already tokenized)

    Args:
        test_data: tokenized documents.
        TF: term-frequency statistics fitted during training; required.
        tf_vocab: vocabulary fitted during training; required.

    Returns:
        The test tf-TF matrix produced by tfTF_test.

    Raises:
        ValueError: if TF or tf_vocab is None.
    """
    print("bag_of_words_test")
    # Explicit raises instead of `assert`: asserts are stripped under -O,
    # which would silently skip this validation at test time.
    if TF is None:
        raise ValueError("TF must not be None at test time")
    if tf_vocab is None:
        raise ValueError("tf_vocab must not be None at test time")
    gramed_data = ngram(test_data, 1)
    join_data = [' '.join(d) for d in gramed_data]
    return tfTF_test(join_data, TF, tf_vocab, word_ngram=True)
def pos_tagger2(tagged_data, label):
    """Count occurrences of *label* among each document's POS 2-grams.

    input: tokens

    Returns:
        A numpy column vector (n_documents, 1) of per-document counts.
    """
    print("pos_tagger2")
    bigrams = ngram(tagged_data, n=2, join_char='_')
    counts = [sum(1 for g in doc if g == label) for doc in bigrams]
    return np.array(counts).reshape(-1, 1)
def pos_gram_test(tagged_data, TF, tf_vocab, gram):
    """Build POS n-gram tf-TF features for test data.

    input: tokens (already tokenized)

    Args:
        tagged_data: POS-tagged documents.
        TF: term-frequency statistics fitted during training; required.
        tf_vocab: vocabulary fitted during training; required.
        gram: n-gram size to build.

    Returns:
        The test tf-TF matrix produced by tfTF_test.

    Raises:
        ValueError: if TF or tf_vocab is None.
    """
    # Fixed: trace previously printed "pos_bigram_test", which misidentified
    # this function (it handles arbitrary n, not just bigrams).
    print("pos_gram_test")
    # Explicit raises instead of `assert`: asserts are stripped under -O.
    if TF is None:
        raise ValueError("TF must not be None at test time")
    if tf_vocab is None:
        raise ValueError("tf_vocab must not be None at test time")
    # Build the n-grams, then compute tf-TF on the joined gram strings.
    gramed_data = ngram(tagged_data, gram)
    join_data = [' '.join(d) for d in gramed_data]
    return tfTF_test(join_data, TF, tf_vocab, word_ngram=False)
def pos_gram_train(tagged_data, gram):
    """Build POS n-gram tf-TF training features.

    input: tokens

    Args:
        tagged_data: POS-tagged documents.
        gram: n-gram size to build.

    Returns:
        The (train_tfTF, TF, tf_vocab) triple produced by tfTF_train.
    """
    # Fixed: trace previously printed "pos_bigram_train", which misidentified
    # this function (it handles arbitrary n, not just bigrams).
    print("pos_gram_train")
    # Build the n-grams, then compute tf-TF on the joined gram strings.
    gramed_data = ngram(tagged_data, gram)
    join_data = [' '.join(d) for d in gramed_data]
    return tfTF_train(join_data, word_ngram=False, gram_num=gram)
def good_pos_ngrams(tagged_data, gram=2):
    """Count per-document POS n-grams that do / do not appear in a whitelist
    of "good" POS sequences.

    input: tokens (POS-tagged)

    Args:
        tagged_data: POS-tagged documents.
        gram: n-gram size to build (default 2).

    Returns:
        A pair of numpy column vectors (n_documents, 1): counts of
        whitelisted grams and counts of non-whitelisted grams.
    """
    print("good_pos_ngrams")
    if os.path.isfile(NGRAM_PATH):
        # Fixed: the file handle from pickle.load(open(...)) was never closed;
        # use a context manager so it is released deterministically.
        # NOTE(review): unpickling an on-disk file is unsafe for untrusted
        # input -- confirm NGRAM_PATH is project-controlled.
        with open(NGRAM_PATH, 'rb') as f:
            good_grams = set(pickle.load(f))
    else:
        # Set membership is O(1) per lookup in the loop below.
        good_grams = {
            'NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT',
            'PRP . DT NNP', '. DT', '. DT NNP', '. DT NNP NNP', 'DT NNP',
            'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP', 'NNP NNP NNP',
            'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
            'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP',
            '. TO NNP NNP', 'TO NNP', 'TO NNP NNP'
        }
    gramed_data = ngram(tagged_data, gram, join_char=' ')
    correct_result = []
    uncorrect_result = []
    for essay in gramed_data:
        correct = 0
        uncorrect = 0
        # Fixed: loop variable renamed from `gram`, which shadowed the
        # function's `gram` parameter.
        for g in essay:
            if g in good_grams:
                correct += 1
            else:
                uncorrect += 1
        correct_result.append(correct)
        uncorrect_result.append(uncorrect)
    return np.array(correct_result).reshape(
        -1, 1), np.array(uncorrect_result).reshape(-1, 1)
def test_bigram(self):
    """Return the n-grams built from the POS-tagged fixture data."""
    return ngram(self.test_pos_tag())