def compare_dictionary(model_output_folder):
    # f = open(join(dirname(__file__), "logs", "crf", "new_word.txt"), "w")
    # f1 = open(join(dirname(__file__), "logs", "crf", "word_in_dictionary.txt"), "w")
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)
    new_words = []
    words = []
    # collect every multi-syllable token (syllables joined with "_") from the model output
    for document in corpus.documents:
        for sentence in document.sentences:
            for word in sentence.split(" "):
                if '_' in word:
                    new_words.append(word)
    dictionary = viet_dict_11K.words
    for word in new_words:
        words.append(word.replace('_', ' '))
    new_word = [x for x in words if x not in dictionary]
    new_word = sorted(set(new_word))
    # percentage of segmented words that the 11K dictionary does not cover
    new_word_per_dict = float(len(new_word)) / float(len(dictionary)) * 100
    # f.write("Scale word not in dictionary %0.2f: \n" % new_word_per_dict)
    # for word in new_word:
    #     f.write(word.encode('utf-8') + "\n")
    word_in_dictionary = [x for x in words if x in dictionary]
    word_in_dictionary = sorted(set(word_in_dictionary))
    word_in_dictionary_per_total = float(len(word_in_dictionary)) / float(len(dictionary))
    # f1.write("scale word in dictionary: %0.2f \n" % word_in_dictionary_per_total)
    # for word in word_in_dictionary:
    #     f1.write(word.encode('utf-8') + "\n")
    return new_word, word_in_dictionary
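# A minimal usage sketch for compare_dictionary (the "output_crf" folder below
# is illustrative, mirroring the layout used elsewhere in this repo; adjust to
# the real output_<model> directory):
if __name__ == "__main__":
    model_output = join(dirname(dirname(dirname(__file__))), "data",
                        "corpus", "test", "output_crf")
    new_words, known_words = compare_dictionary(model_output)
    print("%d segmented words are missing from viet_dict_11K" % len(new_words))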
def find_punctuation(folder, file_name):
    f_write = open(file_name, "w")
    corpus = PlainTextCorpus()
    corpus.load(folder)
    # Collect every non-alphanumeric, non-space character as a punctuation
    # candidate. The original body never populated the list; this loop is the
    # assumed intent.
    punctuations = set()
    for document in corpus.documents:
        for sentence in document.sentences:
            for character in sentence:
                if not character.isalnum() and not character.isspace():
                    punctuations.add(character)
    punctuations = sorted(punctuations)
    for punctuation in punctuations:
        f_write.write(punctuation.encode("utf-8") + "\n")
    f_write.close()
    return punctuations
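# A minimal usage sketch (hypothetical paths; find_punctuation writes one
# punctuation character per line to the given file and returns the sorted list):
if __name__ == "__main__":
    marks = find_punctuation(join(dirname(__file__), "data"),
                             join(dirname(__file__), "punctuations.txt"))
    print("Found %d punctuation characters" % len(marks))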
def get_data():
    output_folder = join(dirname(dirname(dirname(__file__))), "data",
                         "corpus_2", "test", "output")
    model_output_folder = join(dirname(dirname(dirname(__file__))), "data",
                               "corpus_2", "test", "output_%s" % model_name)
    expected_corpus = PlainTextCorpus()
    expected_corpus.load(output_folder)
    actual_corpus = PlainTextCorpus()
    actual_corpus.load(model_output_folder)
    return expected_corpus, actual_corpus
def segment_words(corpus_dir, target_dir):
    try:
        mkdir(target_dir)
    except OSError:
        # target_dir already exists
        pass
    corpus = PlainTextCorpus()
    corpus.load(corpus_dir)
    existed_documents = listdir(target_dir)
    n_ignore = 0
    for document in corpus.documents:
        document_id = document.id
        if document_id not in existed_documents:
            print("Process %s" % document_id)
            sentences = document.sentences
            sentences = _.flatten(sentences)
            sentences = [tokenize(s).split(" . ") for s in sentences]
            sentences = _.flatten(sentences)
            segmented_sentences = [word_sent(s) for s in sentences
                                   if s not in [u""]]
            content = convert_to_text(segmented_sentences)
            filepath = join(target_dir, document.id)
            io.open(filepath, "w", encoding="utf8").write(content)
        else:
            n_ignore += 1
    print("Ignore %s documents" % n_ignore)
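# A minimal usage sketch (hypothetical directories; segment_words skips any
# document whose id already exists in target_dir, so it is safe to re-run
# after an interrupted pass):
if __name__ == "__main__":
    raw_dir = join(dirname(__file__), "data", "raw")
    segmented_dir = join(dirname(__file__), "data", "segmented")
    segment_words(raw_dir, segmented_dir)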
def test_save(self):
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    corpus.save(self.saved_plaintext_folder)
    files = listdir(self.saved_plaintext_folder)
    self.assertEqual(4, len(files))
    try:
        shutil.rmtree(self.saved_plaintext_folder)
    except Exception:
        pass
"vi_ud_no_comment.conllu") dev_file = join(dirname(dirname(dirname(__file__))), "data", "ud", "dev.conllu") test_file = join(dirname(dirname(dirname(__file__))), "data", "ud", "test.conllu") # sample_ud_file = join("sample", "sample_vi_ud.conllu") # ud_file = sample_ud_file def extract_tokens(text): matched = re.match("(.*)/(.*)", text, re.UNICODE) return [matched.group(1), matched.group(2)] if __name__ == '__main__': corpus = PlainTextCorpus() corpus.load(raw_folder) tagged_corpus = TaggedCorpus() tagged_documents = [] documents = corpus.documents # documents = random.sample(documents, 10) for document in documents: sentences = [] for sentence in document.sentences: tagged_tokens = sentence.split() tagged_words = [extract_tokens(token) for token in tagged_tokens] tagged_words = [ TaggedWord(tagged_word[0].replace("_", " "), tagged_word[1]) for tagged_word in tagged_words ] sentence = TaggedSentence(tagged_words)
from collections import Counter

import pandas as pd

from analyze_characters import get_utf8_number, get_unicode_number
from underthesea.corpus import PlainTextCorpus
from vietnamese_normalize import vietnamese_normalize

corpus = PlainTextCorpus()
# corpus_dir = "D:\\PycharmProjects\\underthesea\\corpus.vinews\\vn_news\\data"
# only the last corpus_dir assignment takes effect; earlier paths kept for reference
corpus_dir = "D:\\PycharmProjects\\_NLP_DATA\\vlsp 2016\\sa\\SA2016-training_data"
corpus_dir = "D:\\PycharmProjects\\1link\\opinion_mining\\data\\foody"
corpus.load(corpus_dir)
sentences = sum([d.sentences for d in corpus.documents], [])
text = u" ".join(sentences[:200])
text = vietnamese_normalize(text)
counter = Counter(text)
df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
df.columns = ["character", "freq"]
df["unicode"] = df.apply(lambda row: get_unicode_number(row["character"]), axis=1)
df["utf-8"] = df.apply(lambda row: get_utf8_number(row["character"]), axis=1)
df = df.sort_values(["freq"], ascending=False)
df.to_excel("analyze.xlsx", index=False)
corpus_character_sets = set(df["character"])


def load_known_characters():
    files = ["tcvn_6909_2001.xlsx", "other_characters.xlsx"]
# -*- coding: utf-8 -*-
from os.path import dirname
from os.path import join
import time

from model import CRFModel
from underthesea.corpus import PlainTextCorpus

start = time.time()
input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus",
                    "test", "input")
output_crf_folder = join(dirname(dirname(dirname(__file__))), "data",
                         "corpus", "test", "output_crf")
# input_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "input")
# output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "output")
corpus = PlainTextCorpus()
corpus.load(input_folder)
output = PlainTextCorpus()
model = CRFModel()
for document in corpus.documents:
    print(document.id)
    sentences = document.sentences
    output = []
    for sentence in sentences:
        sentence = model.predict(sentence)
        output.append(sentence)
    document.sentences = output
count = 0
for document in corpus.documents:
def test_unicode(self):
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    self.assertEqual(unicode, type(corpus.documents[0].sentences[0]))
def test_sentences(self):
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    self.assertEqual(list, type(corpus.documents[0].sentences))
def test_load(self):
    corpus = PlainTextCorpus()
    corpus.load(self.plaintext_folder)
    self.assertEqual(4, len(corpus.documents))
def test___init__(self):
    corpus = PlainTextCorpus()
    self.assertIsNone(corpus.documents)