Ejemplo n.º 1
0
def compare_dictionary(model_output_folder):
    """Split the compound words of a segmented corpus by dictionary membership.

    Loads the corpus stored in ``model_output_folder``, collects every token
    (sentences are split on single spaces) that contains an underscore,
    replaces the underscores with spaces and checks each resulting word
    against ``viet_dict_11K.words``.

    :param model_output_folder: folder handed to ``PlainTextCorpus.load``.
    :return: tuple ``(new_word, word_in_dictionary)`` of two sorted lists of
        unique words: those missing from the dictionary and those found in it.
    """
    corpus = PlainTextCorpus()
    corpus.load(model_output_folder)

    # Unique compound words with the "_" joiners turned back into spaces.
    words = set()
    for document in corpus.documents:
        for sentence in document.sentences:
            for token in sentence.split(" "):
                if '_' in token:
                    words.add(token.replace('_', ' '))

    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole dictionary.
    # NOTE(review): assumes the dictionary entries are hashable strings.
    dictionary = set(viet_dict_11K.words)

    new_word = sorted(word for word in words if word not in dictionary)
    word_in_dictionary = sorted(word for word in words if word in dictionary)
    return new_word, word_in_dictionary
Ejemplo n.º 2
0
 def find_punctuation(folder, file_name):
     """Write the sorted punctuation list to ``file_name`` and return it.

     Opens ``file_name`` for writing, loads the corpus in ``folder`` and
     writes every collected punctuation mark, UTF-8 encoded, on its own line.

     :param folder: folder handed to ``PlainTextCorpus.load``.
     :param file_name: path of the output file (created or truncated).
     :return: the sorted list of punctuation marks.
     """
     # The context manager closes the output file even if loading the corpus
     # or writing a line raises.
     with open(file_name, "w") as f_write:
         punctuations = []
         corpus = PlainTextCorpus()
         corpus.load(folder)
         # NOTE(review): nothing visible here fills ``punctuations`` from
         # ``corpus``, so the list stays empty - confirm whether the
         # extraction step is missing.
         punctuations = sorted(punctuations)
         for punctuation in punctuations:
             f_write.write(punctuation.encode("utf-8") + "\n")
     return punctuations
Ejemplo n.º 3
0
def get_data():
    """Load the expected corpus and the model-produced corpus.

    Both folders sit under ``data/corpus_2/test`` relative to the directory
    three levels above this file: ``output`` holds the expected documents and
    ``output_<model_name>`` the documents produced by the model.

    :return: tuple ``(expected_corpus, actual_corpus)`` of loaded
        ``PlainTextCorpus`` objects.
    """
    test_dir = join(dirname(dirname(dirname(__file__))), "data",
                    "corpus_2", "test")
    folders = (join(test_dir, "output"),
               join(test_dir, "output_%s" % model_name))

    loaded_corpora = []
    for folder in folders:
        loaded = PlainTextCorpus()
        loaded.load(folder)
        loaded_corpora.append(loaded)

    expected_corpus, actual_corpus = loaded_corpora
    return expected_corpus, actual_corpus
Ejemplo n.º 4
0
def segment_words(corpus_dir, target_dir):
    """Word-segment every document of a corpus into ``target_dir``.

    Each document of the corpus in ``corpus_dir`` is tokenized, split into
    sentences on ``" . "``, segmented with ``word_sent`` and written as UTF-8
    text to ``target_dir`` under the document id. Documents whose id is
    already listed in ``target_dir`` are skipped and counted.

    :param corpus_dir: folder handed to ``PlainTextCorpus.load``.
    :param target_dir: output folder; created when it does not exist yet.
    """
    try:
        mkdir(target_dir)
    except OSError:
        # Best effort: typically the folder already exists. Any other
        # problem with the path surfaces at ``listdir`` below.
        pass
    corpus = PlainTextCorpus()
    corpus.load(corpus_dir)
    # A set keeps the "already processed" lookup O(1) per document.
    existed_documents = set(listdir(target_dir))
    n_ignore = 0
    for document in corpus.documents:
        document_id = document.id
        if document_id in existed_documents:
            n_ignore += 1
            continue
        print("Process %s" % document_id)
        sentences = _.flatten(document.sentences)
        sentences = [tokenize(s).split(" . ") for s in sentences]
        sentences = _.flatten(sentences)
        # Empty strings are dropped before segmentation.
        segmented_sentences = [word_sent(s) for s in sentences if s != u""]
        content = convert_to_text(segmented_sentences)
        filepath = join(target_dir, document_id)
        # Close the handle deterministically so the content is flushed
        # before the next document is processed.
        with io.open(filepath, "w", encoding="utf8") as output_file:
            output_file.write(content)
    print("Ignore %s documents" % n_ignore)
Ejemplo n.º 5
0
 def test_save(self):
     """Saving the loaded corpus writes four files to the target folder."""
     corpus = PlainTextCorpus()
     corpus.load(self.plaintext_folder)
     try:
         corpus.save(self.saved_plaintext_folder)
         files = listdir(self.saved_plaintext_folder)
         self.assertEqual(4, len(files))
     finally:
         # Clean up even when saving or the assertion fails, so a failed run
         # does not leave the folder behind for the next one. Removal stays
         # best effort: errors are ignored.
         shutil.rmtree(self.saved_plaintext_folder, ignore_errors=True)
Ejemplo n.º 6
0
               "vi_ud_no_comment.conllu")
# Development and test splits of the UD data, located in ``data/ud`` under
# the directory three levels above this file.
_ud_dir = join(dirname(dirname(dirname(__file__))), "data", "ud")
dev_file = join(_ud_dir, "dev.conllu")
test_file = join(_ud_dir, "test.conllu")
# sample_ud_file = join("sample", "sample_vi_ud.conllu")
# ud_file = sample_ud_file


def extract_tokens(text):
    """Split a ``word/tag`` token into its word and its tag.

    The first group of the pattern is greedy, so the split happens at the
    last ``/`` that can be matched; the word part may itself contain slashes.

    :param text: token of the form ``word/tag``.
    :return: two-element list ``[word, tag]``.
    :raises ValueError: if the pattern does not match, i.e. no ``/`` is found.
    """
    matched = re.match("(.*)/(.*)", text, re.UNICODE)
    if matched is None:
        # ``re.match`` returns None on failure; fail with the offending token
        # instead of an AttributeError on ``None.group``.
        raise ValueError("expected a 'word/tag' token, got %r" % (text,))
    return [matched.group(1), matched.group(2)]


if __name__ == '__main__':
    corpus = PlainTextCorpus()
    corpus.load(raw_folder)
    tagged_corpus = TaggedCorpus()
    tagged_documents = []
    documents = corpus.documents
    # documents = random.sample(documents, 10)
    for document in documents:
        sentences = []
        for sentence in document.sentences:
            tagged_tokens = sentence.split()
            tagged_words = [extract_tokens(token) for token in tagged_tokens]
            tagged_words = [
                TaggedWord(tagged_word[0].replace("_", " "), tagged_word[1])
                for tagged_word in tagged_words
            ]
            sentence = TaggedSentence(tagged_words)
Ejemplo n.º 7
0
from collections import Counter

import pandas as pd

from analyze_characters import get_utf8_number, get_unicode_number
from underthesea.corpus import PlainTextCorpus
from vietnamese_normalize import vietnamese_normalize

# Character-frequency analysis: load a corpus, count the characters of its
# first 200 sentences (after normalization) and export the table to Excel.
corpus = PlainTextCorpus()
# Alternative corpora; only the uncommented ``corpus_dir`` below is used.
# corpus_dir = "D:\\PycharmProjects\\underthesea\\corpus.vinews\\vn_news\\data"
# corpus_dir = "D:\\PycharmProjects\\_NLP_DATA\\vlsp 2016\\sa\\SA2016-training_data"
corpus_dir = "D:\\PycharmProjects\\1link\\opinion_mining\\data\\foody"

corpus.load(corpus_dir)

# Flatten the per-document sentence lists in a single pass;
# ``sum(lists, [])`` copies the accumulated list on every addition.
sentences = [s for d in corpus.documents for s in d.sentences]
text = u" ".join(sentences[:200])
text = vietnamese_normalize(text)
counter = Counter(text)
df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
df.columns = ["character", "freq"]
# Code columns are derived from the character column alone, so apply the
# helpers element-wise instead of building a row object per record.
df["unicode"] = df["character"].apply(get_unicode_number)
df["utf-8"] = df["character"].apply(get_utf8_number)
df = df.sort_values(["freq"], ascending=False)
df.to_excel("analyze.xlsx", index=False)
corpus_character_sets = set(df["character"])


def load_known_characters():
    files = ["tcvn_6909_2001.xlsx", "other_characters.xlsx"]
Ejemplo n.º 8
0
# -*- coding: utf-8 -*-
from os.path import dirname
from os.path import join
import time

from model import CRFModel
from underthesea.corpus import PlainTextCorpus

# Run the CRF model over every sentence of the test input corpus and store
# the predictions back on each document.
start = time.time()
input_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus",
                    "test", "input")
output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "corpus",
                         "test", "output_crf")
# input_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "input")
# output_crf_folder = join(dirname(dirname(dirname(__file__))), "data", "test", "output")
corpus = PlainTextCorpus()
corpus.load(input_folder)
# NOTE(review): ``output`` is rebound to a plain list inside the loop below,
# so this corpus object is only kept when there are no documents - confirm
# whether it is still needed.
output = PlainTextCorpus()
model = CRFModel()
for document in corpus.documents:
    # Call form: prints the same value on Python 2 and also parses on
    # Python 3, where the print statement is a syntax error.
    print(document.id)
    sentences = document.sentences
    output = []
    for sentence in sentences:
        sentence = model.predict(sentence)
        output.append(sentence)

    # Replace the raw sentences with the model predictions.
    document.sentences = output

count = 0
for document in corpus.documents:
Ejemplo n.º 9
0
 def test_unicode(self):
     """The first sentence of the first loaded document is ``unicode``."""
     loaded = PlainTextCorpus()
     loaded.load(self.plaintext_folder)
     first_document = loaded.documents[0]
     first_sentence = first_document.sentences[0]
     self.assertEqual(unicode, type(first_sentence))
Ejemplo n.º 10
0
 def test_sentences(self):
     """``sentences`` of the first loaded document is a plain ``list``."""
     loaded = PlainTextCorpus()
     loaded.load(self.plaintext_folder)
     sentences_type = type(loaded.documents[0].sentences)
     self.assertEqual(list, sentences_type)
Ejemplo n.º 11
0
 def test_load(self):
     """Loading the plaintext fixture folder yields four documents."""
     loaded = PlainTextCorpus()
     loaded.load(self.plaintext_folder)
     document_count = len(loaded.documents)
     self.assertEqual(4, document_count)
Ejemplo n.º 12
0
 def test___init__(self):
     """A freshly constructed corpus has ``documents`` set to ``None``."""
     documents = PlainTextCorpus().documents
     self.assertIsNone(documents)