def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text
    :param filename: path to a stop-word list, one word per line
    :return: the review with stop words removed, as a single string
    """
    # 1. Split into individual words.
    words = review.split()
    # 2. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set.
    with open(filename, "r") as f3:
        stop_words = set(f3.read().splitlines())
    # 3. Remove stop words.
    meaningful_words = [w for w in words if w not in stop_words]
    # 4. Join the words back into one string separated by spaces,
    #    and return the result.
    return " ".join(meaningful_words)
Example #2
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print 'loading data in ' + dataset
    total_doc = 0
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):  # if it's a directory, push its files onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in xrange(len(info[0])):
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
                with open(os.path.join(output_dir, os.path.basename(file_name)),
                          'w', encoding='utf-8') as fw:
                    if len(clean_content) > 0:
                        fw.write(u'\n'.join(clean_content))
                    else:
                        fw.write(original_content)
                total_doc += 1
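A hedged usage sketch for the function above; both directory names are hypothetical, and `utils`, `tokenizer`, `spliter`, and `is_exist` are assumed to be provided by the surrounding module:

remove_stop_postag('data/tokenized', 'data/no_stopwords')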
Example #3
def load_data(filename):
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = question.replace("\n", "")
            s1 = ViPosTagger.postagging(unicode(
                question, encoding='utf-8'))  # POS tagging
            r1 = []
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)

            z = ' '.join(r1)
            col4.append(z)
            col1.append(label1)
            col2.append(label2)
            # col3.append(question)

        d = {"label1": col1, "label2": col2, "question": col4}

        train = pd.DataFrame(d)
        if filename == 'datavn/train':
            joblib.dump(train, 'model_pos/train_rf_pos12.pkl')
        else:
            joblib.dump(train, 'model_pos/test_rf_pos12.pkl')
    return train
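A minimal usage sketch, assuming the 'datavn/train' file hard-coded above exists and each line holds two labels followed by the question text, as the split implies:

train = load_data('datavn/train')
print train.head()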
Example #4
def load_data(filename, dict_path):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []

    with open(filename, 'r') as f, open(dict_path, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question,
                                       'datavn/question_stopwords.txt')
            # question = review_add_pos(question,'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)

        ngram = ngrams_array(col3, 2)  # dict of n-grams and their occurrence counts
        dict_arr = []  # list of n-grams with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
        col4 = []
        for q in col3:
            r1 = []
            r2 = []
            q = review_to_words2(q, dict_path, 2)  # q is a single sentence
            q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
            s1 = ViPosTagger.postagging(
                ViTokenizer.tokenize(unicode(
                    q, encoding='utf-8')))  # POS tagging
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            s2 = ' '.join(s1[1])  # POS tags of the sentence as one string, e.g. "N V E N"
            q2 = [' '.join(x) for x in ngrams(q, 2)]  # q2: list of word 2-grams
            s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of tag 2-grams
            q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
            s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
            for i1, i2 in zip(q3, s3):
                t2 = i1 + "_" + i2
                r2.append(t2)
            y = r1 + r2
            # z1 = [' '.join(x) for x in y]
            z = ' '.join(y)
            col4.append(z)
            # col4.append(q)
        d = {"label1": col1, "label2": col2, "question": col4}
        train = pd.DataFrame(d)
    return train
Example #5
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        stop_words = set(f3.read().splitlines())

    meaningful_words = [w for w in words if w not in stop_words]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    pos_tags = tup[1]
    return " ".join(words_list + pos_tags)
Example #6
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print postaged_sent
    test_arr = list(zip(postaged_sent[0], postaged_sent[1]))  # (token, tag) pairs
    print test_arr
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
Example #7
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print 'loading data in ' + tokenized_dataset
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % (file_path)),
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # keep only tokens tagged as nouns (u'N')
                    tokens = [w for w, t in zip(tag[0], tag[1]) if t == u'N']
                    update_count_tokens(statistic, label, tokens)
Example #8
def ner_crf(question):
    text = ViPosTagger.postagging(question)
    detect = []
    # pair each token with its POS tag
    ar = list(zip(text[0], text[1]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/crf_ner_no_accent/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for (word, tag), label in zip(detect[0], y_detect[0]):
        pred.append((word, label))
    return pred
Example #9
def detect_entity(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    # pair each token with its POS tag
    ar = list(zip(text[0], text[1]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/ner_crf/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for (word, tag), label in zip(detect[0], y_detect[0]):
        word = word.replace("_", " ")
        pred.append((word, label))
    return pred
Example #10
    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()
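Note that every other example on this page calls ViTokenizer.tokenize and ViPosTagger.postagging as static methods, so instantiating the classes as above is optional; a minimal sketch of the static style:

from pyvi.pyvi import ViTokenizer, ViPosTagger

tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn cơm"))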
Example #11
# -*- coding: utf8 -*-
import codecs
import sys

sys.stdout = codecs.getwriter('utf_8')(sys.stdout)

sys.stdin = codecs.getreader('utf_8')(sys.stdin)
from pyvi.pyvi import ViTokenizer, ViPosTagger

# print(ViTokenizer.tokenize(u"Tôi ăn xôi xéo"))

output = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn rất nhiều cơm"))
print(output[0][0])
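As the indexing above shows, postagging returns a pair of parallel lists (tokens, POS tags); a small sketch pairing them up:

for word, tag in zip(output[0], output[1]):
    print(u'%s\t%s' % (word, tag))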
Example #12
from pyvi.pyvi import ViTokenizer, ViPosTagger

# with open('test.txt', 'r') as f:
#     tf = f.read().splitlines()
# l = []
# ar = []
# for i in range(len(tf)):
#     if tf[i] != "":
#         a = tf[i].split(' ')
#         l.append(tuple(a))
# ar.append(l)
# print(ar)

with open('predict.txt', 'r') as f:
    tf = f.read()
    text = ViPosTagger.postagging(ViTokenizer.tokenize(tf))
    test = []
    ar = list(zip(text[0], text[1]))  # (token, POS tag) pairs
    test.append(ar)


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
Example #13
def extract(
    id          ="text",
    content     ="text",
    part_index  ="int",
    chap_index  ="int",
    sec_index   ="int",
    law_index   ="int",
    item_index  ="int",
    start_index ="int",
    end_index   ="int",
    ):
    sent_index = 0
    for s in content[start_index:end_index].split("\n"):
        if s != "":
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            lent = divlaw.lenIterator(it)
            # re-create the iterator, since lenIterator consumed it
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            listIndex = []
            if item_index is None:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, 0)
            else:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, item_index + 1)
            if lent > 0:
                for i in it:
                    listIndex.append(i.start())
                if (len(s) - i.end()) > 5:
                    listIndex.append(i.end())
                    lent += 1
            else:
                listIndex.append(0)
            for j in range(lent):
                if j != lent - 1:
                    string = handle_string.to_unicode(s[listIndex[j]:listIndex[j + 1]])
                else:
                    string = handle_string.to_unicode(s[listIndex[j]:])
                string = string.replace("\\", '')
                # tag once and unpack, instead of tagging the same string twice
                tokenize, pos_tag = ViPosTagger.postagging(
                    ViTokenizer.tokenize(string))
                sent_index += 1
                tk = [token.encode('utf-8') for token in tokenize]
                if '' in tk:
                    continue
                yield [
                    id,
                    position,
                    sent_index - 1,
                    " ".join(tk),
                    tk,
                    pos_tag,
                ]
Example #14
# -*- encoding: utf8 -*-
from pyvi.pyvi import ViPosTagger, ViTokenizer

a = ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
print a
Example #15
def add_pos_sen(review):
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(review,
                                     encoding='utf-8')))  # POS tagging
Example #16
# -*- coding=utf-8 -*-
from pyvi.pyvi import ViTokenizer, ViPosTagger
import requests

test_sent = "Đệ nhất phu nhân Mỹ Melania Trump cảm ơn Chelsea Clinton, con gái của cựu Tổng thống Bill Clinton và cựu Ngoại trưởng Hillary Clinton, vì đã lên tiếng bênh vực con trai 11 tuổi của bà trước những ý kiến trái chiều của dư luận."

url = "http://ai.topica.vn:9119/get_mlbka"

headers = {
    'cache-control': "no-cache",
    'postman-token': "dd327f89-2a5f-bf16-c115-590b590e32c3"
}

response = requests.request("POST", url, data=test_sent, headers=headers)

tach_tu_anh_son = response.text  # word segmentation returned by the service

postag_as = ViPosTagger.postagging(tach_tu_anh_son)
y = zip(postag_as[0], postag_as[1])
print repr(y).decode('unicode-escape')

postaged_sent = ViPosTagger.postagging(
    ViTokenizer.tokenize(test_sent.decode('utf-8')))
x = zip(postaged_sent[0], postaged_sent[1])
print repr(x).decode('unicode-escape')