def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text (UTF-8 byte string)
    :param filename: path to a stop-word list, one word per line
    :return: meaningful_words joined into a single string
    """
    # 1. Split into individual words.
    # words = review.lower().split()
    words = review.split()
    # 2. Load the stop-word list. (Searching a set is much faster than
    #    searching a list, so a set could be used here instead.)
    with open(filename, "r") as f3:
        dict_data = f3.read()
    array = dict_data.splitlines()
    # 3. Remove stop words.
    meaningful_words = [w for w in words if w not in array]
    # 4. Join the words back into one string separated by spaces and return.
    return " ".join(meaningful_words)
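# Usage sketch (assumptions: 'datavn/question_stopwords.txt' is the stop-word
# list referenced by load_data below, and the input is a plain byte string;
# the sample question is only illustrative).
if __name__ == '__main__':
    sample_question = 'ai la tac gia cua truyen kieu ?'
    print review_to_words(sample_question, 'datavn/question_stopwords.txt')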
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print 'loading data in ' + dataset
    total_doc = 0
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            # If this is a directory, push its contents onto the stack.
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in xrange(len(info[0])):
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
            with open(os.path.join(output_dir, os.path.basename(file_name)),
                      'w', encoding='utf-8') as fw:
                if len(clean_content) > 0:
                    fw.write(u'\n'.join(clean_content))
                else:
                    fw.write(original_content)
            total_doc += 1
def load_data(filename):
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = question.replace("\n", "")
            s1 = ViPosTagger.postagging(
                unicode(question, encoding='utf-8'))  # POS tagging
            r1 = []
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            z = ' '.join(r1)
            col4.append(z)
            col1.append(label1)
            col2.append(label2)
            # col3.append(question)
    d = {"label1": col1, "label2": col2, "question": col4}
    train = pd.DataFrame(d)
    if filename == 'datavn/train':
        joblib.dump(train, 'model_pos/train_rf_pos12.pkl')
    else:
        joblib.dump(train, 'model_pos/test_rf_pos12.pkl')
    return train
def load_data(filename, dict):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f, open(dict, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question,
                                       'datavn/question_stopwords.txt')
            # question = review_add_pos(question, 'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)
        ngram = ngrams_array(col3, 2)  # dictionary of words and their occurrence counts
        dict_arr = []  # list of words with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
        col4 = []
        for q in col3:
            r1 = []
            r2 = []
            q = review_to_words2(q, dict, 2)  # q is a single sentence
            q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
            s1 = ViPosTagger.postagging(
                ViTokenizer.tokenize(unicode(q, encoding='utf-8')))  # POS tagging
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            s2 = ' '.join(i for i in s1[1])  # POS tags of the sentence as a string, e.g. "N V E N"
            q2 = [' '.join(x) for x in ngrams(q, 2)]    # q2: list of word 2-grams
            s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of tag 2-grams
            q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
            s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
            for i1, i2 in zip(q3, s3):
                t2 = i1 + "_" + i2
                r2.append(t2)
            y = r1 + r2
            # z1 = [' '.join(x) for x in y]
            z = ' '.join(y)
            col4.append(z)
            # col4.append(q)
    d = {"label1": col1, "label2": col2, "question": col4}
    train = pd.DataFrame(d)
    return train
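# The helpers ngrams() and ngrams_array() used by load_data above are not shown
# in this file. The sketch below is an assumption of what they roughly do
# (token n-grams and n-gram counts over a list of sentences); the underscored
# names are illustrative and the real helpers may differ in detail.
def _ngrams_sketch(text, n):
    # Return the list of n-gram tuples over whitespace tokens.
    tokens = text.split()
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def _ngrams_array_sketch(sentences, n):
    # Count n-gram occurrences across a list of sentences.
    counts = {}
    for sen in sentences:
        for gram in _ngrams_sketch(sen, n):
            key = ' '.join(gram)
            counts[key] = counts.get(key, 0) + 1
    return counts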
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        dict_data = f3.read()
    array = dict_data.splitlines()
    meaningful_words = [w for w in words if w not in array]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    a = tup[1]
    c = words_list + a
    return " ".join(c)
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print postaged_sent
    test_arr = []
    for i in xrange(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print test_arr
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
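# Usage sketch: 'model/crf.model' is a hypothetical path to a CRF trained with
# sklearn-crfsuite and saved via joblib; both the path and the joblib import
# are assumptions, and the sentence is only an illustration.
if __name__ == '__main__':
    from sklearn.externals import joblib
    crf = joblib.load('model/crf.model')
    for (word, postag), label in test_ner(crf, u'Ha Noi la thu do cua Viet Nam'):
        print word, postag, label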
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print 'loading data in ' + tokenized_dataset
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % (file_path)),
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # Keep only tokens tagged as nouns (N).
                    tokens = [tag[0][i] for i in xrange(len(tag[0]))
                              if tag[1][i] == u'N']
                    update_count_tokens(statistic, label, tokens)
def ner_crf(question):
    text = ViPosTagger.postagging(question)
    detect = []
    ar = []
    for i in range(len(text[0])):
        ar.append((text[0][i], text[1][i]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/crf_ner_no_accent/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        v = y_detect[0][i]
        pred.append((k, v))
    return pred
def detect_entity(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    ar = []
    for i in range(len(text[0])):
        ar.append((text[0][i], text[1][i]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/ner_crf/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        k = k.replace("_", " ")  # restore spaces inside multi-word tokens
        v = y_detect[0][i]
        pred.append((k, v))
    return pred
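# Usage sketch: relies on the './adapter/ner_crf/crf.model' file already opened
# by detect_entity above; the question string is only an illustration.
if __name__ == '__main__':
    for token, label in detect_entity(u'Truong dai hoc Bach Khoa Ha Noi o dau ?'):
        print token, label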
# -*- coding: utf8 -*-
import codecs
import sys

sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
sys.stdin = codecs.getreader('utf_8')(sys.stdin)

import pyvi
from pyvi.pyvi import ViTokenizer, ViPosTagger

# print(ViTokenizer.tokenize(u"Tôi ăn xôi xéo"))
output = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn rất nhiều cơm"))
print(output[0][0])
from pyvi.pyvi import ViTokenizer, ViPosTagger

# with open('test.txt', 'r') as f:
#     tf = f.read().splitlines()
#     l = []
#     ar = []
#     for i in range(len(tf)):
#         if tf[i] != "":
#             a = tf[i].split(' ')
#             l.append(tuple(a))
#     ar.append(l)
# print(ar)

with open('predict.txt', 'r') as f:
    tf = f.read()

text = ViPosTagger.postagging(ViTokenizer.tokenize(tf))
test = []
ar = []
for i in range(len(text[0])):
    l = []
    l.append(text[0][i])
    l.append(text[1][i])
    ar.append(tuple(l))
test.append(ar)


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
def extract(
        id="text",
        content="text",
        part_index="int",
        chap_index="int",
        sec_index="int",
        law_index="int",
        item_index="int",
        start_index="int",
        end_index="int",
):
    sent_index = 0
    for s in content[start_index:end_index].split("\n"):
        if s != "":
            # Find candidate sentence boundaries within the line.
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            lent = divlaw.lenIterator(it)
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            listIndex = []
            if item_index is None:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, 0)
            else:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, item_index + 1)
            if lent > 0:
                for i in it:
                    listIndex.append(i.start())
                    if (len(s) - i.end()) > 5:
                        listIndex.append(i.end())
                        lent += 1
            else:
                listIndex.append(0)
            for j in range(0, lent):
                if j != (lent - 1):
                    string = handle_string.to_unicode(s[listIndex[j]:listIndex[j + 1]])
                else:
                    string = handle_string.to_unicode(s[listIndex[j]:])
                string = string.replace("\\", '')
                # Tokenize and POS-tag the segment once, then split the result.
                tagged = ViPosTagger.postagging(ViTokenizer.tokenize(string))
                tokenize = tagged[0]
                pos_tag = tagged[1]
                tk = []
                sent_index += 1
                for token in tokenize:
                    tk.append(token.encode('utf-8'))
                if '' in tk:
                    continue
                yield [id, position, sent_index - 1, " ".join(tk), tk, pos_tag]
# -*- encoding: utf8 -*-
from pyvi.pyvi import ViPosTagger, ViTokenizer

a = ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
print a
def add_pos_sen(review):
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(review, encoding='utf-8')))  # POS tagging
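    # The original function body ends after the call above. The return below is
    # a hedged completion (an assumption, not the original code), mirroring
    # review_add_pos: the tokenized words followed by their POS tags.
    return " ".join(tup[0] + tup[1])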
# -*- coding=utf-8 -*-
from pyvi.pyvi import ViTokenizer, ViPosTagger
import requests

test_sent = "Đệ nhất phu nhân Mỹ Melania Trump cảm ơn Chelsea Clinton, con gái của cựu Tổng thống Bill Clinton và cựu Ngoại trưởng Hillary Clinton, vì đã lên tiếng bênh vực con trai 11 tuổi của bà trước những ý kiến trái chiều của dư luận."

url = "http://ai.topica.vn:9119/get_mlbka"
headers = {
    'cache-control': "no-cache",
    'postman-token': "dd327f89-2a5f-bf16-c115-590b590e32c3"
}
response = requests.request("POST", url, data=test_sent, headers=headers)
tach_tu_anh_son = response.text

# POS-tag the externally tokenized text and the locally tokenized text,
# then print both for comparison.
postag_as = ViPosTagger.postagging(tach_tu_anh_son)
y = zip(postag_as[0], postag_as[1])
print repr(y).decode('unicode-escape')

postaged_sent = ViPosTagger.postagging(
    ViTokenizer.tokenize(test_sent.decode('utf-8')))
x = zip(postaged_sent[0], postaged_sent[1])
print repr(x).decode('unicode-escape')