def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text
    :param filename: path to a stop-word list, one word per line
    :return: the review with stop words removed, as a single string
    """
    # 1. Split into individual words.
    words = review.split()
    # 2. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set.
    with open(filename, "r") as f3:
        stop_words = set(f3.read().splitlines())
    # 3. Remove stop words.
    meaningful_words = [w for w in words if w not in stop_words]
    # 4. Join the words back into one string separated by spaces,
    #    and return the result.
    return " ".join(meaningful_words)
Example #2
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print 'loading data in ' + dataset
    total_doc = 0
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):  # if it's a directory, push its files onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in xrange(len(info[0])):
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
                with open(os.path.join(output_dir, os.path.basename(file_name)),
                          'w', encoding='utf-8') as fw:
                    if len(clean_content) > 0:
                        fw.write(u'\n'.join(clean_content))
                    else:
                        fw.write(original_content)
                total_doc += 1
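A hedged usage sketch for the function above; both directory names are hypothetical, and `utils`, `tokenizer`, `spliter`, and `is_exist` are assumed to be provided by the surrounding module:

remove_stop_postag('data/tokenized', 'data/no_stopwords')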
Example #3
def load_data(filename):
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = question.replace("\n", "")
            s1 = ViPosTagger.postagging(unicode(
                question, encoding='utf-8'))  # POS tagging
            r1 = []
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)

            z = ' '.join(r1)
            col4.append(z)
            col1.append(label1)
            col2.append(label2)
            # col3.append(question)

        d = {"label1": col1, "label2": col2, "question": col4}

        train = pd.DataFrame(d)
        if filename == 'datavn/train':
            joblib.dump(train, 'model_pos/train_rf_pos12.pkl')
        else:
            joblib.dump(train, 'model_pos/test_rf_pos12.pkl')
    return train
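A minimal usage sketch, assuming the 'datavn/train' file hard-coded above exists and each line holds two labels followed by the question text, as the split implies:

train = load_data('datavn/train')
print train.head()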
Example #4
def load_data(filename, dict_path):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []

    with open(filename, 'r') as f, open(dict_path, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question,
                                       'datavn/question_stopwords.txt')
            # question = review_add_pos(question,'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)

        ngram = ngrams_array(col3, 2)  # dict of n-grams and their occurrence counts
        dict_arr = []  # list of n-grams with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
        col4 = []
        for q in col3:
            r1 = []
            r2 = []
            q = review_to_words2(q, dict_path, 2)  # q is a single sentence
            q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
            s1 = ViPosTagger.postagging(
                ViTokenizer.tokenize(unicode(
                    q, encoding='utf-8')))  # POS tagging
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            s2 = ' '.join(s1[1])  # POS tags of the sentence as one string, e.g. "N V E N"
            q2 = [' '.join(x) for x in ngrams(q, 2)]  # q2: list of word 2-grams
            s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of tag 2-grams
            q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
            s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
            for i1, i2 in zip(q3, s3):
                t2 = i1 + "_" + i2
                r2.append(t2)
            y = r1 + r2
            # z1 = [' '.join(x) for x in y]
            z = ' '.join(y)
            col4.append(z)
            # col4.append(q)
        d = {"label1": col1, "label2": col2, "question": col4}
        train = pd.DataFrame(d)
    return train
Example #5
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        stop_words = set(f3.read().splitlines())

    meaningful_words = [w for w in words if w not in stop_words]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    pos_tags = tup[1]
    return " ".join(words_list + pos_tags)
Example #6
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print postaged_sent
    test_arr = list(zip(postaged_sent[0], postaged_sent[1]))  # (token, tag) pairs
    print test_arr
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
Example #7
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print 'loading data in ' + tokenized_dataset
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % (file_path)),
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # keep only tokens tagged as nouns (u'N')
                    tokens = [w for w, t in zip(tag[0], tag[1]) if t == u'N']
                    update_count_tokens(statistic, label, tokens)
Example #8
def ner_crf(question):
    text = ViPosTagger.postagging(question)
    detect = []
    # pair each token with its POS tag
    ar = list(zip(text[0], text[1]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/crf_ner_no_accent/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for (word, tag), label in zip(detect[0], y_detect[0]):
        pred.append((word, label))
    return pred
Example #9
def detect_entity(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    # pair each token with its POS tag
    ar = list(zip(text[0], text[1]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/ner_crf/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for (word, tag), label in zip(detect[0], y_detect[0]):
        word = word.replace("_", " ")
        pred.append((word, label))
    return pred
Example #10
    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()
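Note that every other example on this page calls ViTokenizer.tokenize and ViPosTagger.postagging as static methods, so instantiating the classes as above is optional; a minimal sketch of the static style:

from pyvi.pyvi import ViTokenizer, ViPosTagger

tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn cơm"))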
Example #11
# -*- coding: utf8 -*-
import codecs
import sys

sys.stdout = codecs.getwriter('utf_8')(sys.stdout)

sys.stdin = codecs.getreader('utf_8')(sys.stdin)
from pyvi.pyvi import ViTokenizer, ViPosTagger

# print(ViTokenizer.tokenize(u"Tôi ăn xôi xéo"))

output = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn rất nhiều cơm"))
print(output[0][0])
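As the indexing above shows, postagging returns a pair of parallel lists (tokens, POS tags); a small sketch pairing them up:

for word, tag in zip(output[0], output[1]):
    print(u'%s\t%s' % (word, tag))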
Example #12
from pyvi.pyvi import ViTokenizer, ViPosTagger

# with open('test.txt', 'r') as f:
#     tf = f.read().splitlines()
# l = []
# ar = []
# for i in range(len(tf)):
#     if tf[i] != "":
#         a = tf[i].split(' ')
#         l.append(tuple(a))
# ar.append(l)
# print(ar)

with open('predict.txt', 'r') as f:
    tf = f.read()
    text = ViPosTagger.postagging(ViTokenizer.tokenize(tf))
    test = []
    ar = list(zip(text[0], text[1]))  # (token, POS tag) pairs
    test.append(ar)


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
Example #13
def extract(
    id          ="text",
    content     ="text",
    part_index  ="int",
    chap_index  ="int",
    sec_index   ="int",
    law_index   ="int",
    item_index  ="int",
    start_index ="int",
    end_index   ="int",
    ):
    sent_index = 0
    for s in content[start_index:end_index].split("\n"):
        if s != "":
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            lent = divlaw.lenIterator(it)
            # re-create the iterator, since lenIterator consumed it
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            listIndex = []
            if item_index is None:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, 0)
            else:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, item_index + 1)
            if lent > 0:
                for i in it:
                    listIndex.append(i.start())
                if (len(s) - i.end()) > 5:
                    listIndex.append(i.end())
                    lent += 1
            else:
                listIndex.append(0)
            for j in range(lent):
                if j != lent - 1:
                    string = handle_string.to_unicode(s[listIndex[j]:listIndex[j + 1]])
                else:
                    string = handle_string.to_unicode(s[listIndex[j]:])
                string = string.replace("\\", '')
                # tag once and unpack, instead of tagging the same string twice
                tokenize, pos_tag = ViPosTagger.postagging(
                    ViTokenizer.tokenize(string))
                sent_index += 1
                tk = [token.encode('utf-8') for token in tokenize]
                if '' in tk:
                    continue
                yield [
                    id,
                    position,
                    sent_index - 1,
                    " ".join(tk),
                    tk,
                    pos_tag,
                ]
Example #14
# -*- encoding: utf8 -*-
from pyvi.pyvi import ViPosTagger, ViTokenizer

a = ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
print a
Example #15
def add_pos_sen(review):
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(review,
                                     encoding='utf-8')))  # POS tagging
Example #16
# -*- coding=utf-8 -*-
from pyvi.pyvi import ViTokenizer, ViPosTagger
import requests

test_sent = "Đệ nhất phu nhân Mỹ Melania Trump cảm ơn Chelsea Clinton, con gái của cựu Tổng thống Bill Clinton và cựu Ngoại trưởng Hillary Clinton, vì đã lên tiếng bênh vực con trai 11 tuổi của bà trước những ý kiến trái chiều của dư luận."

url = "http://ai.topica.vn:9119/get_mlbka"

headers = {
    'cache-control': "no-cache",
    'postman-token': "dd327f89-2a5f-bf16-c115-590b590e32c3"
}

response = requests.request("POST", url, data=test_sent, headers=headers)

tach_tu_anh_son = response.text  # word segmentation returned by the service

postag_as = ViPosTagger.postagging(tach_tu_anh_son)
y = zip(postag_as[0], postag_as[1])
print repr(y).decode('unicode-escape')

postaged_sent = ViPosTagger.postagging(
    ViTokenizer.tokenize(test_sent.decode('utf-8')))
x = zip(postaged_sent[0], postaged_sent[1])
print repr(x).decode('unicode-escape')