def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text
    :param filename: path to a stop-word file, one word per line
    :return: the review with stop words removed, as a single string
    """
    # 1. Split into individual words
    # words = review.lower().split()
    words = review.split()
    # 2. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    with open(filename, "r") as f3:
        stop_words = set(f3.read().splitlines())
    # 3. Remove stop words
    meaningful_words = [w for w in words if w not in stop_words]

    # 4. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

Example #2
def predict_ex(mes):
    vectorizer = load_model('model/vectorizer.pkl')
    uni_big = load_model('model/uni_big.pkl')
    if uni_big is None:
        training1()
    uni_big = load_model('model/uni_big.pkl')
    print "---------------------------"
    print "Training"
    print "---------------------------"
    t0 = time.time()
    # iterate over classifiers

    mes = unicode(mes, encoding='utf-8')
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = list_words(test_message)
    clean_test_reviews = []
    clean_test_reviews.append(test_message)
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = uni_big.predict(test_data_features)[0]
    return s
Example #3
def predict_ex(mes):
    print mes
    vectorizer = load_model('model_balance/vectorizer_tfidf12.pkl')
    clf = load_model('model_balance/tfidf12.pkl')
    clf2 = load_model('model_balance/tfidf_fine12.pkl')
    if clf is None or clf2 is None:
        training1()
        clf = load_model('model_balance/tfidf12.pkl')
        clf2 = load_model('model_balance/tfidf_fine12.pkl')

    mes = unicodedata.normalize("NFC", mes.strip())
    mes = clean_str_vn(mes)
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = list_words(test_message)
    clean_test_reviews = []
    clean_test_reviews.append(test_message)
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = clf.predict(test_data_features)[0]
    s2 = clf2.predict(test_data_features)[0]
    return s + " " + s2
Example #4
def predict_ex(mes):
    vectorizer = load_model('model/vectorizer.pkl')
    # vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7, min_df=2, max_features=1000)
    if vectorizer is None:
        vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                     max_df=0.7,
                                     min_df=2,
                                     max_features=1000)

    clf = load_model('model/clf.pkl')
    if clf is None:
        training()

    clf = load_model('model/clf.pkl')
    mes = unicode(mes, encoding='utf-8')
    test_message = ViTokenizer.tokenize(mes).encode('utf8')
    test_message = clean_str_vn(test_message)
    test_message = review_to_words(test_message)
    clean_test_reviews = []
    clean_test_reviews.append(test_message)
    d2 = {"message": clean_test_reviews}
    test2 = pd.DataFrame(d2)
    test_text2 = test2["message"].values.astype('str')
    test_data_features = vectorizer.transform(test_text2)
    test_data_features = test_data_features.toarray()
    # print test_data_features
    s = clf.predict(test_data_features)
    s2 = np.array(s)
    s3 = str(s2[0])
    return s3
Example #5
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # mostly European languages
        words = sent.split()

    return words
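A brief usage sketch, assuming pyvi is installed and the module-level lcode has been set to 'vi'; the sample sentence is borrowed from a commented line in Example #29:

lcode = 'vi'
print(word_segment(u"Trường đại học Bách Khoa Hà Nội"))
# a list of tokens; multi-syllable Vietnamese words come back joined by underscores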
Example #6
def load_text(doc):
    dataset = {'target_names': [], 'data': [], 'target': []}
    content = doc.lower()
    rx = re.compile(r"[^\W\d_]+", re.UNICODE)
    content = " ".join(rx.findall(content))
    dataset['data'].append(ViTokenizer.tokenize(content))
    return dataset
Example #7
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split() 
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()        
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False)) 
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # mostly European languages
        words = sent.split()
    
    return words
Example #8
def my_tokenize(posts_str):
    posts = posts_str.split('(^-^)')

    tokens = []
    for post in posts:
        post = utils.icons(post)
        tokens.extend(ViTokenizer.tokenize(post).split(' '))
    return tokens
Example #9
def word_segment(root):
    directory = 'seg/%s' % get_container_folder(root)
    ut.create_folder(directory)
    files = [
        f for f in os.listdir(root) if os.path.isfile('%s/%s' % (root, f))
    ]
    total = len(files)
    for index, f in enumerate(files):
        path = '%s/%s' % (root, f)
        content = ut.load_file(path)
        if len(content) >= 3:
            title = content[0].replace('\n', '')
            par = content[2].replace('\n', '')
            title = ViTokenizer.tokenize(unicode(title, 'UTF-8'))
            par = ViTokenizer.tokenize(unicode(par, 'UTF-8'))
            ut.save_file_utf8('%s/%s' % (directory, f), title + '\n' + par)
        ut.update_progress((index + 1) * 1.0 / total)
Example #10
def tokenize(filename, outname, delimiter="\t"):
    with open(filename, "r", encoding="utf-8") as source, open(outname, "w", encoding="utf-8") as target:
        for i, line in enumerate(source):
            print(i)
            tokens = line.strip().split(delimiter)
            for j in [1, 3, 7, 8]:
                tokens[j] = ViTokenizer.tokenize(tokens[j])
            target.write(delimiter.join(tokens) + "\n")
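A hedged call sketch with hypothetical file names; columns 1, 3, 7 and 8 are assumed to be the free-text fields of whatever tab-separated format the source corpus uses:

tokenize("corpus_raw.tsv", "corpus_tokenized.tsv", delimiter="\t")  # hypothetical paths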
Example #11
def vitokenizer(input_text):
    input_text = unicoded(input_text)
    input_text = ViTokenizer.tokenize(input_text)

    # Turn the result into a format usable by Vectorizer(tokenizer=)
    input_text = input_text.split()
    input_text = [x.replace('_', ' ') for x in input_text]
    return input_text
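A sketch of how vitokenizer could be plugged into scikit-learn as a custom tokenizer, assuming sklearn is installed and the project's unicoded helper is importable; the two-document corpus reuses sample sentences from Example #28:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [u"Hà Nội Phở Cháo Lòng Hà Nội Cháo Trai",
          u"Sài Gòn Hủ Tiếu Bánh Bò Phở Bún Nem"]
vectorizer = TfidfVectorizer(tokenizer=vitokenizer)
X = vectorizer.fit_transform(corpus)
print(X.shape)  # (2, vocabulary size)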
Example #12
def load_data(filename, dict_file):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []

    with open(filename, 'r') as f, open(dict_file, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question,
                                       'datavn/question_stopwords.txt')
            # question = review_add_pos(question,'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)

        ngram = ngrams_array(col3, 2)  # dictionary of words and their occurrence counts
        dict_arr = []  # list of words with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
        col4 = []
        for q in col3:
            r1 = []
            r2 = []
            q = review_to_words2(q, dict_file, 2)  # q is a single sentence
            q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
            s1 = ViPosTagger.postagging(
                ViTokenizer.tokenize(unicode(q, encoding='utf-8')))  # POS tagging
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            # POS tags of the sentence as a string, e.g. "N V E N"
            s2 = ' '.join(i for i in s1[1])
            q2 = [' '.join(x) for x in ngrams(q, 2)]  # q2: list of word 2-grams
            s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of tag 2-grams
            q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
            s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
            for i1, i2 in zip(q3, s3):
                t2 = i1 + "_" + i2
                r2.append(t2)
            y = r1 + r2
            # z1 = [' '.join(x) for x in y]
            z = ' '.join(y)
            col4.append(z)
            # col4.append(q)
        d = {"label1": col1, "label2": col2, "question": col4}
        train = pd.DataFrame(d)
    return train
Example #13
    def get_entity(sentence):
        entity_list = [line.rstrip('\n') for line in open('data/entity.dat')]
        sentence_words = ViTokenizer.tokenize(sentence).split(' ')

        entity = [
            stemmer.stem(word.lower()) for word in sentence_words
            if word in entity_list
        ]
        return entity
Example #14
def process_chunk(chunk):
    print "Process chunk of size " + str(len(chunk))
    ret = []
    for line in chunk:
        line = line.strip()
        if len(line) > 0:
            out = ViTokenizer.tokenize(line)
            ret.append(out)
    return ret
Example #15
def token_data(raw_texts, max_sent_length):
    output_texts = []
    len_texts = []
    token_texts = sent_tokenize(raw_texts)
    for text in token_texts:
        token_text = ViTokenizer.tokenize(text).split()
        len_text = len(token_text)
        len_texts.append(len_text)
        output_texts += [token_text[i:i + max_sent_length] for i in xrange(0, len(token_text), max_sent_length)]
    return output_texts, len_texts
Example #16
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X, y=None, **fit_params):
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
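A minimal usage sketch, assuming X is a pandas Series of raw Vietnamese strings (two sentences reused from the sample data in Example #28):

import pandas as pd

X = pd.Series([u"Hà Nội Bún Chả Phở Ô Mai Lẩu Ếch",
               u"Sài Gòn Hủ Tiếu Bánh Bò Phở Bún Nem"])
transformer = FeatureTransformer()
print(transformer.fit(X).transform(X).tolist())  # tokenized strings, compound words joined by '_'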
Example #17
def clean_up_sentence(sentence):
    ignore_words = ['?', '!', ',', '.', 'xin_lỗi', 'và', 'ạ']
    sentence_words = w = ViTokenizer.tokenize(sentence).split(' ')

    sentence_words = [
        stemmer.stem(word.lower()) for word in sentence_words
        if word not in ignore_words
    ]
    sentence_words = ngrams(w, 4, [])

    return sentence_words
Example #18
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        dict_data = f3.read()
        array = dict_data.splitlines()

    meaningful_words = [w for w in words if w not in array]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    a = tup[1]
    c = words_list + a
    return " ".join(c)
Example #19
def load_dataset(folder):
    dataset = {'target_names': [], 'data': [], 'target': []}
    print('loading dataset')
    for root, dirs, files in os.walk(folder, topdown=False):
        position = 0
        for name in dirs:
            subdir = os.path.join(root, name)
            dataset['target_names'].append(name)
            filesPath = get_filepaths(subdir)
            for filePath in filesPath:
                with io.open(filePath, mode="r", encoding="UTF8") as file:
                    content = file.read().lower()
                    rx = re.compile(r"[^\W\d_]+", re.UNICODE)
                    content = " ".join(rx.findall(content))
                    dataset['data'].append(ViTokenizer.tokenize(content))
                    dataset['target'].append(position)
            position += 1

    return dataset
Example #20
def clean_doc(question):

    question = regex_email(question)
    question = regex_phone_number(question)
    question = regex_link(question)

    if type(question) != unicode:
        question = unicode(question, encoding='utf-8')
    question = accent(question)
    # question = tokenizer.predict(question)  # automatically adds a '.' at the end of the sentence
    question = ViTokenizer.tokenize(question)
    print question
    rm_junk_mark = re.compile(ur'[?,\.\n]')
    normalize_special_mark = re.compile(
        ur'(?P<special_mark>[\.,\(\)\[\]\{\};!?:“”\"\'/])')
    question = normalize_special_mark.sub(u' \g<special_mark> ', question)
    question = rm_junk_mark.sub(u'', question)
    question = re.sub(' +', ' ', question)  # collapse multiple spaces into one
    return question
Example #21
def nb():
    if request.method == 'GET':
        return render_template('index.html')
    else:
        try:
            document = request.form['document']
            document = ViTokenizer.tokenize(document)
            if document.strip() == '':
                return render_template('index.html',
                                       message='Please enter your document.')
            print(document)
            message = LABELS[nb_model.detect_one(document)]
            print(message)
            return render_template('index.html',
                                   message=message,
                                   document=document)
        except Exception as e:
            traceback.print_exc()
            return render_template(
                'index.html',
                message='Check error. See log file for detail.',
                document=document)
Example #22
def ner_crf(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    ar = []
    for i in range(len(text[0])):
        l = []
        l.append(text[0][i])
        l.append(text[1][i])
        ar.append(tuple(l))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        v = y_detect[0][i]
        kv = []
        kv.append(k)
        kv.append(v)
        pred.append(tuple(kv))
    return pred
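A minimal call sketch, assuming a trained crf.model file sits next to the script and the sent2features helper is defined elsewhere in the project:

print(ner_crf(u"Trường đại học Bách Khoa Hà Nội"))
# a list of (word, predicted_label) tuples, one per token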
Example #23
def main(argv):
    input_file = 'input.txt'
    output_file = 'output.txt'
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'test.py -i <inputfile> -o <outputfile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            input_file = arg
        elif opt in ("-o", "--ofile"):
            output_file = arg
    if not exists(input_file):
        print "Cannot open", input_file
        return
    content = open(input_file, "r").read()
    content = content.decode("utf-8")
    output = ViTokenizer.tokenize(content)
    output = output.encode("utf-8")
    open(output_file, "w").write(output)
Example #24
def alt(array):
    files = []
    tfidf = []
    wordDict = []
    newA = []
    tf = []
    q = []
    z = []
    u_neg = []
    u_pos = []
    u_test = []
    count_neg = 0
    count_pos = 0
    #task 1

    path_neg = '/home/rindem/Desktop/bag_of_word_auth/training/negative'
    obj1 = open(path_neg, "r")
    str1 = obj1.read()
    files_neg = str1.split("\n\n")
    # print len(files_neg)
    obj1.close()

    path_pos = '/home/rindem/Desktop/bag_of_word_auth/training/positive'
    obj2 = open(path_pos, "r")
    str2 = obj2.read()
    files_pos = str2.split("\n\n")
    # print len(files_pos)
    obj2.close()
    files.append(array)
    files.extend(files_neg)
    files.extend(files_pos)
    #  print len(files)

    for value in range(len(files)):
        decode = files[value].decode('utf-8')
        tmp = ViTokenizer.tokenize(decode)
        split = tmp.split(" ")
        newA.append(split)

    # newA is a 2-D list holding the tokenized words of each document
    union = set.union(*(set(value) for value in newA))
    for val in range(len(files)):
        wordDict.append(dict.fromkeys(union, 0))

    for num in range(len(newA)):
        for word in newA[num]:
            wordDict[num][word] += 1

    #tf
    for val in range(len(wordDict)):
        tfBow = computeTF(wordDict[val], newA[val])
        tf.append(tfBow)

    #idf
    idfs = computeIDF(wordDict)

    #tfidf
    for val in tf:
        tfidfBow = computeTFIDF(val, idfs)
        tfidf.append(tfidfBow)

    x_neg = dict.fromkeys(tfidf[0].keys(), 0)
    x_pos = dict.fromkeys(tfidf[0].keys(), 0)
    x_test = tfidf[0]
    longNum = len(newA)
    for num in range(1, ((longNum - 1) / 2) + 1):
        for word in newA[num]:
            x_neg[word] += tfidf[num][word]

    for num in range(((longNum - 1) / 2) + 1, longNum):
        for word in newA[num]:
            x_pos[word] += tfidf[num][word]

    for word, val in x_neg.items():
        u_neg.append(x_neg[word])

    for word, val in x_pos.items():
        u_pos.append(x_pos[word])

    for word, val in x_test.items():
        u_test.append(x_test[word])

# print "\n"
# print "Compare test vs neg: ", space(u_test,u_neg)
    tmp = space(u_test, u_neg)
    # print "Compare test vs pos", space(u_test,u_pos)
    # print "\n"
    temp = space(u_test, u_pos)
    if (compare(tmp, temp) == tmp):
        return 1
    else:
        return 2
Example #25
    for row in csvReader:
        count = count + 1

        if (count % 2) == 1:
            utf = unicode(row[0], "utf-8")
            allstr = allstr + utf
            arr.append(utf)
        else:
            st = row[0].strip('\n')
            st = st.strip('\r')
            st = st.strip('\n')
            labels.append(st)

# Create diction
allstr = allstr.replace(",", "").replace(".", "")
allstr = ViTokenizer.tokenize(allstr)
allstr = allstr.lower()
diction = allstr.split()
diction = list(set(diction))

# Write diction to file
write(diction, "diction.file")
diction = read("diction.file")

print(len(diction))

data = []

# Predict
pre = pre.replace(",", "").replace(".", "")
pre = ViTokenizer.tokenize(pre)
Example #26
    def segmentation(self):
        return ViTokenizer.tokenize(self.text)
Example #27
def tokenize(comment):
    text_token = ViTokenizer.tokenize(comment)
    return text_token
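For reference, a small sketch of the output format; as the other examples show, ViTokenizer joins the syllables of compound words with underscores:

print(tokenize(u"Hà Nội Phở Cháo Lòng"))
# multi-syllable words such as "Hà Nội" come back as "Hà_Nội"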
Example #28
from __future__ import unicode_literals
from __future__ import print_function
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from pyvi.pyvi import ViTokenizer, ViPosTagger
import sys

# train raw data
dt1 = u"Hà Nội Phở Cháo Lòng Hà Nội Cháo Trai"
dt2 = u"Hà Nội Bún Chả Phở Ô Mai Lẩu Ếch"
dt3 = u"Phở Bánh Giò Ô Mai"
dt4 = u"Sài Gòn Hủ Tiếu Bánh Bò Phở Bún Nem"
dt5 = u"Hà Nội Hà Nội Bún Chả Hủ Tiếu Nem Gián Cơm Gà Phở"

#VNTokenizer
dt1 = ViTokenizer.tokenize(dt1)
dt2 = ViTokenizer.tokenize(dt2)
dt3 = ViTokenizer.tokenize(dt3)
dt4 = ViTokenizer.tokenize(dt4)
dt5 = ViTokenizer.tokenize(dt5)

print(isinstance(dt1, (str, unicode)))

# Dictionary
arr1 = dt1.split()
arr2 = dt2.split()
arr3 = dt3.split()
arr4 = dt4.split()
arr5 = dt5.split()
arr = arr1 + arr2 + arr3 + arr4
Example #29
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf


f = open('test/dia chi.txt', 'r')
str1 = f.read()
z = str1.decode('utf-8')

q = open('test/ten.txt', 'r')
str2 = q.read()
print type(str2)
t = str2.decode('utf-8')
print len(t)
x = ViTokenizer.tokenize(z)
x2 = ViTokenizer.tokenize(t)
#x = ViPosTagger.postagging(ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))

y = x.split(" ")
y2 = x2.split(" ")
aList = []
bList = []

for index in range(len(y)):
    tmp = y[index]
    aList.append(tmp)

for index in range(len(aList)):
    print aList[index]
Example #30
newA = []
tf = []
path = 'training/negative'

obj2 = open(path, "r")
str1 = obj2.read()
files = str1.split("\n\n")
obj2.close()
for val in files:
    print "//////"
    print val
    print "/////"

for value in range(len(files)):
    decode = files[value].decode('utf-8')
    tmp = ViTokenizer.tokenize(decode)
    split = tmp.split(" ")
    newA.append(split)

# newA is a 2-D list holding the tokenized words of each document
union = set.union(*(set(value) for value in newA))
for val in range(len(files)):
    wordDict.append(dict.fromkeys(union, 0))

for num in range(len(newA)):
    for word in newA[num]:
        wordDict[num][word] += 1

#tf
for val in range(len(wordDict)):
    tfBow = computeTF(wordDict[val], newA[val])
Example #31
def tokenize_text(sentence: str, format=None):
    tokenized_text = ViTokenizer.tokenize(sentence)
    if format == 'list':
        return [re.sub('_', ' ', w) for w in tokenized_text.split()]
    else:
        return tokenized_text
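A brief usage sketch of both return formats; the sample sentence is borrowed from a commented line in Example #29:

s = "Trường đại học Bách Khoa Hà Nội"
print(tokenize_text(s))                 # one string, compound words joined by underscores
print(tokenize_text(s, format='list'))  # list of words, underscores turned back into spaces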