Example 1
    def convert_st_to_bow(self, st):
        bow = [0] * len(self.words)
        tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))

        # Re-tag the lowercased sentence unless the input is a single
        # proper noun (Np) that is not in SKIP_WORDS.
        if not (len(tagger[1]) == 1 and tagger[1][0] == 'Np'
                and tagger[0][0] not in SKIP_WORDS):
            tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.lower()))

        # Swap placeholder tags for their replacement tokens, then store the
        # occurrence count of every known word in the bag-of-words vector.
        for i, j in enumerate(tagger[1]):
            if j in REPLACE:
                tagger[0][i] = REPLACE[j]
            if tagger[0][i] in self.words:
                bow[self.words.index(tagger[0][i])] = tagger[0].count(
                    tagger[0][i])
        return np.array(bow)
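Every snippet on this page indexes into the pair returned by ViPosTagger.postagging: element 0 is the token list and element 1 is the matching POS-tag list. A minimal, self-contained check (the printed tags are illustrative and depend on the pyvi model):

from pyvi import ViTokenizer, ViPosTagger

# postagging returns (tokens, tags): two equal-length lists, so
# tagger[0][i] is the i-th token and tagger[1][i] is its POS tag.
tokens, tags = ViPosTagger.postagging(
    ViTokenizer.tokenize("Trường đại học Bách Khoa Hà Nội"))
for token, tag in zip(tokens, tags):
    print(token, tag)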
Example 2
    def separate_sentence(self):
        # this file contains the list of financial/stock symbols
        symbol_arr = []
        with open('./data/stockslist.txt', encoding='utf-8') as acro_file:
            for line in acro_file:
                fields = line.rstrip('\n').split(',')
                symbol_arr.append(fields[0].lower())
        self.data = self.execute_special_character(self.data)
        sentences = self.data.split('\n')
        new_sentences = []
        for sentence in sentences:
            part_of_sentence = sentence.split('  ')

            for part in part_of_sentence:
                # file.write(part + '\n')
                new_sentences.append(part)
        all_words = []
        for sentence in new_sentences:
            words = ViPosTagger.postagging(ViTokenizer.tokenize(sentence))
            words = self.tokenizer_tunning(words, 1)
            ssi = 'ssi'  # placeholder token for any known stock symbol
            for word in words:
                if not self.is_stop_word(word):
                    if word not in symbol_arr:
                        all_words.append(word)
                    else:
                        all_words.append(ssi)
        return all_words
Example 3
    def res_sentence(self, test_sentence):
        test_sentence = ViTokenizer.tokenize(test_sentence)
        test_sentence, pos = ViPosTagger.postagging(test_sentence)
        new_words, pos = self.process(test_sentence, pos)
        X_test = self.sent2features(new_words, pos)
        new_tags = self.crf.predict_single(X_test)
        st1, st2 = [], []
        for i in range(len(new_words)):
            if new_tags[i] == 'O':
                # Start a new 'O' chunk right after a tagged span (or at the
                # start of the sentence); otherwise glue onto the previous one.
                if i == 0 or new_tags[i - 1] != 'O':
                    st1.append(new_words[i])
                    st2.append('O')
                else:
                    st1[-1] = st1[-1] + '_' + new_words[i]
            elif new_tags[i][0] == 'B':
                # 'B-XXX' opens a new span labelled XXX.
                st1.append(new_words[i])
                st2.append(new_tags[i][2:].upper())
            elif new_tags[i][0] == 'I':
                # 'I-XXX' continues the previous span.
                st1[-1] = st1[-1] + '_' + new_words[i]
        return st1, st2
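res_sentence above decodes the CRF's BIO output: 'B-XXX' opens a new span, 'I-XXX' glues the token onto the previous span with '_', and 'O' tokens form their own chunks. A standalone sketch of the same merging rule, simplified so that every 'O' token stays separate (the tag values here are hypothetical):

def merge_bio(words, tags):
    # Collapse B-/I-/O tag sequences into (chunk, label) pairs.
    chunks, labels = [], []
    for word, tag in zip(words, tags):
        if tag.startswith('I') and chunks:
            chunks[-1] += '_' + word        # continue the open span
        elif tag.startswith('B'):
            chunks.append(word)
            labels.append(tag[2:].upper())  # strip the 'B-' prefix
        else:                               # 'O'
            chunks.append(word)
            labels.append('O')
    return chunks, labels

print(merge_bio(['Nam', 'sống', 'ở', 'Hà_Nội'],
                ['B-PER', 'O', 'O', 'B-LOC']))
# (['Nam', 'sống', 'ở', 'Hà_Nội'], ['PER', 'O', 'O', 'LOC'])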
Example 4
    def sentence_segment(self, text, sw_file='./stopwords', candidate_pos=('Nc', 'Np', 'S', 'R', 'A', 'C', 'V', 'I')):
        """Keep only the words whose POS tag is in candidate_pos."""

        # Load the stopword list
        with open(sw_file, 'r') as f:
            sw = [line.strip() for line in f]

        # Word segmentation
        text = ViTokenizer.tokenize(text)
        text = text.replace('‘', ' ')
        text = text.replace('’', ' ')
        sentences = []
        for t in text.split('.'):
            words, tags = ViPosTagger.postagging(t)
            sentence = []
            for w, tag in zip(words, tags):
                if len(w) > 0 and w not in sw and tag in candidate_pos:
                    sentence.append(w)
            sentences.append(sentence)

        temp = []
        for sentence in sentences:
            if len(sentence) >= self.window_size:
                temp.append(sentence)
        return temp
Example 5
def text_postag(text):
    pos_tag = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    dict_tag = {}
    # Note: a repeated token keeps only the tag of its last occurrence.
    for i in range(len(pos_tag[0])):
        dict_tag[pos_tag[0][i]] = pos_tag[1][i]

    return dict_tag
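Because the dict is keyed by token, a repeated word keeps only its last tag. A quick call, assuming pyvi is installed (the output tags are illustrative):

print(text_postag("tôi yêu Hà Nội và tôi yêu tiếng Việt"))
# e.g. {'tôi': 'P', 'yêu': 'V', 'Hà_Nội': 'Np', 'và': 'C', 'tiếng_Việt': 'N'}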
Example 6
def pos_pyvi():
    with open("test_gan_nhan.txt", "r", encoding="utf-8") as file:
        st = file.readlines()
    # Each line holds "word/TAG" pairs; split them into [word, tag] lists.
    for i in range(len(st)):
        st[i] = [w.split('/') for w in st[i].split()]
    s1 = []
    kq = []

    # Re-tag each sentence with pyvi and collect [word, tag] pairs.
    for i in range(len(st)):
        tmp = ""
        for j in range(len(st[i])):
            tmp = tmp + st[i][j][0] + ' '
        s1.append(tmp)
        s = ViPosTagger.postagging(tmp)
        kq.append([])
        for j in range(len(s[0])):
            kq[i].append([s[0][j], s[1][j]])

    H = 0  # correctly tagged tokens
    T = 0  # total tokens
    for i in range(len(st)):
        # Collapse pyvi's tags to their first letter, keeping "CH" intact
        # and mapping "F" back to "CH" to match the gold annotation.
        for j in range(len(kq[i])):
            if kq[i][j][1] != "CH":
                kq[i][j][1] = kq[i][j][1][0]
            if kq[i][j][1] == "F":
                kq[i][j][1] = "CH"

        for j in range(len(kq[i])):
            T += 1
            if j < len(st[i]) and st[i][j][1] == kq[i][j][1]:
                H += 1
    # "The POS-tagging accuracy of Pyvi is: <ratio>" (message in Vietnamese)
    print("Do chinh xac cua gan nhan tu loai cua Pyvi la: " +
          str(float(H) / float(T)))
Example 7
def load_data(data):
    train_data = []
    for line in data:
        line = line.replace(',', '.')
        line = line.split('.')
        for sentence in line:
            # Hyphen goes last in the class so it is literal; "$-@" formed a
            # character range that also stripped digits.
            sentence = re.sub(r'[():;/%$@!*&^?><_#+-]', ' ', sentence)
            sentence = sentence.replace('"', ' ')
            sentence = sentence.replace("'", " ")
            sentence, pos_tag = ViPosTagger.postagging(sentence)
            # Replace rare-word classes with their POS tag as a placeholder.
            for index, pos in enumerate(pos_tag):
                if pos in ('Np', 'Nc', 'X', 'Ny', 'M'):
                    sentence[index] = pos
            sentence = [
                word.lower().rstrip('\n') for word in sentence if word != ''
            ]
            if len(sentence) > 1:
                train_data.append(sentence)
    return train_data
Example 8
def pyvi_prc(text):
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    result = {}
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('_', ' ')
        result[tokens[i]] = tags[i]
    return result
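Unlike text_postag in Example 5, pyvi_prc restores the spaces inside pyvi's compound tokens, so the dictionary keys look like 'Hà Nội' rather than 'Hà_Nội'. An illustrative call (tags depend on the pyvi model):

print(pyvi_prc("tôi sống ở Hà Nội"))
# e.g. {'tôi': 'P', 'sống': 'V', 'ở': 'E', 'Hà Nội': 'Np'}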
Example 9
def get_POS_feature(text):
    # postagging returns (tokens, tags); only the tag sequence is needed here.
    tag_pos = ViPosTagger.postagging(text)
    return list(tag_pos[1])
Example 10
def add_postag(comment):
    comment = comment.lower()
    words, tags = ViPosTagger.postagging(ViTokenizer.tokenize(comment))
    # Append each token's tag to it, e.g. "yêu" tagged V becomes "yêu_V".
    return " ".join(w + "_" + t for w, t in zip(words, tags))
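A quick call showing the "token_TAG" encoding add_postag produces (tags illustrative, and note the input is lowercased first):

print(add_postag("Tôi yêu Hà Nội"))
# e.g. "tôi_P yêu_V hà_nội_Np"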
Example 11
def pos(query):
    # '|' inside a character class is a literal pipe, not alternation.
    query = re.sub(r'[?$.!<=,\-\'“”|]', r'', query)
    b = ViPosTagger.postagging(ViTokenizer.tokenize(query))
    important = ['N', 'Nc', 'Ny', 'Np', 'Nu', 'A', 'V']
    result = []
    for i in range(len(b[1])):
        if (b[1][i] in important):
            result.append(b[0][i])
    return ' '.join(result)
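pos() is a content-word filter: after stripping punctuation it keeps only tokens tagged as nouns (N, Nc, Ny, Np, Nu), adjectives (A), or verbs (V). An illustrative call (output depends on the pyvi model):

print(pos("Hôm nay trời đẹp quá!"))
# e.g. "Hôm_nay trời đẹp" — function words and punctuation are dropped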
Example 12
def get_Word_based_POS(text):
    tag_pos = ViPosTagger.postagging(text)
    vocab = tag_pos[0]
    list_pos = tag_pos[1]
    result = []
    for index,pos in enumerate(list_pos):
        if "N" in pos or "V" in pos or "A" in pos:
            result.append(vocab[index])
    return result
Example 13
def processing_text(text):
    normalized_text = '. '.join([line.strip() for line in text.split('.')])
    tokenized_text, pos_seqs = ViPosTagger.postagging(
        ViTokenizer.tokenize(normalized_text))
    for i, tag in enumerate(pos_seqs):
        if tag in ['Np', 'Nu', 'M']:
            tokenized_text[i] = tag
    tokens = ' '.join(
        [token for token in tokenized_text if token not in string.punctuation])
    return tokens
Example 14
    def tokenize(string: str):
        tokenized = ViTokenizer.tokenize(string)
        words, _ = ViPosTagger.postagging(tokenized)

        # Undo pyvi's compound-word underscores and drop empty tokens.
        words = [w.replace('_', ' ') for w in words]
        words = [w for w in words if w]

        return words
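tokenize therefore returns plain words with pyvi's compound underscores undone; the POS tags are computed but discarded. Calling the helper directly (assuming pyvi is installed; segmentation is illustrative):

print(tokenize("học sinh học sinh học"))
# e.g. ['học sinh', 'học', 'sinh học'] — compounds come back as spaced words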
Example 15
def posvt():
    m = Frame.m
    txt = m.get()
    words, tags = ViPosTagger.postagging(ViTokenizer.tokenize(txt))
    # Render "word/TAG" pairs; joining the raw (words, tags) tuple would
    # raise a TypeError, since its elements are lists rather than strings.
    seg = " ".join(w + "/" + t for w, t in zip(words, tags))
    print(seg)
    pyperclip.copy(seg)
    root10 = tk.Tk()
    label0 = tk.Label(root10, text=seg, font=16)
    label0.pack(fill="x")
    root10.title('Result(POS-VT)')
    root10.mainloop()
Example 16
def extract_name(text, stopwords):
    tokenized_text = ViTokenizer.tokenize(text)
    tokenized_text = clean_text(tokenized_text, stopwords)
    words, tags = ViPosTagger.postagging(tokenized_text)

    res = []

    for word, tag in zip(words, tags):
        if tag == "Np":  # proper noun
            res.append(word.replace("_", " "))

    return res
Example 17
def load_new_stopwords(text):
    # Tokens carrying these POS tags are added to the stopword list.
    not_labels = ['A', 'L', 'R', 'T', 'E', 'M', 'I']
    stop_words = utils.load_stop_words('stopwords.txt')
    stop_words += list(string.punctuation)
    stop_word_mini = []
    text = text.split()
    tokens, postag = ViPosTagger.postagging_tokens(text)
    for i in range(len(tokens)):
        if postag[i] in not_labels:
            stop_word_mini.append(tokens[i].lower())
    stop_words += stop_word_mini
    stop_words = set(stop_words)
    return stop_words
Example 18
    def _sanitize(self):
        '''
        Trims nonessential words such as 'and', 'or', 'for'
        Parts of Speech types:
        http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        For vietnamese, see: https://pypi.org/project/pyvi/
        '''
        words_list = []
        if (len(self.query_str) == len(self.query_str.encode('utf-8'))):
            tags_to_keep = [
                'NN',
                'NNS',
                'NNP',
                'NNPS',  # noun types
                'VB',
                'VBD',
                'VBG',
                'VBN',
                'VBP',
                'VBZ',  # verb types
                'JJ',
                'JJR',
                'JJS',  # adjective types
                'RB',
                'RBR',
                'RBS',  # adverbs
                'CD',
                'FW'
            ]
            tokens = nltk.word_tokenize(self.query_str)
            tags = nltk.pos_tag(tokens)
            for tag in tags:
                if tag[1] in tags_to_keep:
                    words_list.append(tag[0])
        else:
            # Non-ASCII input, so assume Vietnamese
            tags_to_keep = ['N', 'Ny', 'Np', 'V', 'A', 'R', 'M', 'X']
            tokens = ViTokenizer.tokenize(self.query_str)
            tags = ViPosTagger.postagging(tokens)
            for index in range(len(tags[0])):
                if tags[1][index] in tags_to_keep:
                    words_list.append(tags[0][index].replace('_', ' '))

        new_query_str = ' '.join(words_list)
        if len(new_query_str) == 0:
            new_query_str = self.query_str

        self.deleted_words += len(self.query_str.split()) - len(
            new_query_str.split())
        self.query_str = new_query_str
Example 19
    def create_and_train(self):

        for key, value in self.json_data.items():

            if 'patterns' not in value:
                continue

            for pattern in value['patterns']:
                tagger = ViPosTagger.postagging(ViTokenizer.tokenize(pattern))
                w = []
                for i, j in enumerate(tagger[1]):
                    if j in REPLACE:
                        tagger[0][i] = REPLACE[j]
                    if j not in POS_TAG and tagger[0][i] not in STOP_WORDS:
                        w.append(tagger[0][i])
                self.words.extend(w)

                self.documents.append((w, key))

            self.classes.append(key)

        self.words = sorted(list(set(self.words)))

        training = []

        for doc in self.documents:
            st_out = [0] * len(self.words)

            for w in doc[0]:
                st_out[self.words.index(w)] = doc[0].count(w)

            class_out = [0] * len(self.classes)
            class_out[self.classes.index(doc[1])] = 1

            training.append([st_out, class_out])

        random.shuffle(training)
        # dtype=object is needed because st_out and class_out differ in
        # length, and recent NumPy versions refuse ragged arrays otherwise.
        training = np.array(training, dtype=object)
        self.train_x = list(training[:, 0])
        self.train_y = list(training[:, 1])

        pickle.dump(
            {
                "documents": self.documents,
                "classes": self.classes,
                "words": self.words
            }, open(SAVE_FILE, "wb"))

        self.model = Model()
        self.model.train(self.train_x, self.train_y)
Example 20
def get_topic_word_in_sentence(sentence):
    topic_words, other_words = [], []
    # token = list(ViTokenizer.tokenize(setence).split())
    token = list(
        requests.post(url=url_token, data={
            "text": sentence
        }).text.split())
    tokens = " ".join((t for t in token if len(t) > 1))
    token_pos = ViPosTagger.postagging(tokens)
    for i in range(len(token_pos[0])):
        # print(token_pos[0][i], token_pos[1][i])
        if token_pos[1][i] in noun:
            topic_words.append(token_pos[0][i])
        else:
            other_words.append(token_pos[0][i])
    return topic_words, other_words
Example 21
# Tokens that are dropped outright before character filtering
EXCLUDED_WORDS = {
    'href', 'class', 'hashtag-link', '\n', 'thì', 'là', 'ở', 'đi', 'tao',
    'mày', 'cây', 'đến', 'vừng', 'bán', 'đồ ăn', 'Đồ ăn', 'cơm_chiên', 'vô',
    'cách', 'đây', 'Vị_trí', 'bánh_bao', 'Kem', 'từ', 'ngoài', 'của', 'xe',
    'thứ', 'hôm', 'đó', 'kho', 'quẹt', 'buổi_sáng', 'Xe_đẩy', 'decor', 'i',
    'o', 'đươ', 'c', 'n', 'cu', '_', 'service', 'Menu', 'bad', 'ㅠ', 'bill',
    'Matcha', 'green', 'almond', 'chocolate', 'PERFECT', 'kpop', 'SG',
    'upstair', 'driving', 'in', 'to', 'check', 'say', 'ran'
}


def locKiTuDacBiet(s):  # "filter special characters"
    s1 = ""
    s2 = ""
    if s not in EXCLUDED_WORDS:
        # Strip the characters '!', '/', '.' and "'"
        s1 = "".join(c for c in s if c not in "!/.'")
    # rstrip returns a new string, so the result has to be kept.
    s1 = s1.rstrip("\n")
    s3 = ViPosTagger.postagging(ViTokenizer.tokenize(s1))
    # Keep only tokens whose tag is not N, Np, P, E, T, L or M
    for word, tag in zip(s3[0], s3[1]):
        if tag not in ('N', 'Np', 'P', 'E', 'T', 'L', 'M'):
            s2 += word + " "
    return s2
Example 22
def fetch_data(st, t):
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    for i, j in enumerate(tagger[1]):
        if j == 'Np' and t == 'NAME':
            return tagger[0][i]
        if j == 'M':
            if t == 'NUMBER':
                return tagger[0][i]
            if t == 'DATE':
                str_date = tagger[0][i]
                try:
                    return datetime.datetime.strptime(str_date, "%d/%m/%Y")
                except ValueError:
                    # "You did not enter the date in day/month/year format,
                    # or the date is invalid" (message kept in Vietnamese)
                    return (
                        "Bạn chưa nhập đúng định dạng ngày theo ngày/tháng/năm hoặc ngày nhập không hợp lệ",
                        1)
    return False
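Illustrative calls for fetch_data; whether a token is tagged Np or M depends on the pyvi model, so treat these as sketches rather than guaranteed outputs:

print(fetch_data("tôi tên là Nam", "NAME"))            # 'Nam' if tagged Np
print(fetch_data("tôi sinh ngày 20/11/1995", "DATE"))  # datetime(1995, 11, 20)
                                                       # if '20/11/1995' is tagged M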
Example 23
def get_kqxs(st, session, data):

    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.title()))

    # Default to the northern draw ('Bắc') so p is always bound, even when
    # the last token is not tagged as a proper noun.
    p = 'Bắc'
    if tagger[1][-1] == 'Np':
        p = tagger[0][-1]

    url = 'http://xskt.com.vn/rss-feed/'

    rss = {
        'Bắc': 'mien-bac-xsmb.rss',
        'Nam': 'mien-nam-xsmn.rss',
        'Trung': 'mien-trung-xsmt.rss',
        'Bình_Định': 'binh-dinh-xsbdi.rss',
        'Đắc_Lắk': 'dac-lac-xsdlk.rss',
        'Đà Nẵng': 'da-nang-xsdng.rss',
        'Đắc_Nông': 'dac-nong-xsdno.rss',
        'TP.HCM': 'tp-hcm-xshcm.rss',
        'Sài_Gòn': 'tp-hcm-xshcm.rss',
        'Hcm': 'tp-hcm-xshcm.rss',
        'Quảng_Ngãi': 'quang-ngai-xsqng.rss',
        'Quảng_Nam': 'quang-nam-xsqnm.rss'
    }

    try:
        link = url + rss[p]
    except KeyError:
        p = no_accent_vietnamese(p.lower().replace("_", " "))
        vt = ""
        for i in p.split():
            vt += i[0]
        link = url + p.replace(" ", "-") + ("-xs{0}.rss").format(vt)

    feed = feedparser.parse(link)

    content = ""

    if len(feed['items']) == 0:
        feed = feedparser.parse(url + rss['Bắc'])

    for item in feed['items']:
        content += "<div><strong>" + item['title'] + "</strong></div>"
        content += "<div>" + item['summary'].replace("[", "<br/>[").replace(
            "8:", " 8:") + "</div>"

    return content
Example 24
def get_bmi(st, track, data):
    st = st.lower()

    # Membership test instead of index(): "cm" at position 0 would make
    # st.index("cm") falsy and skip the replacement.
    if "cm" in st:
        st = st.replace("cm", " cm")
    else:
        st = st.replace("m", " m")
    st = st.replace("kg", " kg")
    arr = []

    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    for i, j in enumerate(tagger[1]):
        if j == 'M':
            arr.append(float(tagger[0][i]))
        if j == 'Nu' and tagger[0][i] == 'cm':
            arr[0] = arr[0] / 100

    if (len(arr) < 2):
        return ({"text": "Cho em chỉ số về chiều cao (m) và cân nặng (kg)"}, 1)

    arr = sorted(arr)

    h = arr[0]
    w = arr[1]

    bmi = w / (h * h)

    if (bmi < 18.5):
        tt = "gầy"
    elif (bmi < 24.9):
        tt = "bình thường"
    elif (bmi < 29.9):
        tt = "hơi béo"
    elif (bmi < 34.9):
        tt = "béo phì cấp độ 1"
    elif (bmi < 39.9):
        tt = "béo phì cấp độ 2"
    else:
        tt = "béo phì cấp độ 3"

    return ({
        "text":
        "Chỉ số BMI của bạn là: %.2f.\nTình trạng hiện tại: %s" % (bmi, tt)
    }, 1)
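A worked check of the BMI arithmetic above: 1.70 m and 65 kg give 65 / 1.70² ≈ 22.49, which falls in the "bình thường" (normal) band:

h, w = 1.70, 65.0
print(round(w / (h * h), 2))  # 22.49 -> "bình thường"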
Example 25
    def get_L_sentences(self, s):
        from pyvi import ViTokenizer, ViPosTagger
        LSeg, LPOS = ViPosTagger.postagging(ViTokenizer.tokenize(s))

        LRtn = []
        for i, (segment, pos) in enumerate(zip(LSeg, LPOS), start=1):
            LRtn.append(
                CubeItem(index=i,
                         word=segment.replace('_', ' '),
                         lemma=segment.replace('_', ' '),
                         upos=DPOSToCube[pos],
                         xpos='',
                         attrs='',
                         head='',
                         label='',
                         # enumerate starts at 1, so the final item has
                         # i == len(LSeg), not len(LSeg) - 1
                         space_after='_'
                         if i != len(LSeg) else 'SpaceAfter=No'))
        return [LRtn]
Example 26
    def get_data_for_training(self):
        with open(self.PATH, encoding='utf-8') as json_file:
            training_data = json.load(json_file)
            for pattern in training_data:
                wTV = ViPosTagger.postagging(
                    ViTokenizer.tokenize((pattern['sentence'])))
                self.words.extend(wTV[0])
                self.documents.append((wTV[0], pattern['class']))
                if pattern['class'] not in self.classes:
                    self.classes.append(pattern['class'])

        self.output_empty = [0] * len(self.classes)
        self.words = [
            w.lower() for w in self.words if w not in self.ignore_words
        ]
        self.words = sorted(list(set(self.words)))
        # prepare for predicting
        self.load_w()
Example 27
def Process_V_Sentence(sent):
    # Tokenize the sentence and POS-tag each token
    processed_sent = []
    list_tagged_word = ViPosTagger.postagging(ViTokenizer.tokenize(sent))
    list_word = list_tagged_word[0]  # the tokens
    list_pos = list_tagged_word[1]  # the tag of each token
    for i in range(len(list_word)):
        # Add the word to the sentence
        if list_pos[i] not in ['F', 'P']:
            if list_pos[i] not in ['Ny', 'Np']:
                list_word[i] = list_word[i].lower()
            list_word[i] = list_word[i].replace('_', ' ')
            processed_sent.append(list_word[i])

    # Remove the stop words from the sentence
    processed_sent = Eliminate_V_Stop_Word("v_stopwords.txt", processed_sent)

    return processed_sent
Example 28
def Get_List_V_Sent(paragraph):
    list_sents = []
    words_in_sent = []
    list_tagged_word = ViPosTagger.postagging(ViTokenizer.tokenize(paragraph))
    list_word = list_tagged_word[0]  # the tokens
    list_pos = list_tagged_word[1]  # the tag of each token
    for i in range(len(list_word)):
        # Add the word to the current sentence
        if list_pos[i] != 'F':
            list_word[i] = list_word[i].replace('_', ' ')
            words_in_sent.append(list_word[i])
        # On sentence-ending punctuation, close the current sentence
        if (list_pos[i] == 'F') and (list_word[i] in '.!?'):
            sentence = ' '.join(words_in_sent)
            sentence += list_word[i]  # append the closing punctuation
            list_sents.append(sentence)
            words_in_sent = []

    return list_sents
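A short call showing how Get_List_V_Sent rebuilds sentences from the 'F'-tagged punctuation (assuming pyvi is installed; output illustrative):

print(Get_List_V_Sent("Tôi yêu Hà Nội. Bạn thì sao?"))
# e.g. ['Tôi yêu Hà Nội.', 'Bạn thì sao?']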
Example 29
def create_pos(sentence):
    sentence = preprocess(sentence)
    pos = ViPosTagger.postagging(ViTokenizer.tokenize(sentence))
    result = []
    final = []
    for i in range(len(pos[0])):
        result.append((pos[0][i], pos[1][i]))
    result = postprocess_pos(result)
    i = 0
    while i < len(result):
        if i < len(result) - 2 and pos_tags[result[i][-1]] == pos_tags[result[i+2][-1]] == "numeral" and pos_tags[result[i+1][-1]] == "punctuation":
            temp = (result[i][0] + result[i+1][0] +
                    result[i+2][0], name_to_pos["numeral"])
            i += 3
        else:
            temp = result[i]
            i += 1
        final.append(temp)
    return final
Example 30
    def remove_stopword_sent(self, sent):
        new_tokens = []
        new_pos = []
        sent = self.restore_acronym(sent)
        s = ViPosTagger.postagging(ViTokenizer.tokenize(sent))
        for i in range(len(s[0])):
            if s[0][i] in self.stock_code:
                # Known stock codes are always kept, forced to the Np tag.
                new_tokens.append(s[0][i])
                new_pos.append("Np")
            elif not self.is_stop_word(s[0][i]):
                new_tokens.append(s[0][i])
                new_pos.append(s[1][i])

        return (new_tokens, new_pos)