def read_pretrain_embedding(self):
    """Load the pythainlp (thai2fit) word vectors into a plain dict.

    Returns:
        tuple: ``(embedding, embedding_dim)`` where *embedding* maps each
        vocabulary word to its vector and *embedding_dim* is the length
        of one vector.
    """
    model = word_vector.get_model()
    # Dict comprehension instead of a manual fill loop.
    embedding = {word: model[word] for word in model.index2word}
    # Read one vector's length without materializing a list of every
    # value (the original list(embedding.values())[0] copied them all).
    embedding_dim = next(iter(embedding.values())).shape[0]
    return embedding, embedding_dim
Example n. 2
0
 def test_thai2vec(self):
     """Run the same battery of checks against the functional API and
     the WordVector class wrapper — the two must behave identically."""
     # Build each API lazily so model loading happens in the same order
     # as the original straight-line version.
     for make_api in (lambda: word_vector, lambda: WordVector("thai2fit_wv")):
         api = make_api()
         self.assertGreaterEqual(api.similarity("แบคทีเรีย", "คน"), 0)
         self.assertIsNotNone(api.sentence_vectorizer(""))
         self.assertIsNotNone(api.get_model())
         self.assertIsNotNone(api.sentence_vectorizer("เสรีภาพในการชุมนุม"))
         self.assertIsNotNone(
             api.sentence_vectorizer("เสรีภาพในการรวมตัว\nสมาคม", use_mean=True)
         )
         self.assertIsNotNone(
             api.sentence_vectorizer("I คิด therefore I am ผ็ฎ์")
         )
         self.assertIsNotNone(
             api.most_similar_cosmul(
                 ["สหรัฐอเมริกา", "ประธานาธิบดี"], ["ประเทศไทย"]
             )[0][0]
         )
         self.assertEqual(
             api.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม"
         )
Example n. 3
0
 def predict(self, sentence):
     """Score *sentence* with the loaded model and format the result.

     Args:
         sentence: raw input text to classify.

     Returns:
         str: the model output scaled to a percentage with two decimal
         places, e.g. ``'87.35%'``.
     """
     # NOTE: the original also called word_vector.get_model() into an
     # unused local here, reloading the embedding on every prediction;
     # self.word2vec (set in __init__) is what is actually used.
     output = DataCleaner.process_query(sentence, self.model, self.word2vec,
                                        self.max_len)
     return str(round(output * 100, 2)) + '%'
Example n. 4
0
 def __init__(self, model_path='../model_files/minmaimin_LSTM.h5',
              max_len=100):
     """Load the Keras LSTM model and the thai2fit word embedding.

     Args:
         model_path: path of the saved Keras model file. Defaults to the
             previously hard-coded location, so existing callers are
             unaffected.
         max_len: maximum sequence length used when building model input.
     """
     self.model = load_model(model_path)
     self.max_len = max_len
     # Shared thai2fit embedding consumed by predict().
     self.word2vec = word_vector.get_model()
Example n. 5
0
def spell(text, autocorrect=False, worddict=None):
    """Mark misspelled spans in *text* with <คำผิด>...</คำผิด> tags.

    A CRF model tags each token with B/I/O labels; contiguous B/I runs
    are wrapped in the misspelling markers. With ``autocorrect=True``
    each marked span is replaced by the spell checker's best correction.

    Args:
        text: input Thai text.
        autocorrect: replace marked spans with corrections instead of
            leaving the tags in place.
        worddict: pass ``"thai2fit"`` to rank the checker's dictionary by
            thai2fit vocabulary order; anything else uses the default
            NorvigSpellChecker dictionary.

    Returns:
        str: the tagged (or corrected) text.
    """
    global check, checking
    word_cut = word_tokenize(text)
    # Lazily build the spell checker and cache it in the module-level
    # globals so repeated calls reuse the (expensive) thai2fit setup.
    if worddict == "thai2fit" and checking == "":
        from pythainlp.word_vector import get_model
        words = get_model().index2word
        # Rank each word by its position in the embedding vocabulary.
        w_rank = {w: i for i, w in enumerate(words)}
        check = NorvigSpellChecker(custom_dict=w_rank.items())
        checking = "thai2fit"
    elif checking == "thai2fit" and worddict is not None:
        pass  # thai2fit checker already cached — reuse it
    else:
        checking = ""
        check = NorvigSpellChecker()
    X_test = _extract_features([(i, ) for i in word_cut])
    y_ = crf.predict_single(X_test)
    x = [(word_cut[i], data) for i, data in enumerate(y_)]
    output = ""
    temp = ''
    last = len(x) - 1  # hoisted loop-invariant (was len(x) - 1 per branch)
    for i, b in enumerate(x):
        # temp tracks whether we are inside an open <คำผิด> span ('B').
        if i == last and 'B' in b[1] and temp == 'B':
            output += "</คำผิด><คำผิด>" + b[0] + "</คำผิด>"
            temp = 'B'
        elif i == last and 'B' in b[1]:
            output += "<คำผิด>" + b[0] + "</คำผิด>"
            temp = 'B'
        elif 'B-' in b[1] and temp == 'B':
            output += "</คำผิด><คำผิด>" + b[0]
            temp = 'B'
        elif 'B-' in b[1]:
            output += "<คำผิด>" + b[0]
            temp = 'B'
        elif 'O' in b[1] and temp == 'B':
            output += "</คำผิด>" + b[0]
            temp = 'O'
        elif i == last and 'I' in b[1] and temp == 'B':
            output += b[0] + "</คำผิด>"
            temp = 'O'
        else:
            output += b[0]
    if autocorrect:
        # (A dead, never-used regex string was removed here.)
        # Split the tagged output on the markers so each misspelled span
        # sits between a "<คำผิด>" token and a "</คำผิด>" token.
        output = output.replace("<คำผิด>", "|---|<คำผิด>|---|").replace(
            "</คำผิด>", "|---|</คำผิด>|---|")
        listall = output.split("|---|")
        i = 0
        output = ""
        ii = len(listall)
        while i < ii:
            if listall[i] == "<คำผิด>":
                # Correct the span and skip the closing tag token.
                output += check.correct(listall[i + 1])
                i += 3
            else:
                output += listall[i]
                i += 1
    return output