Example #1
def sentenceTokenize(inputSentence):
    # First tokenization pass with the default engine
    tokenized = word_tokenize(inputSentence)
    # Re-tokenize each token with the newmm engine and join with spaces
    newTokenize = []
    for w in tokenized:
        newTokenize += word_tokenize(w, engine='newmm')
    return " ".join(newTokenize)
Example #2
    def time_question_features(self, text):
        """
        Provide an analysis of significant features in the string.
        """
        features = {}

        # A list of all words from the known sentences
        all_words = " ".join(self.positive + self.negative).split()

        # A list of the first word in each of the known sentences
        all_first_words = []
        for sentence in self.positive + self.negative:
            all_first_words.append(sentence.split(' ', 1)[0])

        words = word_tokenize(text)
        for word in words:
            features['first_word({})'.format(word)] = (word in all_first_words)

        for word in words:
            features['contains({})'.format(word)] = (word in all_words)

        for letter in 'abcdefghijklmnopqrstuvwxyzกขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮุูึๆไำะัํี๊ฯโเ้็่๋า.แิื์ใๅ':
            features['count({})'.format(letter)] = text.lower().count(letter)
            features['has({})'.format(letter)] = (letter in text.lower())

        return features
def token_en2words(set_index, txt_test_data, engine='newmm'):
    # Dumps and load json data
    txt_test_data = json.dumps(txt_test_data)
    txt_test_data = json.loads(txt_test_data)
    temp = 0
    save_token = []

    for idx in range(len(txt_test_data)):
        # Tokenize entities
        if idx in set_index:
            e_token = txt_test_data[temp:idx]
            # Check empty text
            if len(e_token) > 0:
                # Word-tokenize this span with the selected engine
                words_tokenize = []
                if engine in ['newmm', 'deepcut', 'attacut']:
                    words_tokenize = word_tokenize(e_token, engine=engine)
                elif engine == 'bpe':
                    pass
                else:
                    raise Exception('Tokenizer mismatch')
                words_tokenize = space_tokenizer(words_tokenize)
                save_token.extend(words_tokenize)
                temp = idx

    words_tokenize = word_tokenize(txt_test_data[temp:], engine=engine)
    words_tokenize = space_tokenizer(words_tokenize)
    save_token.extend(words_tokenize)
    return save_token
Example #4
 def test_word_tokenize_newmm_longtext(self):
     self.assertIsInstance(
         word_tokenize(self.long_text, engine="newmm"), list
     )
     self.assertIsInstance(
         word_tokenize(self.long_text, engine="newmm-safe"), list
     )
def token_en2words(set_index, txt_test_data, engine='newmm'):
    # Dumps and load json data
    txt_test_data = json.dumps(txt_test_data)
    txt_test_data = json.loads(txt_test_data)
    temp = 0
    save_token = []

    for idx in range(len(txt_test_data)):

        # Tokenize entities
        if idx in set_index:
            e_token = txt_test_data[temp:idx]

            # Check empty text
            if len(e_token) > 0:

                # Tokenize each sentence
                if engine in [
                        'newmm', 'longest', 'deepcut', 'icu', 'ulmfit',
                        'attacut'
                ]:
                    words_tokenize = word_tokenize(e_token, engine=engine)
                else:
                    raise Exception('Tokenizer mismatch')
                save_token.extend(words_tokenize)
                temp = idx

    save_token.extend(word_tokenize(txt_test_data[temp:], engine=engine))
    return save_token
Example #6
def tokenize(start_index, end_index, open_tsv='thairath1.tsv', write_tsv='tokenized1.tsv'):
    """
    tokenize headline (line[1]) & article (line[-1])
    """
    # make id list for checking duplicate
    file = open(write_tsv, 'r', encoding='utf-8')
    lines = list(csv.reader(file, delimiter='\t'))
    id_list = [line[0] for line in lines]
    file.close()
    
    open_file = open(open_tsv, 'r', encoding='utf-8')
    write_file = open(write_tsv, 'a', encoding='utf-8')  # append mode
    lines = list(csv.reader(open_file, delimiter='\t'))
    writer = csv.writer(write_file, lineterminator='\n', delimiter='\t')
    
    for line in lines[start_index: end_index]:
        if line[0] not in id_list:

            headline = [line[0], '\t'.join(word_tokenize(line[1]))]
            writer.writerow(headline)
            article = [line[0], '\t'.join(word_tokenize(line[-1]))]
            writer.writerow(article)
    
    open_file.close()
    write_file.close()
Example #7
 def can_process(self, statement):
     tokens = word_tokenize(statement.text)
     # Trigger on any weather-related keyword
     return 'พยากรณ์' in tokens or 'พยากรณ์อากาศ' in tokens or 'อากาศ' in tokens
Example #8
    def test_word_tokenize_mm(self):
        self.assertEqual(multi_cut.segment(None), [])
        self.assertEqual(multi_cut.segment(""), [])
        self.assertEqual(word_tokenize("", engine="mm"), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS"))
 def test_deepcut(self):
     self.assertEqual(deepcut.segment(None), [])
     self.assertEqual(deepcut.segment(""), [])
     self.assertIsNotNone(deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE))
     self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
     self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
     self.assertIsNotNone(
         word_tokenize("ทดสอบ",
                       engine="deepcut",
                       custom_dict=DEFAULT_WORD_DICT_TRIE))
def main():
    try:

        with open('assets/type_1_refactor_naming_elements/original.txt',
                  'r',
                  encoding='utf8') as original:
            original_contents = original.readlines()
            original_contents = [
                ''.join(content.strip().split('|'))
                for content in original_contents
            ]

        with open('assets/type_1_refactor_naming_elements/naming_list.txt',
                  'r',
                  encoding='utf8') as source:
            contents = source.readlines()
            contents = [content.strip() for content in contents]

        tokenized_contents = [
            list(word_tokenize(content)) for content in contents
        ]
        original_tokenized_contents = [
            list(word_tokenize(content)) for content in original_contents
        ]

        with open('assets/type_1_refactor_naming_elements/cosine_values.txt',
                  'w',
                  encoding='utf8') as result:
            for sentence in tokenized_contents:
                for original_sentence in original_tokenized_contents:
                    vector1 = count(sentence)
                    vector2 = count(original_sentence)
                    cosine = get_cosine(vector1, vector2)
                    if cosine >= 0.3:
                        result.write('"' + ''.join(sentence) + '"' +
                                     ' COMPARED TO ' + '"' +
                                     ''.join(original_sentence) + '"' +
                                     ' = ' + str(cosine))
                        result.write('\n')

        with open(
                'assets/type_1_refactor_naming_elements/cosine_values_only_value.txt',
                'w',
                encoding='utf8') as result:
            for sentence in tokenized_contents:
                for original_sentence in original_tokenized_contents:
                    vector1 = count(sentence)
                    vector2 = count(original_sentence)
                    cosine = get_cosine(vector1, vector2)
                    if cosine >= 0.3:
                        result.write(str(cosine))
                        result.write('\n')

    except Exception as e:
        print(e)
Example #11
def json_example():
    req_data = request.get_json()
    message = req_data['message']
    # Strip punctuation characters from the message
    removeSpecialChars = message.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\\|`~-=_+"})

    classifier = initialize()
    tokenize = word_tokenize(removeSpecialChars)
    label = classifier.classify(extract_features(tokenize))

    return jsonify({'sentiment_label': label,
                    'word_tokenize': tokenize})
Example #12
 def test_tag(self):
     self.assertEqual(
         pos_tag(word_tokenize("คุณกำลังประชุม"), engine='old'),
         [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')])
     self.assertEqual(
         pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
         [[('ผม', 'PPRS'), ('กิน', 'VACT'),
           ('ข้าว', 'NCMN')], [('แมว', 'NCMN'), ('วิ่ง', 'VACT')]])
     if sys.version_info >= (3, 4):
         self.assertEqual(
             str(type(pos_tag(word_tokenize("ผมรักคุณ"),
                              engine='artagger'))), "<class 'list'>")
Example #13
    def test_word_tokenize_mm(self):
        self.assertEqual(multi_cut.segment(None), [])
        self.assertEqual(multi_cut.segment(""), [])
        self.assertEqual(word_tokenize("", engine="mm"), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))

        self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS"))
        self.assertEqual(multi_cut.find_all_segment(None), [])
Example #14
 def test_segment_newmm(self):
     self.assertEqual(
         word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย', engine='newmm'),
         [u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
     self.assertEqual(
         word_tokenize('สวัสดีครับ สบายดีไหมครับ', engine='newmm'),
         [u'สวัสดี', u'ครับ', u' ', u'สบายดี', u'ไหม', u'ครับ'])
     self.assertEqual(word_tokenize('จุ๋มง่วงนอนยัง', engine='newmm'),
                      [u'จุ๋ม', u'ง่วงนอน', u'ยัง'])
     self.assertEqual(word_tokenize('จุ๋มง่วง', engine='newmm'),
                      [u'จุ๋ม', u'ง่วง'])
     self.assertEqual(
         word_tokenize('จุ๋ม   ง่วง', engine='newmm', whitespaces=False),
         [u'จุ๋ม', u'ง่วง'])
def initialize():
    #open example positive and negative reviews ---------------------------------
    pos_reviews_file = codecs.open('pos.txt', 'r', "utf-8")
    neg_reviews_file = codecs.open('neg.txt', 'r', "utf-8")
    neu_reviews_file = codecs.open('neu.txt', 'r', "utf-8")

    #store positive reviews into a list -----------------------------------------
    pos_reviews = []
    for each_review in pos_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            pos_reviews.append([each_review, 'pos'])

    #store negative reviews into a list -----------------------------------------
    neg_reviews = []
    for each_review in neg_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            neg_reviews.append([each_review, 'neg'])

    neu_reviews = []
    for each_review in neu_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            neu_reviews.append([each_review, 'neu'])

    #remove words whose length is < 3 and combine both lists --------------------
    all_reviews = []
    for (review, sentiment) in pos_reviews + neg_reviews + neu_reviews:
        reviews_filtered = [
            w.lower() for w in word_tokenize(review) if len(w) >= 3
        ]
        all_reviews.append((reviews_filtered, sentiment))

    #get feature set-------------------------------------------------------------
    global review_features
    review_features = get_word_features(get_words_in_reviews(all_reviews))
    #review_features = remove_punctuation(review_features)

    #get training set -----------------------------------------------------------
    training_set = nltk.classify.apply_features(extract_features, all_reviews)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    return classifier
def process_one_pantip(text_list, min_seq_length=5, max_seq_length=300, sep_func=sent_tokenize):
    word_counts = []
    texts = []
    for text in text_list:
        text = text.strip()
        word_count = len(word_tokenize(text))
        if word_count > max_seq_length:
            sub_text = [process_transformers(i) for i in sep_func(text)]
            sub_word_count = [len(word_tokenize(i)) for i in sub_text]
            texts+=sub_text
            word_counts+=sub_word_count
        else:
            texts.append(process_transformers(text))
            word_counts.append(word_count)
    return pd.DataFrame({"text": texts, "wc": word_counts})
Example #17
def pos_tag_api():
    sent = request.args.get('sent', 0, type=str)
    txt = ""
    for i in sent.split('<br>'):
        txt += " ".join("%s/%s" % tup
                        for tup in pos_tag(word_tokenize(i))) + "<br>"
    return jsonify(result=txt)
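A standalone sketch of the same word/TAG formatting logic without the Flask plumbing; it assumes PyThaiNLP's pos_tag and word_tokenize, and the tags shown are illustrative since they depend on the corpus and engine.

from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

line = "ฉันรักภาษาไทย"
formatted = " ".join("%s/%s" % tup for tup in pos_tag(word_tokenize(line)))
print(formatted)  # e.g. "ฉัน/PPRS รัก/VACT ภาษาไทย/NCMN"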
Example #18
def text2conll2002(text):
    text = text.replace(' ', '<space>')
    text = text.replace("''", '"')
    text = text.replace("’", '"').replace("‘", '"')
    tag = tokenizer.tokenize(text)
    j = 0
    conll2002 = ""
    for tagopen, text, tagclose in tag:
        word_cut = word_tokenize(text)
        i = 0
        while i < len(word_cut):
            if word_cut[i] == "''" or word_cut[i] == '"': pass
            elif i == 0 and tagopen != 'word':
                conll2002 += word_cut[i]
                #conll2002+='\t'+pos_tag2[j][1]
                conll2002 += '\t' + 'B-' + 'NP'  #tagopen
            elif tagopen != 'word':
                conll2002 += word_cut[i]
                #conll2002+='\t'+pos_tag2[j][1]
                conll2002 += '\t' + 'I-' + 'NP'  #tagopen
            else:
                conll2002 += word_cut[i]
                #conll2002+='\t'+pos_tag2[j][1]
                conll2002 += '\t' + 'O'
            conll2002 += '\n'
            #j+=1
            i += 1
    return postag(conll2002)
Example #19
 def test_word_tokenize_icu(self):
     self.assertEqual(tokenize_pyicu.segment(None), [])
     self.assertEqual(tokenize_pyicu.segment(""), [])
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
         ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
     )
Example #20
 def test_word_tokenize_deepcut(self):
     self.assertEqual(tokenize_deepcut.segment(None), [])
     self.assertEqual(tokenize_deepcut.segment(""), [])
     self.assertIsNotNone(
         tokenize_deepcut.segment("ทดสอบ", DEFAULT_DICT_TRIE))
     self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
     self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
Example #21
def getTokensesFromPandas(dataSample, columnName):
    tokenses = []
    for index, row in dataSample.iterrows():
        tokens = word_tokenize(row[columnName])
        tokenses.append(tokens)

    return tokenses
 def test_word_tokenize_attacut(self):
     self.assertEqual(attacut.segment(None), [])
     self.assertEqual(attacut.segment(""), [])
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"),
         ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
     )
Example #23
def romanization(data, engine='royin'):
    """
	:param str data: Thai text to be romanized
	:param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
	:return: English (more or less) text that spells out how the Thai text should read.
	"""
    listword = []
    if engine == 'royin':
        from .royin import romanization
    elif engine == 'pyicu':
        from .pyicu import romanization
    elif engine == 'thai2rom':
        from pythainlp.romanization.thai2rom import thai2rom
        thai = thai2rom()
        return thai.romanization(data)
    else:
        raise Exception("error no have engine.")
    try:
        word_list = word_tokenize(data)
        for word in word_list:
            listword.append(romanization(word))
    except Exception:
        listword = [romanization(data)]

    return ''.join(listword)
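A hedged usage sketch for the wrapper above, assuming it is called from inside the pythainlp.romanization package it belongs to (the relative imports require that); the exact Latin spelling depends on the engine and PyThaiNLP version.

print(romanization("ภาษาไทย", engine='royin'))  # a Latin-script rendering of the input, e.g. something like "phasathai"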
Example #24
def getVector(sentence):
    # print('getVector')
    words = word_tokenize(sentence)
    # words = word_tokenize(sentence, engine='icu')
    # vectors = map(lambda x:model.wv(x), words)

    # print('tokenized')
    # print(words)

    # words= list(map(lambda v: removeNoVocab(v), words))
    # words= [removeNoVocab(w) for w in words]
    # words= removeNoVocab(words)

    # print('removed No Vocabs')
    # print(words)

    vectors = []
    for w in words:
        if w in w2vModel.wv:
            vectors.append(w2vModel.wv[w])

    if len(vectors) == 0:
        return []

    # vectors= list(map(lambda v: removeNoVocab(v), vectors))

    # print("vector=")
    # print(vectors)

    npArray = np.array(vectors)
    avg = np.mean(npArray, axis=0)

    return avg
Example #25
def createBOW(ls_txt, corpus):

    custom_dict = set(thai_words())
    word = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for i in word:
        custom_dict.add(i)
    trie = dict_trie(dict_source=custom_dict)

    BOW_t = [list() for i in range(len(ls_txt))]
    l = 0
    for i in ls_txt:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:

            if j in tmp:

                BOW_t[l].append(tmp.count(j))
                tmp.remove(j)
            else:
                BOW_t[l].append(0)

        # Count of leftover tokens not in the corpus (the "Other" bucket)
        BOW_t[l].append(len(tmp))
        l += 1

    # corpus_t = corpus.append('Other')
    # ch = pd.DataFrame({
    #     'train':corpus,
    #     'target':BOW_t[0]
    # })
    # ch
    # predictiontree = dtree.predict(BOW_t)
    return list(BOW_t)
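A small usage sketch for createBOW with illustrative data; it assumes thai_words, dict_trie and word_tokenize are imported from pythainlp, as the function body requires.

corpus = ['ราเมง', 'อิเกีย']
texts = ['ฉันกินราเมง', 'ไปอิเกียกัน']
bow = createBOW(texts, corpus)
# Each row holds one count per corpus word plus a final count of leftover tokens.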
Example #26
def prepro(txt, wanto):
    cut_cum = word_tokenize(txt)
    ff = list(
        filter(
            lambda x: x not in ("http", "https", ":", " ", '://', 't', '.',
                                'co', 'RT', '\n', '...'), cut_cum))
    return list(filter(lambda x: x in wanto, ff))
Example #27
def receive_message():
    if request.method == 'GET':
        """Before allowing people to message your bot, Facebook has implemented a verify token
        that confirms all requests that your bot receives came from Facebook."""
        token_sent = request.args.get("hub.verify_token")
        return verify_fb_token(token_sent)
    #if the request was not get, it must be POST and we can just proceed with sending a message back to user
    else:
        # get whatever message a user sent the bot
        output = request.get_json()
        for event in output['entry']:
            print(event)
            messaging = event['messaging']  #messaging
            for message in messaging:
                if message.get('message'):
                    #Facebook Messenger ID for user so we know where to send response back to
                    recipient_id = message['sender']['id']
                    if message['message'].get('text'):
                        msg_input = message['message'].get('text')
                        dict_count_thai = isthai(msg_input)
                        response_sent_text = get_message()
                        if dict_count_thai['thai'] > 0:
                            list_tokenized = word_tokenize(msg_input,
                                                           engine='newmm')
                            response_sent_text = ' '.join(list_tokenized)
                        send_message(recipient_id, response_sent_text)
                        #send_message("1834191463278166", response_sent_text)
                    #if user sends us a GIF, photo,video, or any other non-text item
                    if message['message'].get('attachments'):
                        response_sent_nontext = get_message()
                        send_message(recipient_id, response_sent_nontext)
    return "Message Processed"
Example #28
def sentence_vectorizer(text: str, use_mean: bool = True):
    """
    Get sentence vector from text
    If a word is not in the vocabulary, it is skipped.

    :param string text: text input
    :param boolean use_mean: if `True` use mean of all word vectors else use summation

    :return: sentence vector of given input text
    """
    words = word_tokenize(text, engine="ulmfit")
    vec = np.zeros((1, WV_DIM))

    for word in words:
        if word == " ":
            word = "xxspace"
        elif word == "\n":
            word = "xxeol"

        if word in _MODEL.wv.index2word:
            vec += _MODEL.wv.word_vec(word)
        else:
            pass

    if use_mean:
        vec /= len(words)

    return vec
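A hedged usage sketch; it assumes the module-level _MODEL (a word2vec-style model with a .wv attribute) and WV_DIM referenced above are already loaded.

vec = sentence_vectorizer("ฉันรักภาษาไทย", use_mean=True)
print(vec.shape)  # (1, WV_DIM)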
Example #29
 def tokenize(self, text):
     """
     :meth: tokenize text with the selected engine
     :param str text: text to tokenize
     :return: list of tokens
     """
     return word_tokenize(self.sub_br(text), engine=self.engine)
 def test_icu(self):
     self.assertEqual(pyicu.segment(None), [])
     self.assertEqual(pyicu.segment(""), [])
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
         ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
     )
Example #31
def text2conll2002(text, pos=True):
    """
    Convert text to CoNLL-2002 format.
    """
    text = toolner_to_tag(text)
    text = text.replace("''", '"')
    text = text.replace("’", '"').replace("‘", '"')  #.replace('"',"")
    tag = tokenizer.tokenize(text)
    j = 0
    conll2002 = ""
    for tagopen, text, tagclose in tag:
        word_cut = word_tokenize(text, engine=thaicut)  # word tokenizer (newmm)
        i = 0
        txt5 = ""
        while i < len(word_cut):
            if word_cut[i] == "''" or word_cut[i] == '"': pass
            elif i == 0 and tagopen != 'word':
                txt5 += word_cut[i]
                txt5 += '\t' + 'B-' + tagopen
            elif tagopen != 'word':
                txt5 += word_cut[i]
                txt5 += '\t' + 'I-' + tagopen
            else:
                txt5 += word_cut[i]
                txt5 += '\t' + 'O'
            txt5 += '\n'
            #j+=1
            i += 1
        conll2002 += txt5
    if not pos:
        return conll2002
    return postag(conll2002)
Example #32
def split_word(text):
    th_stop = tuple(thai_stopwords())
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text,engine='newmm')
    
    # Remove Thai and English stop words
    tokens = [i for i in tokens if i not in th_stop and i not in en_stop]

    # Find Thai and English stem words
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
    
    # Thai
    tokens_temp=[]
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn)>0) and (len(w_syn[0].lemma_names('tha'))>0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)
    
    tokens = tokens_temp
    
    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]
    
    # Remove tokens that contain spaces
    tokens = [i for i in tokens if ' ' not in i]

    return tokens
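A brief usage sketch for split_word; it assumes thai_stopwords, get_stop_words, PorterStemmer and wordnet are imported as the function requires, and the behavior described is what the code above implements.

tokens = split_word("ฉันชอบกินข้าวผัดอเมริกัน 2 จาน")
# Stop words, numbers and whitespace tokens are dropped; English tokens are stemmed
# and Thai tokens are mapped to a WordNet lemma where one exists.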
Example #33
    def get_ner(self,text,postag=True):
        """
        Get NER from Thai NER.

        :param string text: thai text
        :param boolean postag: include part-of-speech tags (True) or not (False)

        :return: list of named entities.

        **Example**::
            >>> from pythainlp.ner import thainer
            >>> ner=thainer()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'JSBR', 'O'), (' ', 'NCMN', 'O'), ('15', 'NCNM', 'B-DATE'), (' ', 'NCMN', 'I-DATE'), ('ก.ย.', 'CMTR', 'I-DATE'), (' ', 'NCMN', 'I-DATE'), ('61', 'NCNM', 'I-DATE'), (' ', 'NCMN', 'O'), ('ทดสอบ', 'VACT', 'O'), ('ระบบ', 'NCMN', 'O'), ('เวลา', 'NCMN', 'O'), (' ', 'NCMN', 'O'), ('14', 'NCNM', 'B-TIME'), (':', 'PUNC', 'I-TIME'), ('49', 'NCNM', 'I-TIME'), (' ', 'NCMN', 'I-TIME'), ('น.', 'CMTR', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",postag=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.word_cut=word_tokenize(text,engine=thaicut)
        self.list_word=pos_tag(self.word_cut,engine='perceptron')
        self.X_test = self.extract_features([(data,self.list_word[i][1]) for i,data in enumerate(self.word_cut)])
        self.y_=self.crf.predict_single(self.X_test)
        if postag:
            return [(self.word_cut[i],self.list_word[i][1],data) for i,data in enumerate(self.y_)]
        else:
            return [(self.word_cut[i],data) for i,data in enumerate(self.y_)]
Example #34
 def tokenize(self, text):
     """
     :meth: tokenize text with selected engine
     :param str text: text to tokenize
     :return: tokenized text
     """
     return word_tokenize(self.sub_br(text), engine=self.engine)
Example #35
def process_nlp_prediction(request):
    text = request.POST.get("text")
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"…", "", text)
    for c in string.punctuation:
        text = re.sub(r"\{}".format(c), "", text)
    text = " ".join(text.split())
    language = detect(text)
    if language == "th":
        vocabulary = pickle.load(open("././nlp-vocabulary.pkl", "rb"))
        NLP_model = pickle.load(open("././nlp-model.pkl", "rb"))
        featurized_test_sentence = {
            i: (i in word_tokenize(text.lower()))
            for i in vocabulary
        }
        response = {
            "test_sent": text,
            "result": NLP_model.classify(featurized_test_sentence),
        }
    else:
        response = {
            "test_sent":
            text,
            "result":
            "Sorry!! This language is not supported, please send a message in Thai.",
        }
    return JsonResponse(response)
Example #36
 def make_doc(self, text):
     try:
         from pythainlp.tokenize import word_tokenize
     except ImportError:
         raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                           "https://github.com/wannaphongcom/pythainlp/")
     words = [x for x in list(word_tokenize(text,"newmm"))]
     return Doc(self.vocab, words=words, spaces=[False]*len(words))
Example #37
 def test_word_tokenize_longest(self):
     self.assertEqual(longest.segment(None), [])
     self.assertEqual(longest.segment(""), [])
     self.assertIsNotNone(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"))
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
         ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
     )
Example #38
def sentence_vectorizer(ss, dim=300, use_mean=False):
    s = word_tokenize(ss)
    vec = np.zeros((1, dim))
    for word in s:
        if word in get_model().wv.index2word:
            vec += get_model().wv.word_vec(word)
    if use_mean:
        vec /= len(s)
    return vec
Example #39
def document_vector(ss, m, stoi, tok_engine='newmm'):
    s = word_tokenize(ss, engine=tok_engine)  # tokenize with the requested engine
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m[0](t)
    # get average of last lstm layer along bptt
    res = to_np(torch.mean(pred[-1], 0).view(-1))
    return res
Example #40
    def test_pos_tag(self):
        tokens = ["ผม", "รัก", "คุณ"]

        self.assertEqual(pos_tag(None), [])
        self.assertEqual(pos_tag([]), [])

        self.assertEqual(unigram.tag(None, corpus="pud"), [])
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])

        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
        )

        self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud"))
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid"), [])

        self.assertIsNotNone(pos_tag(None, engine="artagger"))
        self.assertIsNotNone(pos_tag([], engine="artagger"))
        self.assertIsNotNone(pos_tag(tokens, engine="artagger"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
        )

        self.assertEqual(pos_tag_sents(None), [])
        self.assertEqual(pos_tag_sents([]), [])
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            [
                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
            ],
        )
Example #41
def get_sentiment(ss,return_score=False):
    s = word_tokenize(ss)
    t = LongTensor([stoi[i] for i in s]).view(-1,1).cpu()
    t = Variable(t,volatile=False)
    m.reset()
    pred,*_ = m(t)
    result = pred.data.cpu().numpy().reshape(-1)
    if return_score:
        return(softmax(result))
    else:
        return(np.argmax(result))
Example #42
 def summarize(self, text, n,tokenize):
     sents = sent_tokenize(text)
     word_sent = [word_tokenize(s,tokenize) for s in sents]
     self._freq = self._compute_frequencies(word_sent)
     ranking = defaultdict(int)
     for i, sent in enumerate(word_sent):
         for w in sent:
             if w in self._freq:
                 ranking[i] += self._freq[w]
     sents_idx = self._rank(ranking,n)
     return [sents[j] for j in sents_idx]
Example #43
 def test_word_tokenize_newmm(self):
     self.assertEqual(newmm.segment(None), [])
     self.assertEqual(newmm.segment(""), [])
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
         ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
     )
     self.assertEqual(
         word_tokenize(
             "สวัสดีครับ สบายดีไหมครับ", engine="newmm", keep_whitespace=True
         ),
         ["สวัสดี", "ครับ", " ", "สบายดี", "ไหม", "ครับ"],
     )
     self.assertEqual(
         word_tokenize("จุ๋มง่วงนอนยัง", engine="newmm"), ["จุ๋ม", "ง่วงนอน", "ยัง"]
     )
     self.assertEqual(word_tokenize("จุ๋มง่วง", engine="newmm"), ["จุ๋ม", "ง่วง"])
     self.assertEqual(
         word_tokenize("จุ๋ม   ง่วง", engine="newmm", keep_whitespace=False),
         ["จุ๋ม", "ง่วง"],
     )
Example #44
    def summarize(self, text: str, n: int, tokenizer: str = "newmm") -> List[str]:
        sents = sent_tokenize(text)
        word_tokenized_sents = [word_tokenize(sent, engine=tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
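For comparison, PyThaiNLP exposes a frequency-based summarizer with a similar interface; a hedged usage sketch follows (function name and defaults as in PyThaiNLP 2.x, so check the installed version).

from pythainlp.summarize import summarize

text = "อากาศวันนี้ดีมาก ฉันเลยออกไปวิ่งที่สวนสาธารณะ ตอนเย็นฝนตกหนักจนต้องรีบกลับบ้าน"
print(summarize(text, n=1))  # a list containing the highest-ranked sentence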
Example #45
def sentiment(text):
	"""
	sentiment ภาษาไทย
	ใช้ข้อมูลจาก https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/
	รับค่าสตริง str คืนค่า pos , neg"""
	with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm:
		vocabulary = dill.load(in_strm)
	with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm:
		classifier = dill.load(in_strm)
	text=set(word_tokenize(text))-set(stopwords.words('thai'))
	featurized_test_sentence =  {i:(i in text) for i in vocabulary}
	return classifier.classify(featurized_test_sentence)
Example #46
def document_vector(ss, m, stoi,tok_engine='newmm'):
    """
    :meth: `document_vector` get document vector using pretrained ULMFit model
    :param str ss: sentence to extract embeddings
    :param m: pyTorch model
    :param dict stoi: string-to-integer dict e.g. {'_unk_':0, '_pad_':1,'first_word':2,'second_word':3,...}
    :param str tok_engine: tokenization engine (recommend using `newmm` if you are using pretrained ULMFit model)
    :return: `numpy.array` of document vector sized 300
    """
    s = word_tokenize(ss, engine=tok_engine)  # tokenize with the requested engine
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m[0](t)
    # get average of last lstm layer along bptt
    res = to_np(torch.mean(pred[-1], 0).view(-1))
    return res
Example #47
    def get_ner(
        self, text: str, pos: bool = True
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        Get named-entities in text

        :param string text: Thai text
        :param boolean pos: include Part-Of-Speech tags (True) or not (False)

        :return: list of strings with name labels (and part-of-speech tags)

        **Example**::
            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'),
            (' ', 'PUNCT', 'O'), ('ทดสอบ', 'VERB', 'O'),
            ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'),
            (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
            self.__tokens, engine="perceptron", corpus="orchid_ud"
        )
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

        if pos:
            return [
                (self.__pos_tags[i][0], self.__pos_tags[i][1], data)
                for i, data in enumerate(self.__y)
            ]

        return [(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)]
Example #48
def sentiment(text, engine='old'):
	"""
	:param str text: thai text
	:param str engine: sentiment analysis engine (old or ulmfit)
	:return: pos or neg

	**Example**::
		>>> from pythainlp.sentiment import sentiment
		>>> text="วันนี้อากาศดีจัง"
		>>> sentiment(text)
		'pos'
		>>> sentiment(text,'ulmfit')
		'pos'
		>>> text="วันนี้อารมณ์เสียมาก"
		>>> sentiment(text)
		'neg'
		>>> sentiment(text,'ulmfit')
		'neg'
	"""
	if engine=='old':
		with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm:
			vocabulary = dill.load(in_strm)
		with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm:
			classifier = dill.load(in_strm)
		text=set(word_tokenize(text))-set(stopwords.words('thai'))
		featurized_test_sentence =  {i:(i in text) for i in vocabulary}
		return classifier.classify(featurized_test_sentence)
	elif engine=='ulmfit':
		from pythainlp.sentiment import ulmfit_sent
		tag=ulmfit_sent.get_sentiment(text)
		sa=""
		if tag==0:
			sa="neg"
		else:
			sa="pos"
		return sa
	else:
		raise Exception("error no have engine.")
Example #49
def romanization(data,engine='royin'):
	"""
	:param str data: Thai text to be romanized
	:param str engine: choose between 'royin', 'pyicu' and 'thai2rom'. 'royin' romanizes according to the standard of the Thai Royal Institute, 'pyicu' romanizes according to the International Phonetic Alphabet, and 'thai2rom' is a deep-learning-based Thai romanizer.
	:return: English (more or less) text that spells out how the Thai text should read.
	"""
	word_list=word_tokenize(data)
	listword=[]
	if engine=='royin':
		from .royin import romanization
	elif engine=='pyicu':
		from .pyicu import romanization
	elif engine=='thai2rom':
		from pythainlp.romanization.thai2rom import thai2rom
		thai=thai2rom()
		return thai.romanization(data)
	else:
		raise Exception("Invalid engine; choose 'royin', 'pyicu' or 'thai2rom'.")
	for word in word_list:
		listword.append(romanization(word))
	return ''.join(listword)
Example #50
	def test_keywords(self):
		self.assertEqual(find_keyword(word_tokenize("แมวกินปลาอร่อยรู้ไหมว่าแมวเป็นแมวรู้ไหมนะแมว",engine='newmm')),{u'แมว': 4})
Example #51
	def test_segment_mm(self):
		self.assertEqual(word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย',engine='mm'),[u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
Example #52
from pythainlp.tokenize import word_tokenize
textTest = "ฉันรักคนไทยที่กินข้าว"
print(word_tokenize("นี่ข้าวใคร"))
Example #53
			dill.dump(marisa_trie.Trie(data),dill_file)
		dill_file.close()
	with open(path,'rb') as dill_file:
		data=dill.load(dill_file)
	dill_file.close()
	return data
def test_segmenter(segmenter, test):
    '''
    Word segmentation test harness: compare the segmenter output with the expected token list.
    '''
    words = test
    result = segmenter
    correct = (result == words)
    if not correct:
        print('expected', words)
        print('got     ', result)
    return correct
if __name__ == "__main__":
    from pythainlp.tokenize import word_tokenize
    text="ฉันเป็นคนและฉันรักภาษาไทยฉันอยู่ประเทศไทยฉันศึกษาอยู่ที่มหาวิทยาลัยพายุฝนกำลังมาต้องหลบแล้วล่ะคุณสบายดีไหม"
    test=["ฉัน","เป็น","คน","และ","ฉัน","รัก","ภาษาไทย","ฉัน","อยู่","ประเทศไทย","ฉัน","ศึกษา","อยู่","ที่","มหาวิทยาลัย","พายุฝน","กำลัง","มา","ต้อง","หลบ","แล้ว","ล่ะ","คุณ","สบายดี","ไหม"]
    print("icu :")
    pyicu=test_segmenter(word_tokenize(text,engine='icu'),test)
    print(pyicu)
    print("newmm :")
    newmm=test_segmenter(word_tokenize(text,engine='newmm'),test)
    print(newmm)
    print("mm :")
    mm=test_segmenter(word_tokenize(text,engine='mm'),test)
    print(mm)
Example #54
	def test_tag(self):
		self.assertEqual(pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),[('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')])
		self.assertEqual(pos_tag_sents([["ผม","กิน","ข้าว"],["แมว","วิ่ง"]]),[[('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')], [('แมว', 'NCMN'), ('วิ่ง', 'VACT')]])
		if sys.version_info >= (3,4):
			self.assertEqual(str(type(pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger'))),"<class 'list'>")
Example #55
 def test_keywords(self):
     word_list = word_tokenize(
         "แมวกินปลาอร่อยรู้ไหมว่าแมวเป็นแมวรู้ไหมนะแมว", engine="newmm"
     )
     self.assertEqual(find_keyword(word_list), {"แมว": 4})
Example #56
	def test_segment_newmm(self):
		self.assertEqual(word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย',engine='newmm'),[u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
		self.assertEqual(word_tokenize('สวัสดีครับ สบายดีไหมครับ',engine='newmm'),[u'สวัสดี', u'ครับ', u' ', u'สบายดี', u'ไหม', u'ครับ'])
		self.assertEqual(word_tokenize('จุ๋มง่วงนอนยัง',engine='newmm'),[u'จุ๋ม', u'ง่วงนอน', u'ยัง'])
		self.assertEqual(word_tokenize('จุ๋มง่วง',engine='newmm'),[u'จุ๋ม', u'ง่วง'])
		self.assertEqual(word_tokenize('จุ๋ม   ง่วง',engine='newmm',whitespaces=False),[u'จุ๋ม', u'ง่วง'])
Example #57
	def test_segment_Wordcut(self):
		if sys.version_info >= (3,4) and sys.platform!="win32" and sys.platform!="win64":
			self.assertEqual(word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย',engine='wordcutpy'),[u'ฉัน', u'รัก', u'ภาษา', u'ไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คน', u'ไทย'])
Example #58
def Text(str1):
	if not isinstance(str1, list):
		str1=word_tokenize(str(str1))
	return nltk.Text(str1)
Example #59
	def test_segment_longest_matching(self):
		self.assertEqual(word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย',engine='longest-matching'),[u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
Example #60
 def tokenize(self,x):
     return [t for t in word_tokenize(self.sub_br(x),engine=self.engine)]