Code Example #1
def tokenize(filename='thairath1.tsv'):
    """
    tokenize headline (line[1]) & description (line[2])
    save as txt with whitespace
    """
    path = '/Users/Nozomi/files/metonymy/'
    open_name = path + filename
    save_name = open_name.rsplit('.tsv')[0]
    open_file = open(open_name, 'r', encoding='utf-8')

    title_file = open(save_name + '_title.tsv', 'w', encoding='utf-8')
    description_file = open(save_name + '_description.tsv', 'w', encoding='utf-8')

    articles = csv.reader(open_file, delimiter='\t')
    write1 = csv.writer(title_file, lineterminator='\n', delimiter='\t')
    write2 = csv.writer(description_file, lineterminator='\n', delimiter='\t')

    for article in articles:
        if article[2] != '':
            ID = [article[0]]
            title = word_tokenize(article[1])
            description = word_tokenize(article[2])

            write1.writerow(ID + title)
            write2.writerow(ID + description)

    open_file.close()
    title_file.close()
    description_file.close()
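The function above relies on `csv` and PyThaiNLP's `word_tokenize` being imported elsewhere in the file. A minimal sketch of the missing imports and a call, keeping the original default filename (the data path is the author's own):

import csv
from pythainlp.tokenize import word_tokenize

tokenize('thairath1.tsv')  # writes thairath1_title.tsv and thairath1_description.tsv next to the input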
Code Example #2
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1), 'artagger')
    sentence2 = pos_tag(word_tokenize(sentence2), 'artagger')

    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]

    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0

    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        # path_similarity may return None, which would break max(); drop those first
        similarities = [s for s in (synset.path_similarity(ss) for ss in synsets2) if s is not None]
        best_score = max(similarities) if similarities else None

        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1

    # Average the values
    if count != 0:
        score /= count
    return score
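Because only `synsets1` is iterated, the score is directional: comparing sentence1 against sentence2 need not equal the reverse. A common companion, added here as a sketch rather than part of the original snippet, averages the two directions:

def symmetric_sentence_similarity(sentence1, sentence2):
    # average the two one-way scores from sentence_similarity() above
    return (sentence_similarity(sentence1, sentence2)
            + sentence_similarity(sentence2, sentence1)) / 2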
Code Example #3
File: __init__.py  Project: PyThaiNLP/thainlp
 def change_word_tokenize(self, name):
     if self.dictlist == []:
         self.words = WordList(
             pythainlp.word_tokenize(self.text, engine=name))
     else:
         self.words = WordList(pythainlp.word_tokenize(
             self.text, self.dict))
     self.tags = pythainlp.pos_tag(self.words)
Code Example #4
    def compare(self, statement, other_statement):
        """
        Return the calculated similarity of two
        statements based on the Jaccard index.
        """
        from nltk.corpus import wordnet
        from pythainlp.tag import pos_tag
        from pythainlp.tokenize import word_tokenize
        import nltk
        import string

        a = statement.text.lower()
        b = other_statement.text.lower()

        # Get default English stopwords and extend with punctuation
        from pythainlp.corpus import stopwords
        stopwords = stopwords.words('thai') + nltk.corpus.stopwords.words(
            'english')
        stopwords.extend(string.punctuation)
        stopwords.append('')
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

        def get_wordnet_pos(pos_tag):
            if pos_tag[1].startswith('J'):
                return (pos_tag[0], wordnet.ADJ)
            elif pos_tag[1].startswith('V'):
                return (pos_tag[0], wordnet.VERB)
            elif pos_tag[1].startswith('N'):
                return (pos_tag[0], wordnet.NOUN)
            elif pos_tag[1].startswith('R'):
                return (pos_tag[0], wordnet.ADV)
            else:
                return (pos_tag[0], wordnet.NOUN)

        ratio = 0
        pos_a = map(get_wordnet_pos, pos_tag(word_tokenize(a), 'artagger'))
        pos_b = map(get_wordnet_pos, pos_tag(word_tokenize(b), 'artagger'))
        lemma_a = [
            lemmatizer.lemmatize(token.strip(string.punctuation), pos)
            for token, pos in pos_a if pos == wordnet.NOUN
            and token.strip(string.punctuation) not in stopwords
        ]
        lemma_b = [
            lemmatizer.lemmatize(token.strip(string.punctuation), pos)
            for token, pos in pos_b if pos == wordnet.NOUN
            and token.strip(string.punctuation) not in stopwords
        ]

        # Calculate Jaccard similarity
        try:
            numerator = len(set(lemma_a).intersection(lemma_b))
            denominator = float(len(set(lemma_a).union(lemma_b)))
            ratio = numerator / denominator
        except Exception as e:
            print('Error', e)
        return ratio >= self.SIMILARITY_THRESHOLD
Code Example #5
File: __init__.py  Project: PyThaiNLP/thainlp
 def __init__(self, text, dictlist=[]):
     self.text = text
     self.dictlist = dictlist
     if self.dictlist == []:
         self.words = WordList(pythainlp.word_tokenize(self.text))
     else:
         self.dict = pythainlp.tokenize.dict_trie(self.dictlist)
         self.words = WordList(pythainlp.word_tokenize(
             self.text, self.dict))
     self.tags = pythainlp.pos_tag(self.words)
     self.romanize = [romanize_pythainlp(i) for i in self.words]
     self.word_counts = Counter(self.words)
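`dict_trie` turns a user-supplied word list into a trie that `word_tokenize` then uses instead of the default lexicon. A standalone sketch of the same idea (the word list is invented; in PyThaiNLP 2.x the trie is passed as `custom_dict`, which the snippet above does positionally):

from pythainlp import word_tokenize
from pythainlp.tokenize import dict_trie

trie = dict_trie(['ปัญญา', 'ประดิษฐ์', 'ปัญญาประดิษฐ์'])
print(word_tokenize('ปัญญาประดิษฐ์', custom_dict=trie))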
Code Example #6
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the closest synset distance.
        :rtype: float

        .. _wordnet: http://www.nltk.org/howto/wordnet.html
        .. _NLTK: http://www.nltk.org/
        """
        from nltk.corpus import wordnet
        from pythainlp.tokenize import word_tokenize
        from chatterbot import utils
        import itertools

        tokens1 = word_tokenize(statement.text.lower())
        tokens2 = word_tokenize(other_statement.text.lower())

        # Remove all stop words from the list of word tokens
        tokens1 = utils.remove_stopwords(tokens1, language='thai')
        tokens2 = utils.remove_stopwords(tokens2, language='thai')

        # The maximum possible similarity is an exact match
        # Because path_similarity returns a value between 0 and 1,
        # max_possible_similarity is the number of words in the longer
        # of the two input statements.
        max_possible_similarity = max(len(statement.text.split()),
                                      len(other_statement.text.split()))

        max_similarity = 0.0

        # Get the highest matching value for each possible combination of words
        for combination in itertools.product(*[tokens1, tokens2]):

            synset1 = wordnet.synsets(combination[0], lang="tha")
            synset2 = wordnet.synsets(combination[1], lang="tha")

            if synset1 and synset2:

                # Get the highest similarity for each combination of synsets
                for synset in itertools.product(*[synset1, synset2]):
                    similarity = synset[0].path_similarity(synset[1])

                    if similarity and (similarity > max_similarity):
                        max_similarity = similarity

        if max_possible_similarity == 0:
            return 0

        return max_similarity / max_possible_similarity
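The `lang="tha"` synset lookups depend on NLTK's WordNet data plus the Open Multilingual Wordnet; without them every `wordnet.synsets(...)` call returns an empty list and the score stays 0. A one-time setup sketch (the OMW resource id varies by NLTK version):

import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')  # older NLTK releases ship this as 'omw'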
Code Example #7
 def selectLibraryFile(self, TextList):
     if self.LibraryNumber == 'glex':
         Key = {'text': TextList, 'useDict': self.Dictname}
         TextResponse = requests.get('http://127.0.0.1:8080/glex/segment',
                                     Key)
         cutWords = TextResponse.json()['results']
     elif self.LibraryNumber == 'deepcut':
         cutWords = deepcut.tokenize(TextList)
     elif self.LibraryNumber == 'tltk':
         cutWords = tltk.nlp.word_segment(TextList).split('|')
     elif self.LibraryNumber == 'newmm':
         cutWords = word_tokenize(TextList, engine='newmm')
     elif self.LibraryNumber == 'mm':
         cutWords = word_tokenize(TextList, engine='mm')
     return cutWords
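Aside from the local `glex` HTTP service and the external `deepcut`/`tltk` packages, the last two branches only change the PyThaiNLP engine string. A quick side-by-side sketch using the two engines named in the snippet:

from pythainlp.tokenize import word_tokenize

sample = 'ทดสอบการตัดคำภาษาไทย'
for engine in ('newmm', 'mm'):
    print(engine, word_tokenize(sample, engine=engine))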
Code Example #8
def process(lines, core, acc_result):
    print('core {} started'.format(core))
    pattern_count = {}
    for idx, line in enumerate(lines):
        sentence = clean.fixing(line)
        result = pyt.word_tokenize(sentence, engine='newmm')

        crit0 = [
            bool(char_repeat_pattern.search(token))
            and bool(thai_pattern.search(token)) for token in result
        ]
        res0 = [(result[idx - 1:idx + 1]) for idx, log in enumerate(crit0)
                if log]
        if any(crit0):
            for patt in res0:
                if len(patt) == 0 or patt[0] == ' ':
                    continue
                patt = '+'.join(patt)
                if patt in pattern_count:
                    pattern_count[patt] += 1
                else:
                    pattern_count[patt] = 1

    acc_result.append((core, pattern_count))
    print('core {} finished'.format(core))
Code Example #9
File: utils.py  Project: Semooze/dWise
def extract_word_from_message(data) -> List[str]:
    result: List = list()
    for sentence in data.message:
        words = word_tokenize(sentence, engine='newmm', keep_whitespace=False)
        striped_word = [ word.strip() for word in words if word.strip() != '']
        result.extend(striped_word)
    return result
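`keep_whitespace=False` drops the whitespace tokens that newmm otherwise keeps, which is why the `strip()` filter above has little left to remove. An illustrative comparison (outputs are indicative):

from pythainlp.tokenize import word_tokenize

text = 'สวัสดี ชาวโลก'
print(word_tokenize(text))                         # whitespace survives as its own token
print(word_tokenize(text, keep_whitespace=False))  # whitespace tokens removed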
Code Example #10
File: __init__.py  Project: wannaphongcom/pythainlp
 def tokenizer(self, text: str) -> List[str]:
     """
     :meth: tokenize text with a frozen newmm engine
     :param str text: text to tokenize
     :return: tokenized text
     """
     return word_tokenize(text, engine="ulmfit")
Code Example #11
def split_word(text):
    tokens = word_tokenize(text, engine='newmm')

    # # Remove Thai and English stop words
    # tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find word roots (stems) for Thai and English
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove whitespace
    tokens = [i for i in tokens if not ' ' in i]

    # tokens_list = [split_word(txt) for txt in text_list]

    return tokens
Code Example #12
    def _train_on_text(self, text):
        n = 3
        self.grams = {}
        gram_buffer = []
        text = ' '.join(word_tokenize(text))
        for letter in text:
            # letter = letter.lower()

            if letter not in ['\n', ' ']:
                gram_buffer.append(letter)
            else:
                gram_buffer.append(MarkovWordGenerator.stopword)

            if len(gram_buffer) >= n:
                as_tuple = tuple(gram_buffer)
                if as_tuple not in self.grams:
                    self.grams[as_tuple] = 0

                self.grams[as_tuple] += 1

                gram_buffer = gram_buffer[1:]

            if letter in ['\n', ' ', '*']:
                gram_buffer.clear()
                gram_buffer.append(MarkovWordGenerator.startword)
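Note that although the text is word-tokenized first, the n-grams collected here are character trigrams: tokenization only inserts spaces between words, and each space (or newline) is then mapped to the generator's stop/start markers.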
Code Example #13
def crawler():
    count = 0
    for status in tweepy.Cursor(api.user_timeline,
                                screen_name=name,
                                tweet_mode="extended").items():
        if count > 4:
            break
        messages = status.full_text.splitlines()
        proc = ''
        matching = []
        noti = ''
        time = ''
        for i in messages:
            proc = word_tokenize(i, engine='newmm')
            matching = [
                s for s in proc if ('ขัดข้อง' in s) or ('ขออภัย' in s) or (
                    'ขณะนี้' in s) or ('ความไม่สะดวก' in s) or ('ตามปกติ' in s)
            ]

            print(matching)
            if len(matching) != 0:
                noti = status.full_text
                time = status.created_at
                # break
        if noti != '':
            break

        # f.write(f'index:{count} message:{status.full_text}')
        count += 1
    print(noti)
    print(time)
    return noti, time
Code Example #14
File: word_count.py  Project: mix2zeta/social-d
async def get_count_by_list(data: list,
                            return_value: str = 'wordcount') -> dict:
    wordcount = {}
    hashtag = {}
    mention = {}
    for row in data:
        word_list = word_tokenize(row['message'], keep_whitespace=False)

        for word in get_word_or_hashtag(word_list):
            if word[0] == '@' and return_value == 'mention':
                if word in mention.keys():
                    mention[word] += 1
                else:
                    mention[word] = 1
            elif word[0] == '#' and return_value == 'hashtag':
                if word in hashtag.keys():
                    hashtag[word] += 1
                else:
                    hashtag[word] = 1
            else:
                if word in wordcount.keys():
                    wordcount[word] += 1
                else:
                    wordcount[word] = 1

    if return_value == 'mention':
        return mention
    if return_value == 'hashtag':
        return hashtag
    return wordcount
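Each of the three tallies is a hand-rolled dictionary counter; `collections.Counter` expresses the same bookkeeping more compactly. A sketch for the plain-word case only (the hashtag/mention filtering via `get_word_or_hashtag` is left out):

from collections import Counter
from pythainlp import word_tokenize

wordcount = Counter()
for row in [{'message': 'ตัวอย่าง ข้อความ'}]:  # stand-in for `data`
    wordcount.update(word_tokenize(row['message'], keep_whitespace=False))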
Code Example #15
def predict_ensemble(test_df):
    """
    Predict input ``test_df`` with columns ``review`` and ``rating``
    """
    instances = [
        wongnai_predictor._dataset_reader.text_to_instance(
            word_tokenize(review)) for review in list(test_df.review)
    ]
    model_paths = glob('output_*/model.tar.gz')
    all_predicted_labels = []
    for model_path in model_paths:
        archive = load_archive(model_path)  # load trained model
        wongnai_predictor = Predictor.from_archive(archive,
                                                   'wongnai_predictor')
        predicted_labels = [
            int(
                wongnai_predictor.predict_instance(instance)
                ['predicted_label']) for instance in instances
        ]
        all_predicted_labels.append(predicted_labels)
    all_predicted_labels = np.array(all_predicted_labels)
    predicted_labels_vote = mode(np.array(all_predicted_labels).T,
                                 axis=-1).mode.ravel()
    test_df['rating'] = predicted_labels_vote
    return test_df.drop('review', axis=1)
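Note that `wongnai_predictor` is referenced when building `instances` before it is assigned inside the model loop, so this only works if a predictor with a `_dataset_reader` already exists at module scope; treat that name as an external dependency when adapting the snippet.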
Code Example #16
 def classify(self, title_text: str) -> str:
     feat_dic = {word: 1
                 for word in word_tokenize(title_text)
                 }  # make feature dictionary of one title
     result = self.model.predict(self.DV.transform(
         [feat_dic]))  # predict with model
     return result[0]  # result = array(['คุณภาพชีวิต'], dtype='<U12')
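`self.DV` is evidently a fitted vectorizer that maps the one-hot word dictionary onto the model's input matrix. A standalone sketch of that pairing with scikit-learn's `DictVectorizer` (the training titles are invented):

from sklearn.feature_extraction import DictVectorizer
from pythainlp import word_tokenize

titles = ['นายกฯ แถลงข่าว', 'ทีมชาติไทยชนะ']
DV = DictVectorizer()
X = DV.fit_transform([{word: 1 for word in word_tokenize(t)} for t in titles])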
Code Example #17
    def classify(self, title_text: str) -> str:
        """
        tokenize title_text, get vectors of all words and append to the list
        if there are only UNK, returns zero vector

        title_text = 'ไปกินอาหาร'
        > tokenized_title = ['ไป', 'กิน', 'อาหาร']
        > vecs = [[11,2,5...], [3,-1,4...], [6,2,9...]]
        > np.mean(vecs, axis=0) = [2,1,5,...]
        """
        tokenized_title = word_tokenize(title_text)
        vecs = []  # make the list of each word vector
        for word in tokenized_title:
            if word in self.wv.vocab:  # append vector iff the word is in vocab
                vecs.append(self.wv[word])
        if vecs == []:  # if there is no vector in list, return [0,0,...0]
            mean = np.zeros((300))
        else:
            mean = np.mean(np.array(vecs),
                           axis=0)  # calculate mean along column
        result = self.model.predict_proba(np.array(
            [mean]))  # predict by model, result is 12 dim vector

        return self.i_to_l[np.argmax(
            result
        )]  # get the index of the largest element and convert it to a label string
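The docstring's mean-of-word-vectors step, isolated from the classifier, looks roughly like this (a sketch; `wv` stands for a gensim `KeyedVectors`, and `wv.vocab` is the pre-gensim-4.0 vocabulary attribute used above):

import numpy as np
from pythainlp import word_tokenize

def sentence_mean_vector(text, wv, dim=300):
    vecs = [wv[w] for w in word_tokenize(text) if w in wv.vocab]
    return np.mean(np.array(vecs), axis=0) if vecs else np.zeros(dim)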
Code Example #18
File: __init__.py  Project: xemoe/pythainlp
 def tokenizer(text: str) -> List[str]:
     """
     :meth: tokenize text with a frozen newmm engine
     :param str text: text to tokenize
     :return: tokenized text
     """
     return word_tokenize(text, engine="ulmfit")
Code Example #19
def find_place(text):
    proc = word_tokenize(text.lower(), keep_whitespace=False)
    listToStr = ' '.join(map(str, proc))
    keyplace = keyword_processor2.extract_keywords(listToStr)
    if (keyplace == []):
        keyplace.append("-")
    return keyplace[0]
Code Example #20
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     comment = json_dict['comment']
     comment = comment.strip().replace('-', ' ')
     comment = ' '.join(comment.split())
     tokenized_comment = word_tokenize(comment)
     instance = self._dataset_reader.text_to_instance(tokenized_comment=tokenized_comment)
     return instance
Code Example #21
def split_word(text):
    # Tokenize with the dict in the corpus I edited; it only segments the food menu items I added to words.th.txt
    tokens = word_tokenize(text, engine='dict')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find word roots (stems) for Thai and English
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove whitespace
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
Code Example #22
    def tokenize(self, corpus, tokenizer=None):
        tokenized_corpus = []
        for sentence in corpus:
            sentence = self.text_preprocess(sentence)
            text = word_tokenize(sentence, keep_whitespace=False)
            tokenized_corpus.append(list(text))

        return tokenized_corpus
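The `tokenizer` parameter is accepted but never used; the method always calls PyThaiNLP's `word_tokenize`, so passing a custom tokenizer here currently has no effect.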
Code Example #23
def predict(storyId, sentence):
    tokenizedSentence = word_tokenize(sentence)
    taggedToken = posTagger(sentence)
    tagger = pycrfsuite.Tagger()
    tagger.open("{}/{}.model".format(app.config["MODELS_DIR"], storyId))
    predictedLabels = tagger.tag(sentToFeatures(taggedToken))
    extractedEntities = extractEntities(zip(tokenizedSentence,
                                            predictedLabels))
    return extractedEntities
Code Example #24
def sentence_vectorizer(ss, dim=300, use_mean=True):
    s = word_tokenize(ss)
    vec = np.zeros((1, dim))
    for word in s:
        if word in model.wv.index2word:
            vec += model.wv.word_vec(word)
    if use_mean:
        vec /= len(s)
    return vec
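If `ss` tokenizes to an empty list, `vec /= len(s)` is a division by zero (NumPy reports a runtime warning and returns NaNs rather than raising); guarding the mean step with `if use_mean and s:` is a sensible addition when reusing this helper.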
Code Example #25
def make_word_tokenize(datas):

    list_data = []

    for data in datas:
        word_tokenized = word_tokenize(data, engine='deepcut')
        list_data.append(word_tokenized)

    return list_data
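The `deepcut` engine delegates to the separate deepcut package (a neural, CNN-based segmenter), so it has to be installed alongside PyThaiNLP and is considerably slower than `newmm` on large batches.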
Code Example #26
    def stemmingSingle(self, text):
        result = word_tokenize(text)
        temp = result.copy()
        for i in result:
            if (i == 'จังหวัด' or i == 'ที่'):
                temp.remove(i)

        return temp[0]
Code Example #27
File: app.py  Project: 5730097921-JW/line-bot-nlp
def get_intention(sentence):
    data = word_tokenize(sentence)
    data = to_index(data)
    data = data[:49] + [0]*(49 - len(data))
    data = np.array([data])
    intention = intent.predict(data)
    # print(intention)
    intention = intention[0].argmax()
    # print(intention)
    return intention
Code Example #28
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)

        if not self.case_sensitive:
            text = text.lower()
        words = word_tokenize(text)

        if not words:
            words = [text]

        return self._convert_words_to_tokens(words, text)
Code Example #29
File: bot.py  Project: Vanilla39/my-line-bot-v01
def reply(intent, text, reply_token, id, disname, req):

    if intent == 'รุ่นที่1_Akimoto Manatsu':
        cut = []
        cut = word_tokenize(text, keep_whitespace=False)
        print(cut)
        print(len(cut))

        members_docs = db.collection(u'DataMembers').document(
            u'tYybUVgsllwd5U2QliNE')
        docs = members_docs.get().to_dict()
        print(docs)
        bday = members_docs.get(field_paths={'Bday'}).to_dict().get('Bday')
        blood = members_docs.get(
            field_paths={'BloodType'}).to_dict().get('BloodType')
        gen = members_docs.get(field_paths={'Gen'}).to_dict().get('Gen')
        height = members_docs.get(
            field_paths={'Height'}).to_dict().get('Height')
        zodiac = members_docs.get(
            field_paths={'Zodiac'}).to_dict().get('Zodiac')
        print(bday, blood, gen, height, zodiac)

        for i in cut:
            if (i == "เกิด" or i == "วันเกิด"):
                text_message = TextSendMessage(
                    text='อะคิโมโตะ มานัตสึ รุ่นที่1 เกิดวันที่{}'.format(
                        bday))
            elif (i == "เลือด" or i == "หมู่โลหิต"):
                text_message = TextSendMessage(
                    text='อะคิโมโตะ มานัตสึ รุ่นที่1 กรุ๊ปเลือด{}'.format(
                        blood))
            elif (i == "สูง" or i == "ส่วนสูง"):
                text_message = TextSendMessage(
                    text='อะคิโมโตะ มานัตสึ รุ่นที่1 ส่วนสูง{}'.format(height))
            elif (i == "ราศี"):
                text_message = TextSendMessage(
                    text='อะคิโมโตะ มานัตสึ รุ่นที่1 ราศี{}'.format(zodiac))
        #text_message = TextSendMessage(text='Akimoto Manatsu \nเกิดวันที่ {} ราศี {} กรุ๊ปเลือด {} '.format(bday, zodiac, blood))
        line_bot_api.reply_message(reply_token, text_message)

    if intent == 'A_Test':
        # Note: Use of CollectionRef stream() is prefered to get()
        #song_ref = db.collection(u'DataSongs').where(u'Sname', u'==', u'Guru Guru Curtain').stream()
        doc_ref = db.collection(u'DataSongs').document(u'00Test')
        doc = doc_ref.get().to_dict()
        print(doc)
        songName = doc['Sname']
        albumName = doc['Aname']
        linkSong = doc['SLink']
        text_message = TextSendMessage(
            text=
            'ฟังเพลง Guru Guru Curtain จากอัลบั้ม{}\n คลิกเพื่อฟังเพลงที่Spotify{} '
            .format(albumName, linkSong))
        line_bot_api.reply_message(reply_token, text_message)
Code Example #30
File: main_mini.py  Project: yosiyoshi/IntelligenTXT
 def dsegth():
     m = Frame.m
     txt = m.get()
     seg = ptn.word_tokenize(txt, engine='deepcut')
     print(seg)
     pyperclip.copy(" ".join(seg))
     root12 = tk.Tk()
     root12.title('Result(DeepCutTH)')
     label12 = tk.Label(root12, text=seg, font=16)
     label12.pack(fill="x")
     root12.mainloop()
Code Example #31
def handle_message(event):
    text = event.message.text
    proc = word_tokenize(text, engine='newmm')
    matching = [s for s in proc if ('กิน' in s) or (
        'อาหาร' in s) or ('อะไร' in s)]
    if len(matching) != 0:
        i = randint(0, len(food_list)-1)
        line_bot_api.reply_message(
            event.reply_token, TextSendMessage(text=food_list[i]))
    else:
        line_bot_api.reply_message(event.reply_token, TextSendMessage(
            text='ต้องการสุ่มอาหารหรือเปล่า หากต้องการสุ่ม พิมพ์ กินอะไรดี'))
Code Example #32
File: royin.py  Project: wannaphongcom/pythainlp
def romanize(text: str) -> str:
    """
    Rendering Thai words in the Latin alphabet or "romanization",
    using the Royal Thai General System of Transcription (RTGS),
    which is the official system published by the Royal Institute of Thailand.
    Transcribes Thai into the Latin alphabet.
    :param str text: Thai text to be romanized
    :return: A string of Thai words rendered in the Latin alphabet.
    """
    words = word_tokenize(text)
    romanized_words = [_romanize(word) for word in words]

    return "".join(romanized_words)
Code Example #33
File: text.py  Project: codelucas/newspaper
 def candidate_words(self, stripped_input):
     import pythainlp
     tokens = pythainlp.word_tokenize(stripped_input)
     return tokens