Code example #1
def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
    if not text or not isinstance(text, str):
        return []

    if custom_dict:
        if isinstance(custom_dict, Trie):
            custom_dict = list(custom_dict)

        return tokenize(text, custom_dict)

    return tokenize(text)
Code example #2
File: deepcut.py  Project: wannaphongcom/pythainlp
def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
    if not text or not isinstance(text, str):
        return []

    if custom_dict:
        if isinstance(custom_dict, Trie):
            custom_dict = list(custom_dict)

        return deepcut.tokenize(text, custom_dict)

    return deepcut.tokenize(text)
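The two wrappers above accept custom_dict as a Trie, a list of words, or a path to a dictionary file, and fall back to deepcut's default dictionary when it is omitted. A minimal usage sketch, assuming deepcut is installed and the segment function above is in scope (the sample sentence and dictionary entries are arbitrary, and custom_dict.txt is a hypothetical file):

# usage sketch for the segment wrapper above
segment("ตัดคำภาษาไทยด้วย deepcut")                         # default dictionary
segment("ตัดคำภาษาไทยด้วย deepcut", ["ตัดคำ", "ภาษาไทย"])      # custom word list
segment("ตัดคำภาษาไทยด้วย deepcut", "custom_dict.txt")        # custom dictionary file (hypothetical path)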
Code example #3
def testZ(value):
    print(type(value))
    if value != '':
        list_word = deepcut.tokenize(value)
        posList_word = pos_tag(list_word, corpus='orchid_ud')
        return posList_word
    else:
        value = 'empty'
        list_word = deepcut.tokenize(value)
        posList_word = pos_tag(list_word, corpus='orchid_ud')
        return posList_word
Code example #4
File: all.py  Project: totaeza31/MyProject
def DeepcutandTLTK():

    valuesDeepcutandTLTK = []
    text = "ทดสอบตัวตัดคำ ssนะจ้ะdsdsd/*-"
    # cut word
    cleans1 = str(text)
    cleans = cleans1.translate(
        {ord(c): ""
         for c in "\"'!@#$ %^&*,[](){};:./<>?|`~-=_+\\"})
    list_word = deepcut.tokenize(cleans)

    strlist_word = str(list_word)
    replaces = strlist_word.replace("[", "") \
                           .replace("'", "") \
                           .replace("]", "") \
                           .replace(" ", "")

    pos = tltk.nlp.pos_tag(replaces)

    # POS: replace again because the segmented words are reused in tltk (tltk would also segment unsegmented [plain] text, so tokens are separated with ',' to preserve deepcut's segmentation)
    strpos = str(pos)
    cleanPOS = strpos.replace("(',', 'PUNCT'), ", "") \
                     .replace("[[","[") \
                     .replace("]]","]")
    valuesDeepcutandTLTK.append(cleanPOS)
    return valuesDeepcutandTLTK
Code example #5
def address_to_token(address: dict):
    """
    Transform address dictionary to a list of tokens

    Input
    -----
    >>> address = {
        "text": ...,
        "labels": [[start1, stop1, label1], [start2, stop2, label2]]
    }

    Output
    ------
    >>> [(token1, label1), (token2, label2), ...]
    """
    if address["labels"] != []:
        tokens = []
        s = 0
        for token in deepcut.tokenize(address["text"]):
            start = s
            stop = s + len(token)

            label = "O"
            for lab_start, lab_stop, lab in address["labels"]:
                if range_intersect(range(start, stop), range(lab_start, lab_stop)):
                    label = lab
            tokens.append((token, label))
            s = stop
        return tokens
    else:
        return None
Code example #6
File: test.py  Project: kawisornk/guessongnova
def listen():
    cnt = Counter()
    karaoke_dict = load_obj("karaoke_dict_new")

    # Record Audio
    r = sr.Recognizer()
    with sr.Microphone() as source:
        # print("ร้องเพลงสิิ!")
        audio = r.listen(source, phrase_time_limit=15)

    # Speech recognition using Google Speech Recognition
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        text = r.recognize_google(audio, language='th-TH')
        # print("You said: " + text)
        # print("Guessing the song....")
        tokens = deepcut.tokenize(text)
        for j in range(len(tokens) - 6):
            words = "".join(tokens[j:j + 6])
            if words in karaoke_dict:
                cnt[karaoke_dict[words]] += 1
        return text + str(cnt)
    except sr.UnknownValueError:
        return "คุณเป็นนักร้องเสียงเพี้ยนนนนนนนน"
    except sr.RequestError as e:
        return "พังจ้าาา; {0}".format(e)
Code example #7
def preprocess():
    global news_path, output_path, tmp_path
    read_news_fromfile(news_path)

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    inp = tmp_path
    outp = output_path
    space = ' '
    i = 0
    inputfile = open(inp, 'r')
    output = open(outp, 'w')

    for line in inputfile.readlines():
        text = deepcut.tokenize(line)
        list1 = space.join(text)
        output.write(list1)
        i += 1
        if (i % 100 == 0) or (i <= 10):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Code example #8
    def handle_oov(self, embeddings, X, words):
        oov_vecs_created = 0
        info_created_words = {}
        info_oov_words = {}

        # creating a set of OOV words
        oov_words = set()

        for query in X:
            for query_word in query:
                if query_word not in words:
                    oov_words.add(query_word)

        # iterating through OOV words to get AVG vectors for them
        for ds_word in oov_words:
            tokens = deepcut.tokenize(ds_word)
            in_voc_tokens = [token for token in tokens if token in embeddings]

            ## if we found word-parts in the emb - use their vectors (avg) to represent the OOV word
            if in_voc_tokens:
                token_vecs = [embeddings.get(t) for t in in_voc_tokens]
                embeddings[ds_word] = np.mean(token_vecs, axis=0)
                oov_vecs_created += 1
                info_created_words[ds_word] = in_voc_tokens
            else:
                info_oov_words[ds_word] = tokens

        logger.debug('All OOV words after deepcut:')
        logger.debug(info_oov_words)
        logger.debug('All "created"/replaced words by deepcut:')
        logger.debug(info_created_words)
Code example #9
def remove_stopword(text):
    """
    remove stopword
    :return:
    """
    words = {
        'จะ', 'เเล้ว', 'ได้', 'อัน', 'ว่า', 'ที่', 'จึง', 'จาก', 'เป็น', 'ไป',
        'หรือ', 'นั้น', 'อาจ', 'ซึ่ง', 'ก็', 'มา', 'กับ', 'ไว้', 'ทั้งๆที่',
        'น่า', 'ก่อน', 'ทำ', 'โดย', 'นีั', 'ไร', 'ของ', 'ขอ', 'ว่า', 'เเค่',
        'กัน', 'ก็', 'เพื่อ', 'ละ', 'คือ', 'เเละ', 'ด้วย', 'จาก', 'จึง', 'ใน',
        'ๆ', 'ของ', 'ครั้ง', 'เมื่อ', 'ต่อ', 'นี้', '!', 'ทั้ง', 'มักจะ',
        'ของ', 'เนื่องจาก', 'กับ', 'ดังนี้', 'เข้า'
    }

    stop_words = set(words)
    word_tokens = deepcut.tokenize(text, custom_dict="custom_dict.txt")
    filter_sence = [w for w in word_tokens if w not in stop_words]

    word = ''.join(filter_sence)
    return word
Code example #10
def grammar(s, indent):
    for i in range(len(firstpriority)):
        if firstpriority[i] in s:
            s = s.replace(firstpriority[i], ' ' + firstpriority[i] + ' ')
    x = deepcut.tokenize(s)
    newarrayfordeepcut = []
    for i in x:
        if i == ' ' or i == '':
            pass
        else:
            newarrayfordeepcut.append(i)
    ans = []
    stringans = ''
    check = False
    for i in newarrayfordeepcut:
        if check:
            stringans += i
            continue
        if i != 'ข้อความ':
            check = False
            ans.append(i)
        else:
            check = True
            ans.append(i)
    ans.append(stringans)
    ans = mergecarefulword(ans)
    before_detect = []
    before_df = ans
    for i in ans:
        before_detect.append(dialogflow_api.detect_intent_texts(i))
    ans = detection.tran(before_detect, indent)
    return (ans[0], ans[1], before_df, before_detect)
Code example #11
def word_segment_identify_tag(text):
    """
    function skip tag this function when they found <tag> function will skip <tag>
    :param text:
    :return:
    """
    pattern = r"(<[^<]วันที่>[^<]*</[^<]วันที่>)"
    matches = regex.finditer(pattern, text, regex.MULTILINE)
    match_i = []
    for matchNum, match in enumerate(matches, start=1):
        match_i.append(match.start())  # เก็บตำแหน่ง tag ตัวแรกที่เจอ
        match_i.append(match.end())  # เก็บตำแหน่ง tag ตัวสุดท้ายที่เจอ

    # print(match_i) #ดูข้อมูลเริ่มต้น สุดท้ายของ tag
    str_s = ''  # สร้างตัวแปรมาเก็บ string ที่ไม่มี tag
    index_match = 0  # สร้างตัวแปรเพื่อเช็ค index match_i
    str_tag = ''  # สร้างตัวแปรมาเก็บ string ที่มี tag
    print(len(match_i))  # check index tag
    for i in range(len(text)):
        if index_match <= len(match_i):
            if index_match % 2 == 0:  # if index_match % 2 = 0
                if index_match < len(
                        match_i):  # if index_match < length for match_i
                    if match_i[
                            index_match] == i:  # if match_i[index_match] = i
                        index_match = index_match + 1  # ให้ทำการเพิ่มค่า index_match
                        str_tag += text[
                            i]  # ใส่ข้อมูลแรกของ tag ลงไปใน str_tag
                        str_s += ' ='  # ใส่ช่องว่างให้ตัวสุดท้ายก่อนที่จะเจอ tag
                    else:
                        str_s += text[
                            i]  # else ให้เอาค่า str ตำแหน่ง i ไปใส่ใน str_s
                else:
                    str_s += text[i]  # เก็บ str แถวสุดท้ายหลังจาก tag
            elif index_match % 2 != 0:
                if match_i[index_match] == i:  # if match_i[index_match] = i
                    index_match += index_match  # ให้ทำการเพิ่มค่า index_match
                    str_tag += '\n'  # ใส่ช่องว่างให้ตัวสุดท้ายให้หลัง tag
                else:
                    str_tag += text[
                        i]  # else ให้เอาค่า str ตำแหน่ง i ไปใส่ใน str_tag

    tag_split = (str_tag.split('\n'))

    word_cut = deepcut.tokenize(
        str_s, custom_dict='dictionary/custom_dict/custom_dict.txt')

    ind = 0
    for i in range(len(word_cut)):
        if word_cut[i] == '=':
            word_cut[i] = tag_split[ind]
            ind += ind

    word = ''

    for i in range(len(word_cut)):
        if word_cut[i] != '=':
            word = ('|'.join(word_cut))

    return word
Code example #12
File: kum_puan.py  Project: WisTiCeJEnT/Hannaeae
def puan_kum(word):
    full_word = ''
    middle = ''
    list_of_word = deepcut.tokenize(word,
                                    custom_dict=[
                                        'สวี', 'สวัส', 'ดี', 'อะ', 'ไร', 'ทำ',
                                        'เรอ', 'เบลอ', 'ละ', 'ฟัน', 'นะ'
                                    ])
    first_word = list_of_word[0]
    # print(first_word)
    last_word = list_of_word[-1]
    # print(last_word)

    f_spliter_word1, f_spliter_word2 = check_spliter(first_word)
    l_spliter_word1, l_spliter_word2 = check_spliter(last_word)
    if (f_spliter_word1
            == l_spliter_word1) and (f_spliter_word2
                                     == l_spliter_word2) and (f_spliter_word1
                                                              is not None):
        list_of_word = [f_spliter_word1, f_spliter_word2]
        first_word = f_spliter_word1
        last_word = l_spliter_word2
    else:
        if f_spliter_word1 is not None:
            del list_of_word[0]
            list_of_word = [f_spliter_word1, f_spliter_word2] + list_of_word
            first_word = f_spliter_word1

        if l_spliter_word1 is not None:
            del list_of_word[-1]
            list_of_word = list_of_word + [l_spliter_word1, l_spliter_word2]
            last_word = l_spliter_word2

    if len(list_of_word) == 1:
        return word

    first_alpha, f_start, f_end = find_alpha(first_word)
    # print(find_alpha(first_word))
    last_alpha, l_start, l_end = find_alpha(last_word)
    # print(find_alpha(last_word))

    new_first_word_list = list(last_word)
    # print(new_first_word_list)
    new_last_word_list = list(first_word)
    # print(new_last_word_list)

    if l_end - l_start == 2:
        del new_first_word_list[l_end - 1]
    new_first_word_list[l_start] = first_alpha

    if f_end - f_start == 2:
        del new_last_word_list[f_end - 1]
    new_last_word_list[f_start] = last_alpha

    for i in range(1, len(list_of_word) - 1):
        middle = middle + list_of_word[i]

    full_word = ''.join(new_first_word_list) + middle
    full_word = full_word + ''.join(new_last_word_list)
    return full_word
Code example #13
File: chatbot.py  Project: JMRbot/jamaree
def Chatbot(input_data):
    checkword = []
    ##    print('user_w',input_data)
    input_data = deepcut.tokenize(
        input_data)  ## segment the incoming input with deepcut    ## e.g. ['สวัสดี']
    if input_data[0] != "สวัสดี":
        for check in input_data:
            if check in word_to_int_input:
                #print('check',check)
                checkword.append(check)
##        print('w',checkword)
        checkword = [word_to_int_input[word] for word in checkword]
        checkword = np.array([checkword])
        checkword = sequence.pad_sequences(checkword,
                                           maxlen=40,
                                           padding='post')
        checkword = one_hot_encode(checkword, encoded_length)
        checkword = array(checkword)
        ## changed from: target = predict_sequence(infenc, infdec, checkword, 24, encoded_length)
        target = predict_sequence(infenc, infdec, checkword, 9, encoded_length)
        #print('target ',target)
        #print('bot',invert(target))
        words = ""
        for word in invert(target):
            words = words + word
        input_data = ""
        return words
    else:
        words = "สวัสดี"
        return words
    print("bot_m = ", words)
Code example #14
def word_to_vec():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    documents = [
        'ฉันรักภาษาไทยเพราะฉันเป็นคนไทยและฉันเป็นคนไทย',
        'ฉันเป็นนักเรียนที่ชื่นชอบวิทยาศาสตร์และเทคโนโลยี',
        'ฉันไม่ใช่โปรแกรมเมอร์เพราะฉันทำมากกว่าคิดเขียนพัฒนาโปรแกรมทดสอบโปรแกรม',
        'ฉันชื่นชอบวิทยาศาสตร์ชอบค้นคว้าตั้งสมมุติฐานและหาคำตอบ'
    ]
    texts = [list(deepcut.tokenize(i)) for i in documents]
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    dictionary.save(
        '/Users/jirayutk./Project/SeniorProject/word2vec/tmp/deerwester.dict'
    )  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(
        '/Users/jirayutk./Project/SeniorProject/word2vec/tmp/deerwester.mm',
        corpus)  # store to disk, for later use
    print(texts)
    pprint(corpus)
Code example #15
File: prepare_data.py  Project: kanpanluk/NLPcontest
def prepare_data():
    import deepcut
    import json

    input = open('input.txt', 'r', encoding='utf-8')
    ans = open('ans.txt', 'r', encoding='utf-8')
    input_token = []
    for i in input:
        i = i.split('::')[1]
        i = i.replace('\n', '')
        input_token.append([deepcut.tokenize(i)])

    n = 0
    for i in ans:
        i = i.split('::')[1]
        i = i.replace('\n', '')
        if i == 'H':
            i = 0
        elif i == 'P':
            i = 1
        elif i == 'M':
            i = 2

        input_token[n].insert(0, i)
        print(input_token[n])
        n += 1

    with open('data.json', 'w', encoding='utf-8') as file:
        json.dump(input_token, file, ensure_ascii=False)
Code example #16
File: preprocessing.py  Project: narusornproject/News
def createBag(dataFrame):
    bagWord = []
    for x in dataFrame['Header']:
        new_str = CleanText(x)
        """ Cut words """
        bagWord.append(CutStopWord(deepcut.tokenize(new_str)))
    return bagWord
Code example #17
def Opencsv(opens):

    with open(f'./FileCSV1/{opens}_clean_translated.csv',
              encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        values = []
        print("Procesing in pos_tag > ", opens)

        for row in reader:

            list_word = deepcut.tokenize(row[1])
            test = str(list_word)
            clean = test.replace("'", "")
            clean2 = clean.replace(" ", "")
            clean3 = clean2.replace(",,", ",")

            i = tltk.nlp.pos_tag(clean3)

            test2 = str(i)

            clean3 = test2.replace("(',', 'PUNCT'),", "")
            clean4 = clean3.replace(", ('<s/>', 'PUNCT')", "")
            clean5 = clean4.replace("('[', 'SYM'),", "")
            clean6 = clean5.replace(", (']', 'SYM')", "")
            clean7 = clean6.replace("  ", "")
            clean8 = clean7.replace(", (',.]', 'ADV'),", "")
            values.append(clean8)

        return values
Code example #18
 def countSentence(self, sentence, tagging):
     if self.language_name == 'en' or tagging:
         for word in sentence.split(' '):
             self.countWords(word)
     if self.language_name == 'th' and not tagging:
         for word in deepcut.tokenize(sentence):
             self.countWords(word)
Code example #19
File: client.py  Project: Steap/SIXEcho
 def __init__(self, api_key=None, host_url=None, max_workers=1):
     """
     Initialize sixecho.
     Attributes:
         api_key(string)       - Optional : API key generated from sixecho
         host_url(string)      - Optional : the sixecho domain
     """
     self.api_key = api_key
     deepcut.tokenize("Welcome")  # Load library
     if host_url is not None:
         if host_url.endswith("/"):
             host_url = host_url[:-1]
         self.host_url = host_url
     self.array_words = []
     self.min_hash = MinHash(num_perm=128)
     self.max_workers = max_workers
     self.sha256 = ""
Code example #20
def token(txt, txtnew):
    with open(txt, "r", encoding='utf-8') as f:
        with open(txtnew, "a", encoding='utf-8') as f1:
            for line in f:
                s = deepcut.tokenize(line)
                m = ' '.join(s)
                f1.write(m)  # write each tokenized line to the output file
Code example #21
def word_segment(text):
    """
    word segmentation
    :param text:
    :return: text
    """
    text = deepcut.tokenize(text, custom_dict="custom_dict/custom_dict.txt")
    text = "|".join(text)
    return text
Code example #22
def deepcuts(datas):
    list_word = []

    for word in datas:
        word_cut = deepcut.tokenize(word)
        list_word.append(word_cut)

    numpy_join = np.concatenate(list_word)
    return list(numpy_join)
Code example #23
File: main.py  Project: adadesions/MeetNotify
def upgraded_filter(message, counters):
    sample = deepcut.tokenize(message)
    for word in sample:
        for test_word in counters.keys():
            if word == test_word:
                counters[test_word] += 1
    result = any(filter(lambda x: x >= 2, counters.values()))

    return result
Code example #24
 def get_deepcut_segmented(self):
     """
     This function returns a clean string that is the output of applying deepcut to the unsegmented version of the line.
     """
     deepcut_out = deepcut.tokenize(self.unsegmented)
     out_line = "|"
     for word in deepcut_out:
         out_line += word + "|"
     return out_line
Code example #25
def main(model="original"):
    dest = "tokenised-with-%s-model.txt" % (model)
    with open(BEST_PATH, "r") as fr, \
         open(dest, "w") as fw:
        lines = fr.readlines()
        for l in tqdm(lines):
            tokens = deepcut.tokenize(l.strip())
            fw.write("%s\n" % "|".join(tokens))
    print("Result is saved to %s" % dest)
Code example #26
File: preprocessing.py  Project: narusornproject/News
def Query(str, dictionary, tf_idf):
    query_doc = [w for w in CutStopWord(
        deepcut.tokenize(CleanText(str)))]
    print(query_doc)
    query_doc_bow = dictionary.doc2bow(query_doc)
    print(query_doc_bow)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    print(query_doc_tf_idf)
    return query_doc_tf_idf
Code example #27
def prepare_for_predict(input_questions):
    q_input = []
    cleansing(input_questions)
    tokenized_input_1 = deepcut.tokenize(input_questions)
    q_input = word_index(tokenized_input_1)
    q_input = pad_sequences(q_input, maxlen=max_seq_length)
    return q_input
Code example #28
 def tokenize(self, text_list):
     """
     Tokenize Thai lyrics using deepcut
     """
     import deepcut
     words = []
     for lyric in tqdm(text_list):
         words.extend(deepcut.tokenize(lyric))
     return words
Code example #29
 def first():
     # i is the first q
     b = usinputcur()
     kamkorn = usinputoutcur()
     yol = deepcut.tokenize(kamkorn)
     i = yol[0]
     r = [x for x in range(10)]
     for k in range(1, 10):
         if b == i + " เป็นคำประเภท " + str(r[k]):
             # store the value r[k] for i in the pocha table
             bogprapet(i, r[k])
             return "ขอบคุณที่ให้ข้อมูลว่า " + i + " เป็นคำประเภท " + str(r[k])
     return kwam()
Code example #30
def tokenize_thai(text):
    tokens = deepcut.tokenize(text)
    # return ' '.join(pieces)
    content_buff = ""
    for word in tokens:
        # print(word)
        # word1=re.sub(r'([0-9]+)[ ]([.|,])[ ]([0-9]+)', r'\1\2\3', word)
        content_buff = content_buff + " " + word

    content_buff = ' '.join(content_buff.split())
    return content_buff.strip()
Code example #31
async def tokenize(websocket, path):
    try:
        while True:
            text = await websocket.recv()

            # TODO: Use queue to control maximum concurrency?
            toks = deepcut.tokenize(text)

            await websocket.send(json.dumps(toks))
    except websockets.exceptions.ConnectionClosed as e:
        print("CLOSE")
Code example #32
File: deepcut.py  Project: zkan/pythainlp
def segment(text):
    return deepcut.tokenize(text)
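This last wrapper simply forwards the text to deepcut. A quick call sketch, assuming deepcut is installed and this segment function is in scope (the sample sentence is arbitrary and the token boundaries shown are only illustrative):

# usage sketch for the thin segment wrapper above
tokens = segment("ทดสอบตัวตัดคำ")  # returns a list of Thai word tokens, e.g. ['ทดสอบ', 'ตัว', 'ตัด', 'คำ']
print(tokens)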