def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    if custom_dict:
        if isinstance(custom_dict, Trie):
            custom_dict = list(custom_dict)
        return tokenize(text, custom_dict)
    return tokenize(text)
def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    if custom_dict:
        if isinstance(custom_dict, Trie):
            custom_dict = list(custom_dict)
        return deepcut.tokenize(text, custom_dict)
    return deepcut.tokenize(text)
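
# A minimal usage sketch for the segment() wrappers above (illustrative only;
# it assumes deepcut is installed and the sample words are arbitrary):
def example_segment_usage():
    print(segment("ทดสอบตัดคำ"))              # default deepcut tokenization
    print(segment("ทดสอบตัดคำ", ["ตัดคำ"]))    # custom dictionary passed as a list of words
    print(segment(""))                         # empty / non-string input returns []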
def testZ(value):
    print(type(value))
    if value != '':
        list_word = deepcut.tokenize(value)
        posList_word = pos_tag(list_word, corpus='orchid_ud')
        return posList_word
    else:
        value = 'empty'
        list_word = deepcut.tokenize(value)
        posList_word = pos_tag(list_word, corpus='orchid_ud')
        return posList_word
def DeepcutandTLTK():
    valuesDeepcutandTLTK = []
    text = "ทดสอบตัวตัดคำ ssนะจ้ะdsdsd/*-"
    # cut words: strip punctuation and symbols first
    cleans1 = str(text)
    cleans = cleans1.translate(
        {ord(c): "" for c in "\"'!@#$ %^&*,[](){};:./<>?|`~-=_+\\"})
    list_word = deepcut.tokenize(cleans)
    strlist_word = str(list_word)
    replaces = strlist_word.replace("[", "") \
        .replace("'", "") \
        .replace("]", "") \
        .replace(" ", "")
    pos = tltk.nlp.pos_tag(replaces)
    # POS: clean up again because the deepcut tokens are reused in tltk
    # (tltk would otherwise re-segment the raw text itself, so the tokens are
    # joined with "," to preserve deepcut's segmentation)
    strpos = str(pos)
    cleanPOS = strpos.replace("(',', 'PUNCT'), ", "") \
        .replace("[[", "[") \
        .replace("]]", "]")
    valuesDeepcutandTLTK.append(cleanPOS)
    return valuesDeepcutandTLTK
def address_to_token(address: dict):
    """
    Transform an address dictionary into a list of (token, label) pairs.

    Input
    -----
    >>> address = {
            "text": ...,
            "labels": [[start1, stop1, label1], [start2, stop2, label2]]
        }

    Output
    ------
    >>> [(token1, label1), (token2, label2), ...]
    """
    if address["labels"] != []:
        tokens = []
        s = 0
        for token in deepcut.tokenize(address["text"]):
            start = s
            stop = s + len(token)
            label = "O"
            # assign the label of any annotation span that overlaps this token
            for lab_start, lab_stop, c in address["labels"]:
                if range_intersect(range(start, stop), range(lab_start, lab_stop)):
                    label = c
            tokens.append((token, label))
            s = stop
        return tokens
    else:
        return None
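
# A hypothetical call to address_to_token() above (illustrative; the text and
# label offsets are made up, and range_intersect is assumed to be defined
# elsewhere in the same module):
def example_address_to_token():
    address = {
        "text": "นายสมชาย ใจดี กรุงเทพมหานคร",
        "labels": [[0, 13, "NAME"], [14, 27, "PROVINCE"]],
    }
    for token, label in address_to_token(address):
        print(token, label)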
def listen():
    cnt = Counter()
    karaoke_dict = load_obj("karaoke_dict_new")

    # Record audio
    r = sr.Recognizer()
    with sr.Microphone() as source:
        # print("ร้องเพลงสิิ!")  # "Sing a song!"
        audio = r.listen(source, phrase_time_limit=15)

    # Speech recognition using Google Speech Recognition
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        text = r.recognize_google(audio, language='th-TH')
        # print("You said: " + text)
        # print("Guessing the song....")
        tokens = deepcut.tokenize(text)
        # slide a 6-token window over the transcript and count karaoke matches
        for j in range(len(tokens) - 6):
            words = "".join(tokens[j:j + 6])
            if words in karaoke_dict:
                cnt[karaoke_dict[words]] += 1
        return text + str(cnt)
    except sr.UnknownValueError:
        return "คุณเป็นนักร้องเสียงเพี้ยนนนนนนนน"  # "You sing out of tune"
    except sr.RequestError as e:
        return "พังจ้าาา; {0}".format(e)  # "It broke; {0}"
def preprocess():
    global news_path, output_path, tmp_path
    read_news_fromfile(news_path)
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    inp = tmp_path
    outp = output_path
    space = ' '
    i = 0
    inputfile = open(inp, 'r')
    output = open(outp, 'w')
    for line in inputfile.readlines():
        text = deepcut.tokenize(line)
        list1 = space.join(text)
        output.write(list1)
        i += 1
        if (i % 100 == 0) or (i <= 10):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
def handle_oov(self, embeddings, X, words):
    oov_vecs_created = 0
    info_created_words = {}
    info_oov_words = {}

    # collect the set of OOV words
    oov_words = set()
    for query in X:
        for query_word in query:
            if query_word not in words:
                oov_words.add(query_word)

    # iterate through the OOV words and build average vectors for them
    for ds_word in oov_words:
        tokens = deepcut.tokenize(ds_word)
        in_voc_tokens = [token for token in tokens if token in embeddings]
        # if word-parts are found in the embeddings, use the average of their
        # vectors to represent the OOV word
        if in_voc_tokens:
            token_vecs = [embeddings.get(t) for t in in_voc_tokens]
            embeddings[ds_word] = np.mean(token_vecs, axis=0)
            oov_vecs_created += 1
            info_created_words[ds_word] = in_voc_tokens
        else:
            info_oov_words[ds_word] = tokens

    logger.debug('All OOV words after deepcut:')
    logger.debug(info_oov_words)
    logger.debug('All "created"/replaced words by deepcut:')
    logger.debug(info_created_words)
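
# Stand-alone sketch of the OOV handling idea used in handle_oov() above: an
# out-of-vocabulary word is split with deepcut and represented by the mean of
# the vectors of its in-vocabulary parts. `embeddings` is assumed to be a
# dict mapping word -> numpy vector; the helper name is hypothetical.
def average_subword_vector(oov_word, embeddings):
    import numpy as np
    import deepcut
    parts = [p for p in deepcut.tokenize(oov_word) if p in embeddings]
    if not parts:
        return None  # no usable sub-tokens; the caller keeps the word as OOV
    return np.mean([embeddings[p] for p in parts], axis=0)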
def remove_stopword(text):
    """
    Remove stopwords from the text.
    :return: the text with stopwords removed
    """
    words = {
        'จะ', 'เเล้ว', 'ได้', 'อัน', 'ว่า', 'ที่', 'จึง', 'จาก', 'เป็น', 'ไป', 'หรือ',
        'นั้น', 'อาจ', 'ซึ่ง', 'ก็', 'มา', 'กับ', 'ไว้', 'ทั้งๆที่', 'น่า', 'ก่อน', 'ทำ',
        'โดย', 'นีั', 'ไร', 'ของ', 'ขอ', 'เเค่', 'กัน', 'เพื่อ', 'ละ', 'คือ', 'เเละ',
        'ด้วย', 'ใน', 'ๆ', 'ครั้ง', 'เมื่อ', 'ต่อ', 'นี้', '!', 'ทั้ง', 'มักจะ',
        'เนื่องจาก', 'ดังนี้', 'เข้า'
    }
    stop_words = set(words)
    word_tokens = deepcut.tokenize(text, custom_dict="custom_dict.txt")
    # keep only the tokens that are not stopwords
    filter_sence = [w for w in word_tokens if w not in stop_words]
    word = ''.join(filter_sence)
    return word
def grammar(s, indent):
    # pad priority keywords with spaces so they survive tokenization
    for i in range(len(firstpriority)):
        if firstpriority[i] in s:
            s = s.replace(firstpriority[i], ' ' + firstpriority[i] + ' ')
    x = deepcut.tokenize(s)
    newarrayfordeepcut = []
    for i in x:
        if i == ' ' or i == '':
            pass
        else:
            newarrayfordeepcut.append(i)
    ans = []
    stringans = ''
    check = False
    # everything after the token 'ข้อความ' ("message") is collected verbatim
    for i in newarrayfordeepcut:
        if check:
            stringans += i
            continue
        if i != 'ข้อความ':
            check = False
            ans.append(i)
        else:
            check = True
            ans.append(i)
    ans.append(stringans)
    ans = mergecarefulword(ans)
    before_detect = []
    before_df = ans
    for i in ans:
        before_detect.append(dialogflow_api.detect_intent_texts(i))
    ans = detection.tran(before_detect, indent)
    return (ans[0], ans[1], before_df, before_detect)
def word_segment_identify_tag(text):
    """
    Word segmentation that skips over <tag> spans: when a <tag> is found,
    its content is left untouched and only the text outside tags is segmented.
    :param text:
    :return:
    """
    pattern = r"(<[^<]วันที่>[^<]*</[^<]วันที่>)"
    matches = regex.finditer(pattern, text, regex.MULTILINE)
    match_i = []
    for matchNum, match in enumerate(matches, start=1):
        match_i.append(match.start())  # position where the tag starts
        match_i.append(match.end())    # position where the tag ends
    # print(match_i)  # inspect the start/end positions of the tags

    str_s = ''       # string without tags
    index_match = 0  # index into match_i
    str_tag = ''     # string inside tags
    print(len(match_i))  # check the number of tag boundaries

    for i in range(len(text)):
        if index_match <= len(match_i):
            if index_match % 2 == 0:
                if index_match < len(match_i):
                    if match_i[index_match] == i:
                        index_match = index_match + 1  # advance to the next boundary
                        str_tag += text[i]             # first character of the tag
                        str_s += ' ='                  # placeholder where the tag was
                    else:
                        str_s += text[i]               # plain text outside tags
                else:
                    str_s += text[i]                   # text after the last tag
            elif index_match % 2 != 0:
                if match_i[index_match] == i:
                    index_match += 1                   # advance to the next boundary
                    str_tag += '\n'                    # newline separates tag contents
                else:
                    str_tag += text[i]                 # character inside the tag

    tag_split = str_tag.split('\n')
    word_cut = deepcut.tokenize(
        str_s, custom_dict='dictionary/custom_dict/custom_dict.txt')
    # put the tag contents back in place of the '=' placeholders
    ind = 0
    for i in range(len(word_cut)):
        if word_cut[i] == '=':
            word_cut[i] = tag_split[ind]
            ind += 1
    word = ''
    for i in range(len(word_cut)):
        if word_cut[i] != '=':
            word = '|'.join(word_cut)
    return word
def puan_kum(word):
    full_word = ''
    middle = ''
    list_of_word = deepcut.tokenize(word, custom_dict=[
        'สวี', 'สวัส', 'ดี', 'อะ', 'ไร', 'ทำ', 'เรอ', 'เบลอ', 'ละ', 'ฟัน', 'นะ'
    ])
    first_word = list_of_word[0]
    # print(first_word)
    last_word = list_of_word[-1]
    # print(last_word)
    f_spliter_word1, f_spliter_word2 = check_spliter(first_word)
    l_spliter_word1, l_spliter_word2 = check_spliter(last_word)
    if (f_spliter_word1 == l_spliter_word1) and (f_spliter_word2 == l_spliter_word2) and (f_spliter_word1 is not None):
        list_of_word = [f_spliter_word1, f_spliter_word2]
        first_word = f_spliter_word1
        last_word = l_spliter_word2
    else:
        if f_spliter_word1 is not None:
            del list_of_word[0]
            list_of_word = [f_spliter_word1, f_spliter_word2] + list_of_word
            first_word = f_spliter_word1
        if l_spliter_word1 is not None:
            del list_of_word[-1]
            list_of_word = list_of_word + [l_spliter_word1, l_spliter_word2]
            last_word = l_spliter_word2
    if len(list_of_word) == 1:
        return word
    first_alpha, f_start, f_end = find_alpha(first_word)
    # print(find_alpha(first_word))
    last_alpha, l_start, l_end = find_alpha(last_word)
    # print(find_alpha(last_word))
    new_first_word_list = list(last_word)
    # print(new_first_word_list)
    new_last_word_list = list(first_word)
    # print(new_last_word_list)
    # swap the initial consonants of the first and last words
    if l_end - l_start == 2:
        del new_first_word_list[l_end - 1]
    new_first_word_list[l_start] = first_alpha
    if f_end - f_start == 2:
        del new_last_word_list[f_end - 1]
    new_last_word_list[f_start] = last_alpha
    for i in range(1, len(list_of_word) - 1):
        middle = middle + list_of_word[i]
    full_word = ''.join(new_first_word_list) + middle
    full_word = full_word + ''.join(new_last_word_list)
    return full_word
def Chatbot(input_data):
    checkword = []
    # print('user_w', input_data)
    input_data = deepcut.tokenize(input_data)  # segment the incoming text with deepcut, e.g. ['สวัสดี']
    if input_data[0] != "สวัสดี":
        for check in input_data:
            if check in word_to_int_input:
                # print('check', check)
                checkword.append(check)
        # print('w', checkword)
        checkword = [word_to_int_input[word] for word in checkword]
        checkword = np.array([checkword])
        checkword = sequence.pad_sequences(checkword, maxlen=40, padding='post')
        checkword = one_hot_encode(checkword, encoded_length)
        checkword = array(checkword)  # fix
        target = predict_sequence(infenc, infdec, checkword, 24, encoded_length)
        target = predict_sequence(infenc, infdec, checkword, 9, encoded_length)
        # print('target ', target)
        # print('bot', invert(target))
        words = ""
        for word in invert(target):
            words = words + word
        input_data = ""
        return words
    else:
        words = "สวัสดี"
        return words
def word_to_vec():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    documents = [
        'ฉันรักภาษาไทยเพราะฉันเป็นคนไทยและฉันเป็นคนไทย',
        'ฉันเป็นนักเรียนที่ชื่นชอบวิทยาศาสตร์และเทคโนโลยี',
        'ฉันไม่ใช่โปรแกรมเมอร์เพราะฉันทำมากกว่าคิดเขียนพัฒนาโปรแกรมทดสอบโปรแกรม',
        'ฉันชื่นชอบวิทยาศาสตร์ชอบค้นคว้าตั้งสมมุติฐานและหาคำตอบ'
    ]
    texts = [list(deepcut.tokenize(i)) for i in documents]

    # keep only tokens that appear more than once across the corpus
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    dictionary.save(
        '/Users/jirayutk./Project/SeniorProject/word2vec/tmp/deerwester.dict'
    )  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(
        '/Users/jirayutk./Project/SeniorProject/word2vec/tmp/deerwester.mm',
        corpus)  # store to disk, for later use
    print(texts)
    pprint(corpus)
def prepare_data():
    import deepcut
    import json
    input_file = open('input.txt', 'r', encoding='utf-8')
    ans = open('ans.txt', 'r', encoding='utf-8')
    input_token = []
    for i in input_file:
        i = i.split('::')[1]
        i = i.replace('\n', '')
        input_token.append([deepcut.tokenize(i)])
    # map answer labels to class ids: H -> 0, P -> 1, M -> 2
    n = 0
    for i in ans:
        i = i.split('::')[1]
        i = i.replace('\n', '')
        if i == 'H':
            i = 0
        elif i == 'P':
            i = 1
        elif i == 'M':
            i = 2
        input_token[n].insert(0, i)
        print(input_token[n])
        n += 1
    with open('data.json', 'w', encoding='utf-8') as file:
        json.dump(input_token, file, ensure_ascii=False)
def createBag(dataFrame):
    bagWord = []
    for x in dataFrame['Header']:
        new_str = CleanText(x)
        # cut words
        bagWord.append(CutStopWord(deepcut.tokenize(new_str)))
    return bagWord
def Opencsv(opens):
    with open(f'./FileCSV1/{opens}_clean_translated.csv', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        values = []
        print("Processing in pos_tag > ", opens)
        for row in reader:
            list_word = deepcut.tokenize(row[1])
            test = str(list_word)
            clean = test.replace("'", "")
            clean2 = clean.replace(" ", "")
            clean3 = clean2.replace(",,", ",")
            i = tltk.nlp.pos_tag(clean3)
            # strip the punctuation and symbol tags that tltk adds
            test2 = str(i)
            clean3 = test2.replace("(',', 'PUNCT'),", "")
            clean4 = clean3.replace(", ('<s/>', 'PUNCT')", "")
            clean5 = clean4.replace("('[', 'SYM'),", "")
            clean6 = clean5.replace(", (']', 'SYM')", "")
            clean7 = clean6.replace(" ", "")
            clean8 = clean7.replace(", (',.]', 'ADV'),", "")
            values.append(clean8)
    return values
def countSentence(self, sentence, tagging):
    if self.language_name == 'en' or tagging:
        for word in sentence.split(' '):
            self.countWords(word)
    if self.language_name == 'th' and not tagging:
        for word in deepcut.tokenize(sentence):
            self.countWords(word)
def __init__(self, api_key=None, host_url=None, max_workers=1):
    """
    Initialize sixecho.

    Attributes:
        api_key(string)  - Optional : api_key generated from sixecho
        host_url(string) - Optional : the sixecho domain
    """
    self.api_key = api_key
    deepcut.tokenize("Welcome")  # warm up: load the deepcut model once
    if host_url is not None:
        if host_url.endswith("/"):
            host_url = host_url[:-1]
        self.host_url = host_url
    self.array_words = []
    self.min_hash = MinHash(num_perm=128)
    self.max_workers = max_workers
    self.sha256 = ""
def token(txt, txtnew):
    with open(txt, "r", encoding='utf-8') as f:
        with open(txtnew, "a", encoding='utf-8') as f1:
            for line in f:
                s = deepcut.tokenize(line)
                m = ' '.join(s)
                f1.write(m)
def word_segment(text):
    """
    Word segmentation.
    :param text:
    :return: tokens joined with '|'
    """
    text = deepcut.tokenize(text, custom_dict="custom_dict/custom_dict.txt")
    text = "|".join(text)
    return text
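
# Illustrative use of word_segment() above (assumes the custom dictionary file
# exists at custom_dict/custom_dict.txt; the sample text is arbitrary):
def example_word_segment():
    segmented = word_segment("ทดสอบการตัดคำ")
    print(segmented)  # deepcut tokens joined with '|', e.g. "ทดสอบ|การ|ตัด|คำ"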
def deepcuts(datas):
    list_word = []
    for word in datas:
        word_cut = deepcut.tokenize(word)
        list_word.append(word_cut)
    # flatten the per-item token lists into a single list
    numpy_join = np.concatenate(list_word)
    return list(numpy_join)
def upgraded_filter(message, counters):
    sample = deepcut.tokenize(message)
    for word in sample:
        for test_word in counters.keys():
            if word == test_word:
                counters[test_word] += 1
    # the message is flagged once any tracked word has been seen at least twice
    result = any(filter(lambda x: x >= 2, counters.values()))
    return result
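
# Illustrative use of upgraded_filter() above: `counters` maps keywords of
# interest to running counts, and the filter reports whether any keyword has
# been seen at least twice. The keywords and message here are arbitrary.
def example_upgraded_filter():
    counters = {"โปรโมชั่น": 0, "ฟรี": 0}
    flagged = upgraded_filter("รับฟรี โปรโมชั่น ฟรี วันนี้", counters)
    print(flagged, counters)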
def get_deepcut_segmented(self):
    """
    Return a clean string produced by applying deepcut to the unsegmented
    version of the line.
    """
    deepcut_out = deepcut.tokenize(self.unsegmented)
    out_line = "|"
    for word in deepcut_out:
        out_line += word + "|"
    return out_line
def main(model="original"):
    dest = "tokenised-with-%s-model.txt" % (model)
    with open(BEST_PATH, "r") as fr, \
            open(dest, "w") as fw:
        lines = fr.readlines()
        for l in tqdm(lines):
            tokens = deepcut.tokenize(l.strip())
            fw.write("%s\n" % "|".join(tokens))
    print("Result is saved to %s" % dest)
def Query(query_text, dictionary, tf_idf):
    query_doc = [w for w in CutStopWord(deepcut.tokenize(CleanText(query_text)))]
    print(query_doc)
    query_doc_bow = dictionary.doc2bow(query_doc)
    print(query_doc_bow)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    print(query_doc_tf_idf)
    return query_doc_tf_idf
def prepare_for_predict(input_questions):
    q_input = []
    cleansing(input_questions)
    tokenized_input_1 = deepcut.tokenize(input_questions)
    for sentence in tokenized_input_1:
        q_input.append(sentence)
    q_input = word_index(tokenized_input_1)
    q_input = pad_sequences(q_input, maxlen=max_seq_length)
    return q_input
def tokenize(self, text_list):
    """
    Tokenize Thai lyrics using deepcut.
    """
    import deepcut
    words = []
    for lyric in tqdm(text_list):
        words.extend(deepcut.tokenize(lyric))
    return words
def first():
    # `i` is the first token of the user's previous answer (q)
    b = usinputcur()
    kamkorn = usinputoutcur()
    yol = deepcut.tokenize(kamkorn)
    i = yol[0]
    r = [x for x in range(10)]
    # the user replies "<word> เป็นคำประเภท <k>" ("<word> is a word of type <k>")
    # for one of the word classes 1-9; store the class for `i` in the pocha table
    for k in range(1, 10):
        if b == i + " เป็นคำประเภท " + str(r[k]):
            bogprapet(i, r[k])
            # "Thank you for telling us that <word> is a word of type <k>"
            return "ขอบคุณที่ให้ข้อมูลว่า " + i + " เป็นคำประเภท " + str(r[k])
    return kwam()
def tokenize_thai(text):
    tokens = deepcut.tokenize(text)
    # return ' '.join(pieces)
    content_buff = ""
    for word in tokens:
        # print(word)
        # word1 = re.sub(r'([0-9]+)[ ]([.|,])[ ]([0-9]+)', r'\1\2\3', word)
        content_buff = content_buff + " " + word
    content_buff = ' '.join(content_buff.split())
    return content_buff.strip()
async def tokenize(websocket, path):
    try:
        while True:
            text = await websocket.recv()
            # TODO: Use queue to control maximum concurrency?
            toks = deepcut.tokenize(text)
            await websocket.send(json.dumps(toks))
    except websockets.exceptions.ConnectionClosed as e:
        print("CLOSE")
def segment(text):
    return deepcut.tokenize(text)
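
# Minimal usage of the thin segment() wrapper above (assumes deepcut is installed):
def example_segment_wrapper():
    tokens = segment("สวัสดีชาวโลก")
    print(tokens)  # a list of Thai tokens produced by deepcut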