from tinysegmenter import TinySegmenter


def wrapper_jp(string, width):
    """Wrap a Japanese string with newlines so each line fits within width."""
    segmenter = TinySegmenter()
    tokens = segmenter.tokenize(string)
    token_remain = lambda: len(tokens) > 0
    # collect lines no longer than width into result
    result = ""
    while token_remain():
        line = ""
        # accumulate tokens into line while their total stays within width
        while token_remain() and len(line + tokens[0]) <= width:
            line += tokens.pop(0)
        if not line:
            # a single token longer than width: emit it whole to avoid looping forever
            line = tokens.pop(0)
        result += line + ('\n' if token_remain() else '')
    return result
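# A minimal usage sketch for wrapper_jp; the sample sentence and the width
# of 10 are illustrative, not from the original source.
text = u"私の名前は中野です。今日はいい天気ですね。"
print(wrapper_jp(text, 10))  # each output line holds whole tokens, at most 10 chars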
from sklearn.feature_extraction.text import TfidfVectorizer
from tinysegmenter import TinySegmenter


def get_vocab_list(corpus, encoding):
    """Build a vocabulary list from the corpus, tokenized with TinySegmenter."""
    segmenter = TinySegmenter()
    vectorizer = TfidfVectorizer(tokenizer=segmenter.tokenize)
    X = vectorizer.fit_transform(file_contents_generator(corpus, encoding))
    # note: newer scikit-learn renames this method to get_feature_names_out()
    return [
        feature.strip()
        for feature in vectorizer.get_feature_names()
        if feature.strip()
    ]
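# file_contents_generator is referenced above but not defined in this
# snippet; here is a hypothetical stand-in that yields each file's text,
# a sketch rather than the original implementation.
import codecs
import os


def file_contents_generator(corpus, encoding):
    for name in sorted(os.listdir(corpus)):
        with codecs.open(os.path.join(corpus, name), encoding=encoding) as f:
            yield f.read()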
def on_status(self, status):
    # skip tweets whose author is listed in exception_ids
    flg = 0
    for buff in exception_ids:
        if status.author.screen_name == buff:
            flg = 1
    if not hasattr(status, 'retweeted_status') and flg != 1:
        try:
            print u'\n---{name}/@{screen}---\n {text}\nvia {src} {created}'.format(
                name=status.author.name,
                screen=status.author.screen_name,
                text=status.text.replace('&amp;', '&'),
                src=status.source,
                created=status.created_at)
            read_text = str_replace(status.author.name.decode('utf-8')) + u'さん ' \
                + str_replace(status.text.decode('utf-8'))
            ts = TinySegmenter()
            result = ts.tokenize(read_text)
            string_jp = ''
            string_en = ''
            for seg in result:
                seg = re.sub(r'^\s+', '', seg)
                if re.match(u'(?:[^\u0000-\u007F]|[\d+]|^[A-Za-rt-z]{1}$)', seg) \
                        and not re.match(u'^[aA]$', seg):  # segment contains Japanese
                    # flush any accumulated English run, keep building the Japanese run
                    call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)
                    string_en = ''
                    string_jp = string_jp + seg
                else:
                    # flush any accumulated Japanese run, keep building the English run
                    call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
                    string_jp = ''
                    string_en = string_en + ' ' + seg
            # read out whichever run remains at the end
            if string_jp:
                call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
            else:
                call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)
        except Exception, e:
            print >> sys.stderr, 'Encountered Exception:', e
            pass
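# A hedged sketch of wiring the listener above into a tweepy 3.x stream;
# the Listener class name, the auth object, and the track keyword are
# assumptions for illustration, not taken from the original source.
listener = Listener()  # the StreamListener subclass defining on_status
stream = tweepy.Stream(auth, listener)
stream.filter(track=[u'ニュース'])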
from time import time

from tinysegmenter import TinySegmenter


def averageNumberOfTokens(self, entries, eastern=True):
    '''Finds the average number of words in a sentence.'''
    t0 = time()
    entries_count = len(entries)
    wordcount = 0
    for entry in entries:
        if eastern:
            # CJK text has no whitespace word boundaries, so segment it
            wordcount += len(TinySegmenter().tokenize(entry))
        else:
            wordcount += len(entry.split())
    print("Took %s seconds to return the avg. # of tokens per entry." % (time() - t0))
    print(float(wordcount) / entries_count)
    return float(wordcount) / entries_count
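# Illustrative call, assuming the enclosing class (not shown in this
# snippet) is instantiated as `analyzer`; the sample entries are made up.
entries = [u"私の名前は中野です", u"今日はいい天気です"]
avg = analyzer.averageNumberOfTokens(entries)  # tokenized with TinySegmenter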
def parse(self):
    raw_chunks = self.raw_data.split('\n\n')
    parsed_chunks = []
    for chunk in raw_chunks[1:]:
        chunk_lines = chunk.split('\n')
        if len(chunk_lines[0]) == 0:
            continue
        time_range_parts = chunk_lines[0].split(',')
        # the time range is either "hh:mm:ss"-style or plain seconds
        if ':' in chunk_lines[0]:
            start = get_sec(time_range_parts[0])
            end = get_sec(time_range_parts[1])
        else:
            start = float(time_range_parts[0])
            end = float(time_range_parts[1])
        print('parsing chunk...')
        chunk_line = ''.join(chunk_lines[1:])
        # split lines into words
        tokens = TinySegmenter().tokenize(chunk_line)
        # clean up whitespace and re-join using a single space
        original = ' '.join([token.strip() for token in tokens])
        # convert Kanji into Hiragana, re-joined using a single space
        str_inverted_tokens = Kakasi().invert(' '.join(tokens))
        inverted = str_inverted_tokens
        # look up each reading and flag particles
        definitions = [{
            'word': token,
            'senses': Dictionary().lookup(token),
            'particle': is_particle(token)
        } for token in str_inverted_tokens.split(' ')]
        parsed_chunks.append({
            'start': start,
            'end': end,
            'original': original,
            'inverted': inverted,
            'definitions': definitions
        })
    return parsed_chunks
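# A sketch of the input layout parse() appears to expect: blank-line
# separated chunks, each headed by a "start,end" time range line; the
# sample text and header chunk are illustrative.
raw_data = '\n\n'.join([
    'header chunk (skipped)',          # raw_chunks[0] is never parsed
    '0.0,2.5\n私の名前は中野です',
    '2.5,5.0\n今日はいい天気です',
])
# time ranges containing ':' are parsed with the external get_sec helper instead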
def demo():
    segmenter = TinySegmenter()
    print(u' | '.join(segmenter.tokenize(u"私の名前は中野です")).encode('utf-8'))
""" CorpusEnactor.Echoクラス """ from __future__ import unicode_literals from __future__ import print_function import os import sys import yaml import codecs import pickle from collections import Counter import numpy as np from tinysegmenter import TinySegmenter Segmenter = TinySegmenter() TFIDF_CACHE = "cache/tfidf.npz" FEAT_CACHE = "cache/feat.pickle" class Echo: """ テキスト検索手法を用いた基本的なチャットボット チャットボットでよく用いられる応答方法の一つとして、ユーザの入力に似た文をログの中で検索し、 最も似た文の次の行を返答として返す、というアルゴリズムがある。この動作の狙いは 「ログ(またはコーパス)を再演する」 ことである。CorpusEnactor.Echoクラスではユーザの入力文字列に似ている行を見つける最も オーソドックスな計算方法であるtfidf-cos類似度を用いた実装を行う。
import json

import keras
import numpy as np

from tinysegmenter import TinySegmenter

data_path = 'copus.txt'
num_samples = 100
num_epochs = 100
batch_size = 256

tokenize = TinySegmenter().tokenize


def preprocess_sentence(w):
    # lowercase, segment, and wrap the sentence in <start>/<end> markers
    w = w.lower()
    tokens = tokenize(w)
    w = "<start> "
    for word in tokens:
        if word == " ":
            continue
        w += word + " "
    w += "<end>"
    return w


input_texts = []
target_texts = []
input_words = set()
target_words = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
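# Quick check of preprocess_sentence; the expected output matches the
# segmentation shown in the demo() snippet above.
print(preprocess_sentence(u"私の名前は中野です"))
# -> <start> 私 の 名前 は 中野 です <end>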
def get_tiny_segmenter(self):
    """Get the Japanese tokenizer function."""
    from tinysegmenter import TinySegmenter
    return TinySegmenter().tokenize
from tinysegmenter import TinySegmenter


def get_tiny_segmenter(self):
    """Get the Japanese tokenizer function."""
    return TinySegmenter().tokenize
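# Either accessor plugs straight into scikit-learn's vectorizers as a
# custom tokenizer; `obj` and `documents` are assumptions for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=obj.get_tiny_segmenter())
X = vectorizer.fit_transform(documents)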