def tokenize_text(input_filename, output_filename):
    if os.path.isfile(output_filename):
        logging.info(
            'Skipping tokenize_text(). File already exists: {}'.format(
                output_filename))
        return
    start = time.time()
    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(input_filename, 'r', encoding='utf-8') as inp:
            for i, text in enumerate(inp.readlines()):
                if USE_MECAB_TOKENIZER:
                    tokenized_text = ' '.join(get_words(text))
                else:
                    tokenized_text = ' '.join(tinysegmenter.tokenize(text))
                out.write(tokenized_text + '\n')
                if i % 100 == 0 and i != 0:
                    logging.info('Tokenized {} articles.'.format(i))
    logging.info(
        'Finished tokenize_text(). It took {0:.2f} s to execute.'.format(
            round(time.time() - start, 2)))
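# A minimal usage sketch for tokenize_text() above, not taken from the original
# project: the filenames are hypothetical, and os/time/tinysegmenter plus the
# optional MeCab helper get_words() are assumed to be importable as in the snippet.
import logging
logging.basicConfig(level=logging.INFO)

USE_MECAB_TOKENIZER = False   # fall back to TinySegmenter
tokenize_text('wiki_ja_raw.txt', 'wiki_ja_tokenized.txt')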
def japanese_tokenizer(sentence):
    tokens = tinysegmenter.tokenize(sentence.strip())
    tokens = [t for t in tokens if t]
    words = []
    for word in tokens:
        word_byte = tf.compat.as_bytes(word)
        words.extend(_WORD_SPLIT.split(word_byte))
    return [w for w in words if w]
def to_romaji(text_jpn):
    text = ' '.join(tinysegmenter.tokenize(text_jpn))
    kakasi.setMode("H", "a")        # Hiragana to romaji
    kakasi.setMode("K", "a")        # Katakana to romaji
    kakasi.setMode("J", "a")        # Japanese (kanji) to romaji
    kakasi.setMode("r", "Hepburn")  # default: Hepburn romanization table
    convert = (kakasi.getConverter()).do(text)
    return convert
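# A hedged usage sketch for to_romaji() above; the module-level kakasi object
# is assumed to be a pykakasi legacy-API instance created elsewhere, e.g.
# kakasi = pykakasi.kakasi() after "import pykakasi".
print(to_romaji('私の名前は中野です'))  # space-separated Hepburn romaji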
def tokenize_text(input_filename, output_filename):
    if os.path.isfile(output_filename):
        return
    start = time.time()
    with open(output_filename, 'w') as out:
        with open(input_filename, 'r') as inp:
            for i, text in enumerate(inp.readlines()):
                tokenized_text = ' '.join(tinysegmenter.tokenize(text.decode('utf8')))
                out.write(tokenized_text.encode('utf8'))
                if i % 100 == 0 and i != 0:
                    logging.info('Tokenized {} articles'.format(i))
    print('Finished tokenize_text(). It took {0:.2f} s to execute.'.format(
        round(time.time() - start, 2)))
def test_timemachine(tmpdir):
    with io.open('tests/timemachineu8j.txt', encoding='utf-8') as f:
        text = f.read()
    toks = tinysegmenter.tokenize(text)
    out = tmpdir.join("tokenized.txt")
    out.write_text(' | '.join(toks), encoding='utf-8')
    print(str(out))  # pytest shows this only when the test fails
    assert 0 == subprocess.call(
        ["diff", "-u", "tests/timemachineu8j.tokenized.txt", str(out)])
def test_timemachine(tmpdir):
    with io.open('../test/timemachineu8j.txt', encoding='utf-8') as f:
        text = f.read()
    toks = tinysegmenter.tokenize(text)
    out = tmpdir.join("tokenized.txt")
    out.write_text(' | '.join(toks), encoding='utf-8')
    print(str(out))  # pytest shows this only when the test fails
    assert 0 == subprocess.call([
        "diff", "-u", "../test/timemachineu8j.tokenized.txt", str(out)])
def japanese_tokenizer(docs, MAX_NB_WORDS, max_seq_len):
    # tokenizing input data
    tokens = []
    for doc in docs:
        tokens.append(tinysegmenter.tokenize(doc))
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
    tokenizer.fit_on_texts(tokens)
    word_seq = tokenizer.texts_to_sequences(tokens)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))
    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)
    return word_seq, word_index
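# A hedged usage sketch for the Keras-based japanese_tokenizer() above. The
# sample documents, vocabulary size, and sequence length are made up for
# illustration; Tokenizer and sequence are assumed to come from
# keras.preprocessing (or tensorflow.keras.preprocessing).
docs = ['私の名前は中野です', '彼は新しい仕事できっと成功するだろう']
word_seq, word_index = japanese_tokenizer(docs, MAX_NB_WORDS=1000, max_seq_len=20)
print(word_seq.shape)  # (2, 20) after padding/truncation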
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        for s, l in ((strip(s), len(s)) for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
def original_usage(text):
    """
    Return the analysis results by tinysegmenter.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    tokens : list
        A list of words
    """
    tokens = tinysegmenter.tokenize(text)
    return tokens
def tokenize(text):
    """
    A method for word segmentation.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    words : list
        A list of words
    """
    words = tinysegmenter.tokenize(text)
    return words
def segment(self, source, language=None):
    """Returns a chunk list from the given sentence.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
          :code:`supported_languages`.
    """
    if language and not language in self.supported_languages:
        raise ValueError(
            'Language {} is not supported by NLAPI segmenter'.format(
                language))
    chunks = ChunkList()
    results = tinysegmenter.tokenize(source)
    seek = 0
    for word in results:
        word = word.strip()
        if not word:
            continue
        if source[seek:seek + len(word)] != word:
            assert source[seek] == ' '
            assert source[seek + 1:seek + len(word) + 1] == word
            chunks.append(Chunk.space())
            seek += 1
        dependency = None
        if word in _PARTICLES or word in _AUX_VERBS or is_hiragana(word):
            dependency = False
        chunk = Chunk(word, dependency=dependency)
        if chunk.is_punct():
            chunk.dependency = chunk.is_open_punct()
        chunks.append(chunk)
        seek += len(word)
    chunks.resolve_dependencies()
    return chunks
def process_all_data(root_dir, out_dir):
    for lang in ['en', 'de', 'fr', 'ja']:
        for domain in ['books', 'dvd', 'music']:
            for split in ['train', 'test', 'unlabeled']:
                fn = os.path.join(root_dir, lang, domain, f'{split}.review')
                ofn = os.path.join(out_dir, lang, domain, f'{split}.tok.txt')
                with open(fn) as inf, open(ofn, 'w') as ouf:
                    print(f"Processing file: {fn}")
                    for review in parse(inf):
                        # binarize label
                        label = 1 if review['rating'] > 3 else 0
                        try:
                            # remove line breaks
                            raw_text = review['text'].replace('\n', ' ').replace('\t', ' ')
                            if lang == 'ja':
                                tok_text = tinysegmenter.tokenize(raw_text)
                            else:
                                tok_text = nlp[lang].word_tokenize(raw_text)
                        except:
                            print("Exception tokenizing", review)
                            continue
                        print(f"{label}\t{' '.join(tok_text)}", file=ouf)
def test_sentence(self):
    correct = [('私', 1), ('の', 1), ('名前', 1), ('は', 1), ('中野', 1), ('です', 1)]
    self.assertCountEqual(
        correct, src.parse.get_frequency(ts.tokenize("私の名前は中野です")))
# coding=utf8
from rakutenma import RakutenMA
import tinysegmenter
from nltk import *
import nltk
import re

# segmenter = tinysegmenter.TinySegmenter()
result = tinysegmenter.tokenize(
    "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")
print('Segmenter: ')
print(result)

# Initialize a RakutenMA instance with an empty model
# the default ja feature set is set already
rma = RakutenMA()

# Let's analyze a sample sentence (from http://tatoeba.org/jpn/sentences/show/103809)
# With a disastrous result, since the model is empty!
print('Result')
print(rma.tokenize(result))
print('Original')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。"))
print('------------------')
# print(rma.tokenize("子どものみなさん、ゆるしてください。ぼくはこの本をひとりのおとなのひとにささげます。でもちゃんとしたわけがあるのです。"))
# print(rma.tokenizetwo("彼は新しい仕事できっと成功するだろう。"))
# print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))

# Feed the model with ten sample sentences from tatoeba.com
# "tatoeba.json" is available at https://github.com/rakuten-nlp/rakutenma
def printDefs(outputs, seconds):
    if not hasattr(printDefs, "counter"):
        printDefs.counter = 0
    if not hasattr(printDefs, "tokenization_counter"):
        printDefs.tokenization_counter = 0
    tokens = set()
    with open("tout3.txt") as lines:
        for line in lines:
            # tinysegmenter
            tinysegmenter_tokens = tinysegmenter.tokenize(line.strip())
            # nagisa
            nagisa_tokens = nagisa.tagging(line.strip()).words
            if printDefs.tokenization_counter % 2 == 0:
                print("tinysegmenter:")
                tokenized_statement = tinysegmenter_tokens
            else:
                print("nagisa:")
                tokenized_statement = nagisa_tokens
            print(tokenized_statement)
            printDefs.tokenization_counter += 1
            for token in tokenized_statement:
                t = token.strip()
                if t not in banlist and t != "":
                    tokens.add(t)
    if len(tokens) == 0:
        time.sleep(2)
    translated = []
    for token in tokens:
        try:
            definition = subprocess.check_output(
                ["myougiden", "-f", "--human", "-c", "-e", "whole",
                 token.replace("|", "!")]).decode("utf-8", "ignore")
            if len(outputs) == 0:
                print("")
                print("----------------------------------------------------------------------------")
                print("")
                print(token)
                print(definition)
            else:
                print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                f = open(outputs[printDefs.counter % len(outputs)], "a")
                f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                f.write(token)
                f.write("\n")
                f.write(definition)
                f.close()
            printDefs.counter += 1
            time.sleep(seconds)
        except:
            try:
                definition = subprocess.check_output(
                    ["myougiden", "--human", "-c", "-e", "whole",
                     token.replace("|", "!")]).decode("utf-8", "ignore")
                if len(outputs) == 0:
                    print("")
                    print("----------------------------------------------------------------------------")
                    print("")
                    print(token)
                    print(definition)
                else:
                    print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                    f = open(outputs[printDefs.counter % len(outputs)], "a")
                    f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                    f.write(token)
                    f.write("\n")
                    f.write(definition)
                    f.close()
                printDefs.counter += 1
                time.sleep(seconds)
            except:
                time.sleep(0.01)
                # print(token + " not found in dictionary")
    return
def word_tokenize(tokens):
    return [token.replace("''", '"').replace("``", '"')
            for token in tinysegmenter.tokenize(tokens)]  # nltk.word_tokenize(tokens)
# coding: utf-8
#
# Usage: python3 benchmark.py
#
# `pip install -r requirements.txt` is required.

from __future__ import unicode_literals

import io
import sys
if sys.version_info[0] > 2:
    xrange = range

from tinysegmenter import tokenize
from benchmarker import Benchmarker

loop = 10
tokenize('ウォームアップするぞ')

with io.open('timemachineu8j.txt', encoding='utf-8') as f:
    text = f.read()

with Benchmarker(loop, width=20) as bench:
    @bench("tokenize")
    def _(bm):
        for _ in bm:
            tokenize(text)
# coding: utf-8
#
# Usage: python3 benchmark.py
#
# `pip install -r requirements.txt` is required.

from __future__ import unicode_literals

import io
import sys
if sys.version_info[0] > 2:
    xrange = range

from tinysegmenter import tokenize
from benchmarker import Benchmarker

loop = 100
tokenize('ウォームアップするぞ')

with io.open('timemachineu8j.txt', encoding='utf-8') as f:
    text = f.read()

with Benchmarker(loop, width=20) as bench:
    @bench("tokenize")
    def _(bm):
        for _ in bm:
            tokenize(text)
        n5words.add(word.strip())

print("Loading n4 list")
with open("n4.list") as n4:
    for word in n4:
        n4words.add(word.strip())

occurrences = {}
sentences = {}
print("Counting words")
with codecs.open(sys.argv[1], 'r', encoding='utf-8', errors='ignore') as infile:
    for line in infile:
        # print(line)
        tokenizedLine = tinysegmenter.tokenize(line.strip()) + nagisa.tagging(line.strip()).words
        for token in tokenizedLine:
            if (token not in banlist and token not in n5words
                    and token not in n4words and token.strip() != ""):
                if token in occurrences.keys():
                    occurrences[token] += 1
                else:
                    occurrences[token] = 1
                if token not in sentences.keys():
                    sentences[token] = line.replace("\n", "").replace("|", "!")

print("Sorting words")
sortedOccurrences = []
for character in occurrences.keys():
    sortedOccurrences.append([character, occurrences[character]])
import tinysegmenter

statement = 'アリムタ投与中非ステロイド性抗炎症剤(NSAIDs使用'
tokenized_statement = tinysegmenter.tokenize(statement)
print(tokenized_statement)
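# A small follow-up sketch (not part of the original snippet): rejoin the
# tokens with spaces, the convention most of the examples above use when
# writing tokenized text back out.
print(' '.join(tokenized_statement))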