Example No. 1
def tokenize_text(input_filename, output_filename):

    if os.path.isfile(output_filename):
        logging.info(
            'Skipping tokenize_text(). File already exists: {}'.format(
                output_filename))
        return

    start = time.time()

    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(input_filename, 'r', encoding='utf-8') as inp:

            for i, text in enumerate(inp.readlines()):

                if USE_MECAB_TOKENIZER:
                    tokenized_text = ' '.join(get_words(text))
                else:
                    tokenized_text = ' '.join(tinysegmenter.tokenize(text))

                out.write(tokenized_text + '\n')

                if i % 100 == 0 and i != 0:
                    logging.info('Tokenized {} articles.'.format(i))
    logging.info(
        'Finished tokenize_text(). It took {0:.2f} s to execute.'.format(
            round(time.time() - start, 2)))
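The snippet above assumes the following imports at the top of its module; get_words() and the USE_MECAB_TOKENIZER flag are defined elsewhere in the same source file, so only a minimal sketch of the import block is shown:

import logging
import os
import time

import tinysegmenter

# get_words() and USE_MECAB_TOKENIZER come from elsewhere in the module;
# when the flag is set, a MeCab-based tokenizer is used instead of TinySegmenter.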
def japanese_tokenizer(sentence):
    tokens = tinysegmenter.tokenize(sentence.strip())
    tokens = [t for t in tokens if t]
    words = []
    for word in tokens:
        word_byte = tf.compat.as_bytes(word)
        words.extend(_WORD_SPLIT.split(word_byte))
    return [w for w in words if w]
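_WORD_SPLIT is not defined in the excerpt above; in the TensorFlow seq2seq data utilities that this helper mirrors, it is usually a compiled byte-level punctuation regex. A hedged sketch of the assumed surrounding definitions:

import re

import tensorflow as tf
import tinysegmenter

# Assumed (not shown in the excerpt): split tokens on common punctuation at the
# byte level, since each word is first converted with tf.compat.as_bytes().
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")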
Example No. 3
def to_romaji(text_jpn):
    text = ' '.join(tinysegmenter.tokenize(text_jpn))
    kakasi.setMode("H", "a")  # Hiragana ke romaji
    kakasi.setMode("K", "a")  # Katakana ke romaji
    kakasi.setMode("J", "a")  # Japanese ke romaji
    kakasi.setMode("r", "Hepburn")  # default: Hepburn Roman table\
    convert = (kakasi.getConverter()).do(text)
    return convert
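A usage sketch for to_romaji, assuming the legacy pykakasi 1.x interface (setMode/getConverter) and a module-level kakasi instance; pykakasi 2.x replaced this API with convert():

import pykakasi
import tinysegmenter

kakasi = pykakasi.kakasi()  # module-level instance that to_romaji() relies on

print(to_romaji('私の名前は中野です'))
# roughly: 'watashi no namae ha nakano desu' (exact output depends on the kakasi version)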
def tokenize_text(input_filename, output_filename):
    if os.path.isfile(output_filename):
        return
    start = time.time()
    # Python 2-style I/O: each line is read as bytes, decoded from UTF-8 for
    # tokenization, and re-encoded before being written out.
    with open(output_filename, 'w') as out:
        with open(input_filename, 'r') as inp:
            for i, text in enumerate(inp.readlines()):
                tokenized_text = ' '.join(tinysegmenter.tokenize(text.decode('utf8')))
                out.write(tokenized_text.encode('utf8'))
                if i % 100 == 0 and i != 0:
                    logging.info('Tokenized {} articles'.format(i))
    print('Finished tokenize_text(). It took {0:.2f} s to execute.'.format(round(time.time() - start, 2)))
Example No. 5
def test_timemachine(tmpdir):
    with io.open('tests/timemachineu8j.txt', encoding='utf-8') as f:
        text = f.read()

    toks = tinysegmenter.tokenize(text)

    out = tmpdir.join("tokenized.txt")
    out.write_text(' | '.join(toks), encoding='utf-8')

    print(str(out))  # pytest shows this only when the test fails
    assert 0 == subprocess.call(
        ["diff", "-u", "tests/timemachineu8j.tokenized.txt",
         str(out)])
def test_timemachine(tmpdir):
    with io.open('../test/timemachineu8j.txt', encoding='utf-8') as f:
        text = f.read()

    toks = tinysegmenter.tokenize(text)

    out = tmpdir.join("tokenized.txt")
    out.write_text(' | '.join(toks), encoding='utf-8')

    print(str(out))  # pytest shows this only when the test fails
    assert 0 == subprocess.call([
            "diff", "-u",
            "../test/timemachineu8j.tokenized.txt",
            str(out)])
Example No. 7
def japanese_tokenizer(docs, MAX_NB_WORDS, max_seq_len):
    # tokenizing input data
    tokens = []
    for doc in docs:
        tokens.append(tinysegmenter.tokenize(doc))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
    tokenizer.fit_on_texts(tokens)
    word_seq = tokenizer.texts_to_sequences(tokens)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)

    return word_seq, word_index
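The function above relies on the Keras preprocessing utilities; a minimal sketch of the imports it assumes (module paths differ between standalone Keras and tf.keras):

import tinysegmenter
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer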
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:

                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Example No. 9
def original_usage(text):
    """
    Return the analysis results by tinysegmenter.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    tokens : list
        A list of words
    """
    tokens = tinysegmenter.tokenize(text)
    return tokens
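A short call sketch for the wrapper above; the expected token list is the same one asserted in the unit test of Example No. 14 below:

print(original_usage('私の名前は中野です'))
# ['私', 'の', '名前', 'は', '中野', 'です']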
Example No. 10
def tokenize(text):
    """
    A method for word segmentation.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    words : list
        A list of words
    """

    words = tinysegmenter.tokenize(text)
    return words
Example No. 11
    def segment(self, source, language=None):
        """Returns a chunk list from the given sentence.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
                  :code:`supported_languages`.
    """
        if language and not language in self.supported_languages:
            raise ValueError(
                'Language {} is not supported by NLAPI segmenter'.format(
                    language))

        chunks = ChunkList()
        results = tinysegmenter.tokenize(source)
        seek = 0
        for word in results:
            word = word.strip()
            if not word:
                continue
            if source[seek:seek + len(word)] != word:
                assert source[seek] == ' '
                assert source[seek + 1:seek + len(word) + 1] == word
                chunks.append(Chunk.space())
                seek += 1

            dependency = None
            if word in _PARTICLES or word in _AUX_VERBS or is_hiragana(word):
                dependency = False

            chunk = Chunk(word, dependency=dependency)
            if chunk.is_punct():
                chunk.dependency = chunk.is_open_punct()
            chunks.append(chunk)
            seek += len(word)
        chunks.resolve_dependencies()
        return chunks
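The cursor bookkeeping in segment() can be illustrated without the budou classes: each TinySegmenter token is re-aligned against the original string, and any space the tokenizer dropped is re-emitted as its own piece. A standalone sketch of just that logic:

import tinysegmenter

def realign_tokens(source):
    """Match TinySegmenter tokens back to the source text, keeping spaces."""
    pieces = []
    seek = 0
    for word in tinysegmenter.tokenize(source):
        word = word.strip()
        if not word:
            continue
        if source[seek:seek + len(word)] != word:
            # The tokenizer swallowed a space here; re-emit it as its own piece.
            assert source[seek] == ' '
            pieces.append(' ')
            seek += 1
        pieces.append(word)
        seek += len(word)
    return pieces

print(realign_tokens('Google ドキュメント'))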
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0,
                 tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:
                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
def process_all_data(root_dir, out_dir):
    for lang in ['en', 'de', 'fr', 'ja']:
        for domain in ['books', 'dvd', 'music']:
            for split in ['train', 'test', 'unlabeled']:
                fn = os.path.join(root_dir, lang, domain, f'{split}.review')
                ofn = os.path.join(out_dir, lang, domain, f'{split}.tok.txt')
                with open(fn) as inf, open(ofn, 'w') as ouf:
                    print(f"Processing file: {fn}")
                    for review in parse(inf):
                        # binarize label
                        label = 1 if review['rating'] > 3 else 0
                        try:
                            # remove line breaks
                            raw_text = review['text'].replace('\n', ' ').replace('\t', ' ')
                            if lang == 'ja':
                                tok_text = tinysegmenter.tokenize(raw_text)
                            else:
                                tok_text = nlp[lang].word_tokenize(raw_text)
                        except Exception:
                            print("Exception tokenizing", review)
                            continue
                        print(f"{label}\t{' '.join(tok_text)}", file=ouf)
Example No. 14
 def test_sentence(self):
     correct = [('私', 1), ('の', 1), ('名前', 1), ('は', 1), ('中野', 1), ('です', 1)]
     self.assertCountEqual(correct, src.parse.get_frequency(ts.tokenize("私の名前は中野です")))
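src.parse.get_frequency is not shown in the listing; judging from the assertion it returns (token, count) pairs, which a collections.Counter over the token list would produce. A hypothetical reconstruction:

from collections import Counter

def get_frequency(tokens):
    # Hypothetical: count each token and return (token, count) pairs,
    # which is what the assertCountEqual above expects.
    return list(Counter(tokens).items())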
# coding=utf8
from rakutenma import RakutenMA
import tinysegmenter
from nltk import *
import nltk
import re

#segmenter = tinysegmenter.TinySegmenter()
result = tinysegmenter.tokenize(
    "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")
print('Segmenter: ')
print(result)

# Initialize a RakutenMA instance with an empty model
# the default ja feature set is set already
rma = RakutenMA()

# Let's analyze a sample sentence (from http://tatoeba.org/jpn/sentences/show/103809)
# With a disastrous result, since the model is empty!
print('Result')
print(rma.tokenize(result))
print('Original')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。"))
print('------------------')
# print(rma.tokenize("子どものみなさん、ゆるしてください。ぼくはこの本をひとりのおとなのひとにささげます。でもちゃんとしたわけがあるのです。"))
# print(rma.tokenizetwo("彼は新しい仕事できっと成功するだろう。"))
# print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))

# Feed the model with ten sample sentences from tatoeba.com
# "tatoeba.json" is available at https://github.com/rakuten-nlp/rakutenma
Example No. 16
def printDefs(outputs, seconds):
    if not hasattr(printDefs, "counter"):
        printDefs.counter = 0
    if not hasattr(printDefs, "tokenization_counter"):
        printDefs.tokenization_counter = 0
    tokens = set()
    with open("tout3.txt") as lines:
        for line in lines:
            # tokenize the line with both TinySegmenter and nagisa
            tinysegmenter_tokens = tinysegmenter.tokenize(line.strip())
            nagisa_tokens = nagisa.tagging(line.strip()).words
            # alternate between the two tokenizers on every other line
            if printDefs.tokenization_counter % 2 == 0:
                print("tinysegmenter:")
                tokenized_statement = tinysegmenter_tokens
            else:
                print("nagisa:")
                tokenized_statement = nagisa_tokens
            print(tokenized_statement)
            printDefs.tokenization_counter += 1
            for token in tokenized_statement:
                t = token.strip()
                if t not in banlist and t != "":
                    tokens.add(t)

    if len(tokens) == 0:
        time.sleep(2)
    translated = []
    for token in tokens:
        try:
            definition = subprocess.check_output(["myougiden", "-f", "--human", "-c", "-e", "whole", token.replace("|","!")]).decode("utf-8", "ignore")
            if len(outputs) == 0:
                print("")
                print("----------------------------------------------------------------------------")
                print("")
                print(token)
                print(definition)
            else:
                print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                f = open(outputs[printDefs.counter % len(outputs)], "a")
                f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                f.write(token)
                f.write("\n")
                f.write(definition)
                f.close()
                printDefs.counter += 1
            time.sleep(seconds)
        except:
            try:
                definition = subprocess.check_output(["myougiden", "--human", "-c", "-e", "whole", token.replace("|","!")]).decode("utf-8", "ignore")
                if len(outputs) == 0:
                    print("")
                    print("----------------------------------------------------------------------------")
                    print("")
                    print(token)
                    print(definition)
                else:
                    print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                    f = open(outputs[printDefs.counter % len(outputs)], "a")
                    f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                    f.write(token)
                    f.write("\n")
                    f.write(definition)
                    f.close()
                    printDefs.counter += 1
                time.sleep(seconds)
            except:
                time.sleep(0.01)
            #print(token + " not found in dictionary")
    return
Example No. 17
def word_tokenize(tokens):
    return [token.replace("''", '"').replace("``", '"')
            for token in tinysegmenter.tokenize(tokens)]  # originally: nltk.word_tokenize(tokens)
Example No. 18
 def _(bm):
     for _ in bm:
         tokenize(text)
Example No. 19
# coding: utf-8
#
# Usage: python3 benchmark.py
#
# `pip install -r requirements.txt` is required.
from __future__ import unicode_literals
import io
import sys

if sys.version_info[0] > 2:
    xrange = range

from tinysegmenter import tokenize
from benchmarker import Benchmarker

loop = 10
tokenize('ウォームアップするぞ')

with io.open('timemachineu8j.txt', encoding='utf-8') as f:
    text = f.read()

with Benchmarker(loop, width=20) as bench:

    @bench("tokenize")
    def _(bm):
        for _ in bm:
            tokenize(text)
# coding: utf-8
#
# Usage: python3 benchmark.py
#
# `pip install -r requirements.txt` is required.
from __future__ import unicode_literals
import io
import sys

if sys.version_info[0] > 2:
    xrange = range

from tinysegmenter import tokenize
from benchmarker import Benchmarker


loop = 100
tokenize('ウォームアップするぞ')

with io.open('timemachineu8j.txt', encoding='utf-8') as f:
    text = f.read()

with Benchmarker(loop, width=20) as bench:
    @bench("tokenize")
    def _(bm):
        for _ in bm:
            tokenize(text)
 def _(bm):
     for _ in bm:
         tokenize(text)
Example No. 22
        n5words.add(word.strip())

print("Loading n4 list")
with open("n4.list") as n4:
    for word in n4:
        n4words.add(word.strip())

occurrences = {}
sentences = {}

print("Counting words")
with codecs.open(sys.argv[1], 'r', encoding='utf-8',
                 errors='ignore') as infile:
    for line in infile:
        #print(line)
        tokenizedLine = tinysegmenter.tokenize(line.strip()) + nagisa.tagging(
            line.strip()).words
        for token in tokenizedLine:
            if (token not in banlist and token not in n5words
                    and token not in n4words and token.strip() != ""):
                if token in occurrences.keys():
                    occurrences[token] += 1
                else:
                    occurrences[token] = 1
                if token not in sentences.keys():
                    sentences[token] = line.replace("\n", "").replace("|", "!")

print("Sorting words")
sortedOccurrences = []
for character in occurrences.keys():
    sortedOccurrences.append([character, occurrences[character]])
Example No. 23
import tinysegmenter

statement = 'アリムタ投与中非ステロイド性抗炎症剤(NSAIDs使用'
tokenized_statement = tinysegmenter.tokenize(statement)
print(tokenized_statement)