Example #1
def wrapper_jp(string, width):
    """Wrap a Japanese string with newlines so that each line fits within width."""
    segmenter = TinySegmenter()
    tokens = segmenter.tokenize(string)
    token_remain = lambda: len(tokens) > 0
    # collect lines no longer than width into result
    result = ""
    while token_remain():
        line = ""
        # accumulate tokens into line while the total stays within width
        while token_remain() and len(line + tokens[0]) <= width:
            line += tokens.pop(0)
        # a single token longer than width goes on a line of its own
        if not line:
            line += tokens.pop(0)
        result += line + ('\n' if token_remain() else '')
    return result
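A brief usage sketch for wrapper_jp; the sample sentence and width are illustrative, and the import is the one the function itself relies on:

from tinysegmenter import TinySegmenter

# wrap the sample sentence so that no line exceeds 10 characters
print(wrapper_jp(u"私の名前は中野です。今日は良い天気ですね。", 10))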
Example #2
def get_vocab_list(corpus, encoding):
    """Build the vocabulary of a Japanese corpus with a tf-idf vectorizer."""
    segmenter = TinySegmenter()
    vectorizer = TfidfVectorizer(tokenizer=segmenter.tokenize)
    X = vectorizer.fit_transform(file_contents_generator(corpus, encoding))
    # drop whitespace-only features and strip the rest
    return [
        feature.strip() for feature in vectorizer.get_feature_names()
        if feature.strip()
    ]
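A usage sketch; the corpus path and encoding are illustrative, and file_contents_generator (not shown here) is assumed to yield one document string per file:

vocab = get_vocab_list('data/ja_corpus', 'utf-8')
print(len(vocab))
print(vocab[:20])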
Example #3
    def on_status(self, status):
        # skip retweets and any account listed in exception_ids
        if hasattr(status, 'retweeted_status'):
            return
        if status.author.screen_name in exception_ids:
            return

        try:
            print(u'\n---{name}/@{screen}---\n   {text}\nvia {src} {created}'.format(
                    name=status.author.name,
                    screen=status.author.screen_name,
                    text=status.text.replace('&amp;', '&'),
                    src=status.source,
                    created=status.created_at))
            read_text = str_replace(status.author.name) + 'さん ' + str_replace(status.text)

            ts = TinySegmenter()
            result = ts.tokenize(read_text)
            string_jp = ''
            string_en = ''
            for seg in result:
                seg = re.sub(r'^\s+', '', seg)
                # token contains Japanese (non-ASCII) characters, a digit, or a single letter other than 's'
                if re.match(u'(?:[^\u0000-\u007F]|[\d+]|^[A-Za-rt-z]{1}$)', seg) and not re.match(u'^[aA]$', seg):
                    # flush the pending English text, then keep accumulating Japanese
                    call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)
                    string_en = ''
                    string_jp += seg
                else:
                    # flush the pending Japanese text, then keep accumulating English
                    call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
                    string_jp = ''
                    string_en += ' ' + seg

            # speak whatever remains after the last token
            if string_jp:
                call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
            else:
                call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)

        except Exception as e:
            print('Encountered Exception:', e, file=sys.stderr)
Example #4
    def averageNumberOfTokens(self, entries, eastern=True):
        '''Finds the average number of tokens per entry.'''
        t0 = time()
        entries_count = len(entries)
        wordcount = 0
        segmenter = TinySegmenter()
        for entry in entries:
            if eastern:
                # Japanese has no whitespace word boundaries, so segment each entry
                wordcount += len(segmenter.tokenize(entry))
            else:
                wordcount += len(entry.split())
        print("Took %s seconds to return the avg. # of tokens per entry." %
              (time() - t0))
        average = float(wordcount) / entries_count
        print(average)
        return average
Example #5
    def parse(self):

        raw_chunks = self.raw_data.split('\n\n')
        parsed_chunks = []

        for chunk in raw_chunks[1:]:
            chunk_lines = chunk.split('\n')

            if len(chunk_lines[0]) == 0:
                continue

            time_range_parts = chunk_lines[0].split(',')

            # a time range containing ':' is a timestamp; convert it to seconds
            if ':' in time_range_parts[0]:
                start = get_sec(time_range_parts[0])
                end = get_sec(time_range_parts[1])
            else:
                start = float(time_range_parts[0])
                end = float(time_range_parts[1])

            print('parsing chunk...')
            chunk_line = ''.join(chunk_lines[1:])

            # split lines into words
            tokens = TinySegmenter().tokenize(chunk_line)

            # clean up whitespace, re-join using a single space, and push into original_lines
            original = ' '.join([token.strip() for token in tokens])

            # convert Kanji into Hiragana and keep the space-joined result as the inverted form
            str_inverted_tokens = Kakasi().invert(' '.join(tokens))
            inverted = str_inverted_tokens

            # translate
            definitions = [{
                'word': token,
                'senses': Dictionary().lookup(token),
                'particle': is_particle(token)
            } for token in str_inverted_tokens.split(' ')]

            parsed_chunks.append({
                'start': start,
                'end': end,
                'original': original,
                'inverted': inverted,
                'definitions': definitions
            })

        return parsed_chunks
Example #6
def demo():
    segmenter = TinySegmenter()
    print(u' | '.join(segmenter.tokenize(u"私の名前は中野です")).encode('utf-8'))
Example #7
"""
CorpusEnactor.Echoクラス
"""
from __future__ import unicode_literals
from __future__ import print_function

import os
import sys
import yaml
import codecs
import pickle
from collections import Counter
import numpy as np

from tinysegmenter import TinySegmenter
Segmenter = TinySegmenter()

TFIDF_CACHE = "cache/tfidf.npz"
FEAT_CACHE = "cache/feat.pickle"


class Echo:
    """
    テキスト検索手法を用いた基本的なチャットボット

    チャットボットでよく用いられる応答方法の一つとして、ユーザの入力に似た文をログの中で検索し、
    最も似た文の次の行を返答として返す、というアルゴリズムがある。この動作の狙いは
    「ログ(またはコーパス)を再演する」
    ことである。CorpusEnactor.Echoクラスではユーザの入力文字列に似ている行を見つける最も
    オーソドックスな計算方法であるtfidf-cos類似度を用いた実装を行う。
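A minimal sketch of the tfidf-cos retrieval the docstring describes, reusing the module-level Segmenter defined above; the function name respond and the corpus_lines argument are illustrative, not the actual CorpusEnactor.Echo API:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def respond(corpus_lines, user_text):
    # build one tf-idf row per corpus line, tokenized with TinySegmenter
    vectorizer = TfidfVectorizer(tokenizer=Segmenter.tokenize)
    tfidf = vectorizer.fit_transform(corpus_lines)
    query = vectorizer.transform([user_text])
    # index of the corpus line most similar to the user input
    best = cosine_similarity(query, tfidf)[0].argmax()
    # reply with the line that follows the best match, wrapping at the end of the corpus
    return corpus_lines[(best + 1) % len(corpus_lines)]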
Example #8
import keras
import numpy as np
import json
from tinysegmenter import TinySegmenter

data_path = 'copus.txt'
num_samples = 100
num_epochs = 100
batch_size = 256

tokenize = TinySegmenter().tokenize


def preprocess_sentence(w):
    """Lowercase, tokenize, and wrap a sentence with <start>/<end> markers."""
    w = w.lower()
    tokens = tokenize(w)
    w = "<start> "
    for word in tokens:
        # skip whitespace-only tokens produced by the segmenter
        if word == " ":
            continue
        w += word + " "
    w += "<end>"
    return w


input_texts = []
target_texts = []
input_words = set()
target_words = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
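A usage sketch for preprocess_sentence above; the sample sentence is illustrative and the expected output assumes the segmentation shown in Example #6:

print(preprocess_sentence(u"私の名前は中野です"))
# roughly: <start> 私 の 名前 は 中野 です <end>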
Example #9
def demo():
    segmenter = TinySegmenter()
    print(u' | '.join(segmenter.tokenize(u"私の名前は中野です")).encode('utf-8'))
Example #10
    def get_tiny_segmenter(self):
        """Return the TinySegmenter tokenizer for Japanese text."""
        from tinysegmenter import TinySegmenter
        return TinySegmenter().tokenize
Example #11
    def get_tiny_segmenter(self):
        """Return the TinySegmenter tokenizer for Japanese text."""
        return TinySegmenter().tokenize
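A usage sketch in the spirit of Example #2, showing how the returned tokenizer might be handed to a scikit-learn vectorizer; obj stands in for an instance of the (unnamed) surrounding class:

from sklearn.feature_extraction.text import TfidfVectorizer

# plug the Japanese tokenizer in place of the default regex tokenizer
vectorizer = TfidfVectorizer(tokenizer=obj.get_tiny_segmenter())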