Example #1
    def __init__(self, mode: int = PlayMode.HIRA_ZHUYIN):
        self.mecab = MeCab.Tagger()  # -Oyomi -Owakati
        self.Mode = mode
Example #2
import linecache
import random
import MeCab


def get_byte_num(s):
    return len(s.encode('utf-8'))


if __name__ == '__main__':
    random.seed(42)
    filename = 'tmp.txt'
    save_file = 'ja.text8'
    # LIMIT_BYTES = 100000000
    t = MeCab.Tagger('-Owakati')
    num_lines = sum(1 for line in open(filename))
    indices = list(range(1, num_lines + 1))  # linecache.getline is 1-indexed
    random.shuffle(indices)

    with open(save_file, 'w') as f:
        count_byte = 0
        for i in indices:
            print('{} bytes'.format(count_byte))
            text = linecache.getline(filename, i)
            text = text.strip()
            text = t.parse(text).strip()
            f.write(text)
            count_byte += get_byte_num(text)
            # if count_byte >= LIMIT_BYTES:
            #     break
Example #3
"""
韩语分词
"""
import MeCab
mecab = MeCab.Tagger("-Owakati")
with open("/Users/ff/Desktop/测评数据/去空格/ko_chosun_test.txt",
          'r',
          encoding='utf-8') as f_in:
    with open("/Users/ff/Desktop/测评数据/去空格/ko_chosun_test_split.txt",
              'w',
              encoding='utf-8') as f_out:
        for sentence in f_in:
            # print(mecab.parse(sentence))
            f_out.write(mecab.parse(sentence).strip())
            f_out.write('\n')
print("Finish line")

# from janome.tokenizer import Tokenizer as janome_tokenizer
#
# with open("/Users/ff/Desktop/train_data/jp/jp_web.txt", 'r', encoding='utf-8') as f_in:
#     with open("/Users/ff/Desktop/train_data/jp/jo_web_split_token3.txt", 'w', encoding='utf-8') as f_out:
#         for sentence in f_in:
#             # sentence = "日本人のものと見られる、延べ2億件のメールアドレスとパスワードが闇サイトで販売されていたことがわかりました。過去に漏えいしたデータを集めたものと見られ、調査に当たったセキュリティー企業は、日本を狙ったサイバー攻撃のきっかけになるおそれがあるとして注意を呼びかけています。"
#             token_object = janome_tokenizer()
#             alist = [x.surface for x in token_object.tokenize(sentence)]
#             print(" ".join(alist))
#             f_out.write(" ".join(alist).strip())
#             f_out.write('\n')
def build_vocabulary_per_proc(sentence_files, total_proc, vocab_dict_file,
                              proc_no):

    # Load the NG (stop) word list from a file.
    NG_WORD_LIST = []
    with open(G.NG_WORD_LIST, mode='rt') as f:
        NG_WORD_LIST = list(f)

    tmp_sentence_files = []
    dict_file_no = 0
    mecab = MeCab.Tagger('-d /usr/lib/mecab/dic/mecab-ipadic-neologd')
    mecab.parse('')

    # If there are fewer sentence files than total processes, cap the process count at the number of files.
    if len(sentence_files) < total_proc:
        total_proc = len(sentence_files)

    processed_files = 0

    # Read document file paths in batches of the process count and handle the file assigned to this process number.
    for tmp_sentence_file in sentence_files:
        tmp_sentence_files.append(tmp_sentence_file)
        # Once a batch of file paths has been collected, process the one for this process number.
        if len(tmp_sentence_files) == total_proc or (
                len(sentence_files) - processed_files) < total_proc:
            processed_files += total_proc

            if len(tmp_sentence_files) < (proc_no) + 1:
                break

            # Get the target document path and build the path of the file that will hold its tokenized (wakati) output.
            sentence_file = tmp_sentence_files[proc_no]
            sentence_wakati_file = wakati_files_dir + os.path.basename(
                sentence_file) + '_wakati'

            tmp_sentence_files = []
            vocabulary = dict()

            print("proc_no:{} file:{}".format(proc_no, sentence_file))

            with open(sentence_wakati_file, mode='w') as swf:
                sentences = Sentences(sentence_file)

                print("Generating Vocabulary from the sentences")

                train_words = 0
                sentence_procs = []
                counter = 0
                for sentence in sentences:
                    wakati_line = []
                    counter += 1
                    if counter % 100000 == 0:
                        print("proc:{}  counter:{}".format(proc_no, counter))

                    node = mecab.parseToNode(sentence)

                    while node:
                        word = node.surface
                        wakati_line.append(word)

                        # Skip NG (stop) words.
                        if word in NG_WORD_LIST:
                            node = node.next
                            continue

                        # Skip words shorter than min_char characters.
                        if len(word) < G.min_char:
                            node = node.next
                            continue

                        # Skip single-character digits and alphabetic characters.
                        if len(word) == 1 and re.match('[a-zA-Z0-9]', word):
                            node = node.next
                            continue

                        pos1 = node.feature.split(',')[0]
                        pos2 = node.feature.split(',')[1]

                        # Train on nouns other than numerals, and on independent verbs/adjectives longer than the minimum character count.
                        if (pos1 == '名詞' and pos2 != '数' and pos2 != '非自立') or \
                           (pos1 == '動詞' and pos2 == '自立' and len(word) > G.min_char) or \
                           (pos1 == '形容詞' and len(word) > G.min_char):
                            vocabulary.setdefault(word, 0)
                            vocabulary[word] += 1
                            train_words += 1

                        node = node.next
                    swf.write(' '.join(wakati_line) + '\n')

                print("Vocabulary size = %d" % len(vocabulary))
                print("Total words to be trained = %d" % train_words)

                with open(vocab_dict_file + '_' + str(dict_file_no),
                          'wb') as f:
                    pickle.dump(vocabulary, f)

                dict_file_no += 1
Example #5
        # return df[df.group_id==row.group_id].loc[:,['group_id', 'domain', 'surface']]


# print(search_synonyms('巨人'))
# print(search_synonyms('Amazon'))
##########################################################
#########################################################
#  Compare the results of BoW and Doc2Vec
#  https://qiita.com/kaki_1900/items/474bf00c0720af1ff1bf
from os.path import normpath, dirname, join
import os
import MeCab
import unicodedata
import neologdn

tagger = MeCab.Tagger()


def tokenize(text):
    text = unicodedata.normalize('NFKC', text)  # <1> Unicode normalization
    text = neologdn.normalize(text)  # <2> neologdn normalization (full-/half-width)
    text = text.lower()  # <3> lowercase everything

    node = tagger.parseToNode(text)
    result = []
    while node:
        features = node.feature.split(',')

        if features[0] != 'BOS/EOS':
            if features[0] not in ['助詞', '助動詞', '記号']:  # <4> remove stop words (particles, auxiliaries, symbols)
                token = features[6] \
# coding: utf-8
import MeCab

mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

INPUT_FILE_PATH = "./scraping.csv"
OUTPUT_FILE_PATH = "./mecab.txt"

with open(INPUT_FILE_PATH) as f:
    text = f.read()

# mecab.parse('')  # prevent the parsed string from being garbage collected
node = mecab.parseToNode(text)
while node:
    # Get the surface form
    word = node.surface
    # Get the part-of-speech detail (feature field 1)
    pos = node.feature.split(",")[1]
    tmp_str = '{0} , {1}\n'.format(word, pos)

    with open(OUTPUT_FILE_PATH, mode='a') as f:
        f.write(tmp_str)
    
    # Advance to the next token
    node = node.next
Example #7
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")

# URLs for WMT data.
_WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
_WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"

tagger = MeCab.Tagger("-Owakati")

def maybe_download(directory, filename, url):
  """Download filename from url unless it's already in directory."""
  if not os.path.exists(directory):
    print("Creating directory %s" % directory)
    os.mkdir(directory)
  filepath = os.path.join(directory, filename)
  if not os.path.exists(filepath):
    print("Downloading %s to %s" % (url, filepath))
    filepath, _ = urllib.request.urlretrieve(url, filepath)
    statinfo = os.stat(filepath)
    print("Succesfully downloaded", filename, statinfo.st_size, "bytes")
  return filepath
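A brief usage sketch for maybe_download above, reusing the WMT dev-set URL defined earlier (the local directory name is just an illustration):

dev_archive = maybe_download("wmt_data", "dev-v2.tgz", _WMT_ENFR_DEV_URL)
print("dev archive at:", dev_archive)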

Example #8
    def __init__(self):
        self.tagger = MeCab.Tagger()
Example #9
def wakati(text):
    t = MeCab.Tagger("-Owakati")
    m = t.parse(text)
    result = m.rstrip(" \n").split(" ")
    return result

    # NOTE: the lines below are unreachable because they follow the return above.
    keywords = ["アレルギー"]
    if text.find(keywords[0]) != -1:
        return 1
    else:
        return 0


if __name__ == "__main__":
    param = sys.argv

    f = open(param[1], "r")
    texts = f.read()
    f.close()

    #Parsing won't help so far
    m = MeCab.Tagger("-Owakati")
    #print parse_text(texts, m)

    #for i in dialogue:
    #print i.encode('utf-8') + separater

    #Split the text using stop words
    num = len(texts)
    stop_words = "すか たか"
    start_words = "です"

    seped = re.split('すか|です|たか|した', texts)
    #seped = re.split('すか|たか', texts)
    m = MeCab.Tagger("-Owakati")

    rand_id = random.randint(0, 100)
Example #11
def tokenize(text):
    wakati = MeCab.Tagger("-Owakati")
    wakati.parse("")
    return wakati.parse(text).strip().split()
# -*- coding: utf-8 -*-
import os
import sys
import re
from gensim import corpora, matutils
import MeCab
import time

DATA_DIR_PATH = './data/text/'
DICTIONARY_FILE_NAME = 'livedoordic.txt'
mecab = MeCab.Tagger('mecabrc')


def get_class_id(file_name):
    dir_list = get_dir_list()
    dir_name = next(filter(lambda x: x in file_name, dir_list), None)
    if dir_name:
        return dir_list.index(dir_name)
    return None


def get_dir_list():
    tmp = os.listdir(DATA_DIR_PATH)
    if tmp is None:
        return None
    return sorted([x for x in tmp if os.path.isdir(DATA_DIR_PATH + x)])


def get_file_content(file_path):
    with open(file_path, encoding='utf-8') as f:
        return ''.join(f.readlines()[2:])  # the livedoor corpus body text starts on the third line
Example #13
import MeCab
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import smart_open
import re
import gensim
from gensim import models

parser = argparse.ArgumentParser()
parser.add_argument('faq', type=str)
#parser.add_argument('model', type=str)
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()

mecab = MeCab.Tagger("-Owakati" +
                     ("" if not args.dictionary else " -d " + args.dictionary))

questions = []
sentences = []
originals = []
j = 0
for line in open(args.faq, "r", encoding="cp932"):  #utf-8
    cols = line.strip().split('\t')
    questions.append(
        gensim.utils.simple_preprocess(mecab.parse(cols[0]).strip(),
                                       min_len=1))  #1
    originals.append(cols[0])
    sentences.append(
        models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
            mecab.parse(cols[0]).strip(), min_len=1),
                                      tags=["SENT_" + str(j)]))
Example #14
"""
取得したツイートをWordCloudで可視化
"""

import csv
import MeCab
from wordcloud import WordCloud

# Reference: https://qiita.com/berry-clione/items/b3a537962c84244a2a09
dicdir = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
tagger = MeCab.Tagger(dicdir)

with open("./output/tweet_data", "r") as f:
    reader = csv.reader(f, delimiter="\t")
    texts = []
    for row in reader:
        texts.append(row)

# Build four WordCloud patterns
patterns = [[["名詞", "動詞", "形容詞"], "all"], [["名詞"], "noun"], [["動詞"], "verb"],
            [["形容詞"], "adjective"]]

# Morphological analysis (MeCab) -> WordCloud processing
for pattern in patterns:
    words = []
    for text in texts:
        text = " ".join(text)
        text = text.split("http")[0]  # http 以降はトリ(URLは最後に載せるパターンが多いため)
        node = tagger.parseToNode(text)
        while node:
            if node.feature.split(",")[0] in pattern[0]:
Example #15
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import dill 

# Initialize MeCab
mecab = MeCab.Tagger()
mecab.parse('')

sents = []
labels = []

# Read samples.dat, the output of generate-samples.txt
for line in open("da_samples.dat","r"):
    line = line.rstrip()
    # samples.dat contains the dialogue-act type, the utterance, and tags with their character positions
    da, utt = line.split('\t')
    words = []
    for line in mecab.parse(utt).splitlines():
        if line == "EOS":
            break
        else:
            # Extract the word from the MeCab output
            word, feature_str = line.split("\t")
            words.append(word)
    # Append the space-separated word sequence to sents
    sents.append(" ".join(words))
    # Append the dialogue-act type to labels
    labels.append(da)
import re
import urllib.request

import MeCab
import mojimoji

from pathlib import Path

tagger = MeCab.Tagger(
    "-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")


def get_stopwords(save_dir="input/") -> set:
    path = Path(save_dir) / "stopwords.txt"
    if path.exists():
        with open(path) as f:
            stopwords = f.read().split("\n")
        return set(stopwords)
    url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
    stopwords = urllib.request.urlopen(url).read().decode("utf8")
    with open(path, "w") as f:
        f.write(stopwords)
    return set(stopwords.split("\n"))


def tokenizer(x: str, stopwords: set, include_verb=True) -> str:
    text = mojimoji.zen_to_han(x.replace("\n", ""), kana=False)
    parsed = tagger.parse(text).split("\n")
    parsed = [t.split("\t") for t in parsed]
    parsed = list(
        filter(
Example #17
def converter(word):
    m = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

    return m.parse(word).split('\t')[1].split(',')[6]
Example #18
        wtype2 = word.split('\t')[1].split(',')[1]  # POS subcategory 1
        # Nouns are used as-is; adjectives, verbs, and adverbs use their base form
        if wtype == "名詞" and wtype2 in norns:
            return (word.split('\t')[0])
        elif wtype == "形容詞" and wtype2 in ["自立", "非自立"]:
            return (word.split('\t')[1].split(',')[6])
        elif wtype == "動詞" and wtype2 == "自立":
            return (word.split('\t')[1].split(',')[6])
        elif wtype == "副詞":
            return (word.split('\t')[1].split(',')[6])


if __name__ == "__main__":
    p = Path(__file__).parent.resolve() / "toots_log"
    file_paths = [f for f in p.iterdir()]
    m = MeCab.Tagger("-d /usr/lib/mecab/dic/mecab-ipadic-neologd")
    words = []
    # List of POS subcategory-1 values to use

    for file_path in tqdm(file_paths):
        with open(file_path, "r") as f:
            text = f.read()
        # Strip custom emoji
        text = re.sub(r":[a-zA-Z0-9_-]+:", "", text)
        # Tokenize and keep only words whose POS is adjective, verb, noun, or adverb
        words.extend(
            [get_word(word) for word in m.parse(text).splitlines()[:-1]])
    words = collections.Counter(words)
    words = pd.DataFrame(words, index=["num"])
    words = words.T
    words = words[words["num"] > 199]
Example #19
    def onExecute(self, ec_id):
        while self._xmldataIn.isNew():
            data = self._xmldataIn.read()
            data.data = data.data.decode('utf-8')
            speechdata = BeautifulSoup(data.data, "lxml")
            totaldata = []

            for data.data in speechdata.findAll('data'):
                rank = int(data.data['rank'])
                score = float(data.data['score'])
                text = data.data['text']
                xmldata = XMLSet(rank, score, text.encode("utf-8"))
                totaldata.append(xmldata)

            totalxmldata = sorted(totaldata, key=lambda x: x[1], reverse=True)
            print str(totalxmldata).decode('string-escape')
            highxmldata = totalxmldata[0]
            print(highxmldata[2])

            intextdata = highxmldata[2]

            taggerwakati = MeCab.Tagger("-Owakati")
            data_wakati = taggerwakati.parse(intextdata)
            list_wakati = data_wakati.split(' ')
            print str(list_wakati).decode('string-escape')
            self._d_wakati.data = list_wakati
            self._wakatiOut.write()

            taggerchasen = MeCab.Tagger("-Ochasen")
            taggerchasen.parse('')
            node = taggerchasen.parseToNode(intextdata)
            chasendata = []
            while node:
                resorg = node.feature.split(",")[6]
                ps = node.feature.split(",")[0]
                if ps in ("名詞", "動詞", "形容詞", "副詞", "助詞",
                          "接続詞", "助動詞", "連体詞", "感動詞"):
                    chasendata.append(resorg)
                node = node.next
            chasendata.append("\n")

            print str(chasendata).decode('string-escape')
            self._d_chasen.data = chasendata
            self._chasenOut.write()

        while self._strdataIn.isNew():

            intext = self._strdataIn.read()
            intextdata = intext.data

            taggerwakati = MeCab.Tagger("-Owakati")
            data_wakati = taggerwakati.parse(intextdata)
            list_wakati = data_wakati.split(' ')
            print str(list_wakati).decode('string-escape')
            self._d_wakati.data = list_wakati
            self._wakatiOut.write()

            taggerchasen = MeCab.Tagger("-Ochasen")
            taggerchasen.parse('')
            node = taggerchasen.parseToNode(intextdata)
            chasendata = []
            while node:
                resorg = node.feature.split(",")[6]
                ps = node.feature.split(",")[0]
                if ps in ("名詞", "動詞", "形容詞", "副詞", "助詞",
                          "接続詞", "助動詞", "連体詞", "感動詞"):
                    chasendata.append(resorg)
                node = node.next
            chasendata.append("\n")

            print str(chasendata).decode('string-escape')
            self._d_chasen.data = chasendata
            self._chasenOut.write()

        return RTC.RTC_OK
Example #20
from os import path
from typing import Optional
from flask import Flask, abort, request, Response
import MeCab


CONFIG_PATH = path.join(path.dirname(path.abspath(__file__)), 'flask.cfg')
DIC_DIR = path.join('/', 'usr', 'local', 'lib', 'mecab', 'dic')


# Flask Application
app = Flask(__name__)
app.config.from_pyfile(CONFIG_PATH)

# MeCab
mecab = MeCab.Tagger(f"-d {path.join(DIC_DIR, 'mecab-ipadic-neologd')}")

@app.route('/', methods=['GET', 'POST'])
def parse():
    """Morphological Analysis by MeCab.

    Request Format:
        GET: /?sentence=アルミ缶の上にあるみかん
        POST: / -X "Content-Type: application/json"
            { "sentence": "アルミ缶の上にあるみかん" }
    """
    # STEP.1 Extraction of a given sentence
    sentence: Optional[str] = None
    try:
        if request.method == 'POST':
            sentence = request.json['sentence']
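A hedged usage sketch for the endpoint above, assuming the Flask app is served locally on its default port 5000 (host and port are assumptions, not part of the original):

import json
import urllib.parse
import urllib.request

BASE_URL = 'http://127.0.0.1:5000/'
SENTENCE = 'アルミ缶の上にあるみかん'

# GET with a query parameter
with urllib.request.urlopen(BASE_URL + '?sentence=' + urllib.parse.quote(SENTENCE)) as res:
    print(res.read().decode('utf-8'))

# POST with a JSON body
req = urllib.request.Request(
    BASE_URL,
    data=json.dumps({'sentence': SENTENCE}).encode('utf-8'),
    headers={'Content-Type': 'application/json'},
)
with urllib.request.urlopen(req) as res:
    print(res.read().decode('utf-8'))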
Example #21
# coding: UTF-8

import MeCab

fin = open('neko.txt.mecab')
lines = fin.readlines()  # read every line up to EOF (newline characters are kept)
fin.close()
# lines: a list whose elements are single-line strings

tagger = MeCab.Tagger("-Ochasen")
sentence = []
morpheme_set = []
morpheme = {}

for line in lines:
    morpheme_list = line.split('\t')
    surface = morpheme_list[0]
    if surface == "EOS\n":
        morpheme_set.append(sentence)
        sentence = []
    else:
        morpheme["surface"] = surface
        feature = morpheme_list[1].split(',')
        morpheme["base"] = feature[6]
        morpheme["pos"] = feature[0]
        morpheme["pos1"] = feature[1]
        sentence.append(morpheme.copy())

for sentence in morpheme_set:
    for morpheme in sentence:
        if (morpheme["pos"] == "動詞"):
Example #22
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unicodedata

import nltk
import MeCab

MECAB = MeCab.Tagger("-Owakati")

JA_SYMBOLS = u'!?!?。☆★♡♥❤♪♬♫✿'
JA_TOKENIZER = nltk.tokenize.RegexpTokenizer(u'[^{0}]*([{0}]+|$)'.format(JA_SYMBOLS))
EN_SYMBOLS = u'☆★♡♥❤♪♬♫✿'
EN_TOKENIZER = nltk.tokenize.RegexpTokenizer(u'[^{0}]*([{0}]+|$)'.format(EN_SYMBOLS))

def normalize(s):
    return unicodedata.normalize('NFKC', s)

def sent_tokenize_ja(s):
    '''returns a list of strings'''
    return JA_TOKENIZER.tokenize(s)[:-1]

def sent_tokenize_en(s):
    '''returns a list of strings'''
    sentences = EN_TOKENIZER.tokenize(s)[:-1]
    sentencess = map(lambda sent: nltk.sent_tokenize(sent.strip()), sentences)
    return [sentence for sentences in sentencess for sentence in sentences]

def sent_tokenize(s, lang=None):
    if lang == 'en':
        return sent_tokenize_en(normalize(s))
Example #23
import logging
from datetime import datetime
import urllib
from requests_oauthlib import OAuth1Session
import MeCab
import markovify

# Image draw
import io
from PIL import Image, ImageDraw, ImageFont

# Framework imports for the objects used below (Django / Django REST framework)
from django.conf import settings
from django.shortcuts import redirect
from rest_framework import status
from rest_framework.response import Response
from rest_framework.views import APIView

from . import generate_model

logger = logging.getLogger('django')
mec = MeCab.Tagger("-r /dev/null -d /usr/lib/mecab/dic/mecab-ipadic-neologd -O wakati")

class AuthRedirectAPIView(APIView):
    def get(self, request):
        if 'callback' not in request.query_params:
            return Response(
                {
                    'message': 'callback URL not specified!'
                },
                status.HTTP_400_BAD_REQUEST
            )
        oauth = OAuth1Session(settings.TWITTER_API_CONKEY, settings.TWITTER_API_CONSEC, None, None, request.query_params['callback'])
        oauth.fetch_request_token("https://api.twitter.com/oauth/request_token")
        url = oauth.authorization_url("https://api.twitter.com/oauth/authenticate")
        return redirect(url)
Example #24
import MeCab

m = MeCab.Tagger()
out = m.parse("안녕하세요")
print(out)
Example #25
import MeCab

wakati = MeCab.Tagger('-Owakati')  # plain word segmentation
neo_wakati = MeCab.Tagger(
    '-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # apply the additional neologd dictionary
word = input("分かち書き:")

wakati = wakati.parse(word).strip()
neo_wakati = neo_wakati.parse(word).strip()

print('通常辞書:' + wakati)
print('追加辞書:' + neo_wakati)
import tensorflow as tf
from seq2seq_model import Seq2SeqModel
import json
import MeCab
import os
import numpy as np
from util import parse_file, sentence_to_word_id, create_buckets, _buckets, EOS, ignore_list

tagger = MeCab.Tagger("mecabrc")

id2word = json.load(open("dictionary_i2w.json", "r"))
word2id = json.load(open("dictionary_w2i.json", "r"))

_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

# Load the training data and the dictionaries
questions, answers, _, _ = parse_file("../data/conversation_data.txt")
# Convert the sentences into arrays of word ids
print(questions)
ids_questions = sentence_to_word_id(questions, word2id=word2id)
print(ids_questions)

vocab_size = len(word2id) + 3
print(vocab_size)

ckpt = tf.train.get_checkpoint_state("./tmp")
print(ckpt)
print(tf.train.checkpoint_exists("./tmp/model.ckpt-5000"))

with tf.Session() as sess:
    print('init model')
Example #27
import MeCab
mecab = MeCab.Tagger('-Ochasen')
print(mecab.parse('このソフトクリームとってもおいしくない'))

#mecabrc: (no arguments; default output format)
#-Ochasen: ChaSen-compatible output format
#-Owakati: output the word segmentation only
#-Oyomi: output the readings only
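As a quick way to compare these output formats, a minimal sketch (it assumes a default IPA-style dictionary is installed, so the exact analyses may vary by environment):

import MeCab

text = 'このソフトクリームとってもおいしくない'
for option in ('-Ochasen', '-Owakati', '-Oyomi'):
    print('--- MeCab.Tagger("{}")'.format(option))
    print(MeCab.Tagger(option).parse(text))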
Example #28
    def sort_random_list(self, phraseList):
        words = ""
        # random.shuffle(phraseList)
        for phrase in phraseList:
            for word in phrase:
                words = words + word
        return words


# test
if __name__ == '__main__':

    maker = Maker()
    m = MeCab.Tagger(
        r'-Owakati -d C:\Users\hori\workspace\encoder-decoder-sentence-chainer-master\mecab-ipadic-neologd'
    )
    """
    parser = Parser()
    
    wordsList = []
    sentenceList = []
    for i in range(20000):
        index = random.randint(1, 18)
        words, sentence = maker.generate_word_sentence(index)
        words = m.parse(words)[:-2].split(' ')
        sentence = m.parse(sentence)[:-2].split(' ')
        for i in range(len(words)):
            w = words[i]
            words[i] = parser.parse(w)
        for i in range(len(sentence)):
Example #29
    def parse(self, sentence):
        me = MeCab.Tagger()
        s = me.parse(sentence)
        return s
Example #30
import MeCab

mecab = MeCab.Tagger("-Ochasen")


# Define a function that morphologically analyzes a text and returns only the nouns, verbs, and adjectives (in base form) as a list
def extract_words(text):
    node = mecab.parseToNode(text)
    words = []
    while node:
        word = node.feature.split(",")[6]  # base form
        word_type = node.feature.split(",")[0]  # part of speech
        #print(word + ": " + word_type)
        if word_type in ["名詞", "動詞", "形容詞"]:
            words.append(word)
            #print(word)
        node = node.next
    return words


#  Test the function
text = '三四郎は京都でちょっと用があって降りたついでに。誰かが困っている時に来るのです。'

# Split the whole text into a list at the Japanese full stop ('。').
sentences = text.split('。')
# Convert each sentence into a word list (this takes a few minutes)
word_list = [extract_words(sentence) for sentence in sentences]

# Check part of the results
for word in word_list[1]:
    print(word)