def main(): args = sys.argv batch_size = 128 epochs = 100 maxlen = 300 model_path = 'models/cnn_model.h5' num_words = 40000 num_label = 2 x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv') x = preprocess_dataset(x) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vocab = build_vocabulary(x_train, num_words) x_train = vocab.texts_to_sequences(x_train) x_test = vocab.texts_to_sequences(x_test) x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post') x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post') emb_flg = args[0] if emb_flg == 't': wv = load_fasttext('../chap08/models/cc.ja.300.vec.gz') wv = filter_embeddings(wv, vocab.word_index, num_words) else: wv = None model = CNNModel(num_words, num_label, embeddings=wv).build() model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc']) callbacks = [ EarlyStopping(patience=3), ModelCheckpoint(model_path, save_best_only=True) ] model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbacks, shuffle=True) model = load_model(model_path) api = InferenceAPI(model, vocab, preprocess_dataset) y_pred = api.predict_from_sequences(x_test) print('precision: {:.4f}'.format( precision_score(y_test, y_pred, average='binary'))) print('recall : {:.4f}'.format( recall_score(y_test, y_pred, average='binary'))) print('f1 : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
def main(): batch_size = 128 epochs = 100 maxlen = 300 model_path = "cnn_model.h5" num_words = 40000 num_label = 2 x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv") x = preprocess_dataset(x) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vocab = build_vocabulary(x_train, num_words) x_train = vocab.texts_to_sequences(x_train) x_test = vocab.texts_to_sequences(x_test) x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post") x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post") wv = load_fasttext("data/cc.ja.300.vec.gz") wv = filter_embeddings(wv, vocab.word_index, num_words) model = CNNModel(num_words, num_label, embeddings=wv).build() model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"]) callbakcs = [ EarlyStopping(patience=3), ModelCheckpoint(model_path, save_best_only=True) ] model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbakcs, shuffle=True) model = load_model(model_path) api = InferenceAPI(model, vocab, preprocess_dataset) y_pred = api.predict_from_sequence(x_test) print("precision: {:.4f}".format( precision_score(y_test, y_pred, average="binary"))) print("recall: {:.4f}".format( recall_score(y_test, y_pred, average="binary"))) print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))
def train(): df_tweets = pd.read_csv("data/df_tweets", index_col=0) df_tweets["text"] = preprocess_dataset(df_tweets["text"]) df_tweets = df_tweets.dropna(how='any') df_tweets = df_tweets.drop(df_tweets.index[df_tweets["Irrelevant"] == 1]) x = df_tweets["text"] # y = df_tweets[["posi_and_nega", "posi", "nega", "neutral", "Irrelevant"]] y = df_tweets[["posi_and_nega", "posi", "nega", "neutral"]] y = np.asarray(y) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vocab = build_vocabulary(x_train, num_words) with open('model/tokenizer.pickle', 'wb') as handle: pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL) x_train = vocab.texts_to_sequences(x_train) x_test = vocab.texts_to_sequences(x_test) x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post") x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post") wv = KeyedVectors.load("model/word2vec.model", mmap='r') wv = filter_embeddings(wv, vocab.word_index, num_words) model = CNNModel(num_words, num_label, embeddings=wv).build() model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"]) callbakcs = [ EarlyStopping(patience=3), ModelCheckpoint(model_path, save_best_only=True) ] model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.5, callbacks=callbakcs, shuffle=True)
def main(): # ハイパーパラメータの設定 batch_size = 32 epochs = 100 model_path = 'models/attention_model.h5' enc_arch = 'models/encoder.json' dec_arch = 'models/decoder.json' data_path = 'data/jpn.txt' num_words = 10000 num_data = 20000 # データ・セット読み込み en_texts, ja_texts = load_dataset(data_path) en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data] # データ・セットの前処理 ja_texts = preprocess_ja(ja_texts) ja_texts = preprocess_dataset(ja_texts) en_texts = preprocess_dataset(en_texts) x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts, test_size=0.2, random_state=42) en_vocab = build_vocabulary(x_train, num_words) ja_vocab = build_vocabulary(y_train, num_words) x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab) # モデルの構築 encoder = Encoder(num_words, return_sequences=True) decoder = AttentionDecoder(num_words) seq2seq = Seq2seq(encoder, decoder) model = seq2seq.build() model.compile(optimizer='adam', loss='sparse_categorical_crossentropy') # コールバックの用意 callbacks = [ EarlyStopping(patience=3), ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True) ] # モデルの学習 model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks, validation_split=0.1) encoder.save_as_json(enc_arch) decoder.save_as_json(dec_arch) # 予測 encoder = Encoder.load(enc_arch, model_path) decoder = Decoder.load(dec_arch, model_path) api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab) texts = sorted(set(en_texts[:50]), key=len) for text in texts: decoded = api.predict(text=text) print('English : {}'.format(text)) print('Japanese: {}'.format(decoded)) # 性能評価 y_test = [y.split(' ')[1:-1] for y in y_test] bleu_score = evaluate_bleu(x_test, y_test, api) print('BLEU: {}'.format(bleu_score))
def main(): # Set hyper-parameters. batch_size = 32 epochs = 100 model_path = 'atmodel.h5' enc_arch = 'encoder.json' dec_arch = 'decoder.json' data_path = '../data/w16to19hukusimaconv.txt' num_words = 7000 num_data = 4367 # Data loading. en_texts, ja_texts = load_dataset(data_path) en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data] # Preprocessings. #ja_texts = preprocess_ja(ja_texts) ja_texts = preprocess_dataset(ja_texts) en_texts = preprocess_dataset(en_texts) x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts, test_size=0.2, random_state=42) en_vocab = build_vocabulary(x_train, num_words) ja_vocab = build_vocabulary(y_train, num_words) print(x_train[:3]) print(y_train[:3]) x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab) print(en_vocab.word_index) print(ja_vocab.word_index) # Build a simple model. encoder = Encoder(num_words) decoder = Decoder(num_words) # Build an attention model. #encoder = Encoder(num_words, return_sequences=True) #decoder = AttentionDecoder(num_words) seq2seq = Seq2seq(encoder, decoder) model = seq2seq.build() model.compile(optimizer='adam', loss='sparse_categorical_crossentropy') # Train the model. callbacks = [ EarlyStopping(patience=10), ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True) ] """ model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks, validation_split=0.1)""" encoder.save_as_json(enc_arch) decoder.save_as_json(dec_arch) # Inference. encoder = Encoder.load(enc_arch, model_path) decoder = Decoder.load(dec_arch, model_path) api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab) #api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab) texts = sorted(set(en_texts[:50]), key=len) texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません", "全く 納得 い き ません", "ありがとうございました", "おはようございます",\ "よろしいでしょうか", "是非 よろしくお願いいたします", "もう少し 具体的に 教えて いただける と 助 か る んですけれども", "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",\ "正 式 な 要求 でいい んですか", "時間ですので まとめて ください", "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",\ "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね", "それは いつ ごろ でしょうか", "そのとおり です" ] for text in texts: decoded = api.predict(text=text) print('入力: {}'.format(text)) print('応答: {}'.format(decoded)) y_test = [y.split(' ')[1:-1] for y in y_test] bleu_score = evaluate_bleu(x_test, y_test, api) print('BLEU: {}'.format(bleu_score))
def main(): # ハイパーパラメータの背一定 batch_size = 128 epochs = 100 maxlen = 300 # model_path = 'models/rnn_model.h5' # model_path = 'models/lstm_model.h5' # model_path = 'models/CNN_model.h5' model_path = 'models/latm_iniemb_model.h5' num_words = 4000 num_label = 2 # データ・セットの読み込み x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv') # データセットの前処理 x = preprocess_dataset(x) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) vocab = build_vocabulary(x_train, num_words) x_train = vocab.texts_to_sequences(x_train) x_test = vocab.texts_to_sequences(x_test) x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post') x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post') # 単語分散表現 wv = load_fasttext('data/cc.ja.300.vec') wv = filter_embeddings(wv, vocab.word_index, num_words) # モデルの構築 # model = RNNModel(num_words, num_label, embeddings=None).build() model = LSTMModel(num_words, num_label, embeddings=wv).build() # model = CNNModel(num_words, num_label, embeddings=None).build() model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc']) # コールバックの用意 callbacks = [ EarlyStopping(patience=3), ModelCheckpoint(model_path, save_best_only=True) ] # モデルの学習 model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbacks, shuffle=True) # 予測 model = load_model(model_path) api = InferenceAPI(model, vocab, preprocess_dataset) y_pred = api.predict_from_sequences(x_test) print('precision: {:.4f}'.format( precision_score(y_test, y_pred, average='binary'))) print('recall: {:.4f}'.format( recall_score(y_test, y_pred, average='binary'))) print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
from utils import load_data if __name__ == '__main__': # ハイパーパラメータの設定 emb_dim = 50 epochs = 10 model_path = 'model.h5' negative_samples = 1 num_words = 10000 window_size = 1 # コーパスの読み込み text = load_data(filepath='data/ja.text8') # ボキャブラリの構築 vocab = build_vocabulary(text, num_words) # データ・セットの作成 x, y = create_dataset(text, vocab, num_words, window_size, negative_samples) # モデルの構築 model = EmbeddingModel(num_words, emb_dim) model = model.build() model.compile(optimizer='adam', loss='binary_crossentropy') # コールバックの用意 callbacks = [ EarlyStopping(patience=1), ModelCheckpoint(model_path, save_best_only=True) ]