Code Example #1
def main():
    args = sys.argv

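    # Set hyper-parameters.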
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/cnn_model.h5'
    num_words = 40000
    num_label = 2

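    # Data loading.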
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

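    # Pre-processing.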
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Prepare word embeddings: sys.argv[0] is the script name, so the flag is
    # the first real command-line argument ('t' enables pretrained fastText).
    emb_flg = args[1] if len(args) > 1 else 'f'
    if emb_flg == 't':
        wv = load_fasttext('../chap08/models/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
    else:
        wv = None

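    # Build the model.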
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

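    # Prepare callbacks.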
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

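    # Train the model.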
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

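    # Prediction and evaluation.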
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)

    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall   : {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1       : {:.4f}'.format(
        f1_score(y_test, y_pred, average='binary')))
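
Note: build_vocabulary is called in the examples but not defined in these listings. Since the returned object exposes texts_to_sequences and word_index, it is presumably a thin wrapper around the Keras Tokenizer; the following is a minimal sketch under that assumption, not the project's actual implementation.

from tensorflow.keras.preprocessing.text import Tokenizer

def build_vocabulary(texts, num_words=None):
    # Fit a tokenizer on the training texts; words outside the top num_words
    # are mapped to the '<UNK>' out-of-vocabulary token.
    tokenizer = Tokenizer(num_words=num_words, oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer
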
Code Example #2
def main():
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = "cnn_model.h5"
    num_words = 40000
    num_label = 2

    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv")

    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    wv = load_fasttext("data/cc.ja.300.vec.gz")
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["acc"])

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print("precision: {:.4f}".format(
        precision_score(y_test, y_pred, average="binary")))
    print("recall: {:.4f}".format(
        recall_score(y_test, y_pred, average="binary")))
    print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))
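
Note: filter_embeddings is likewise used but not defined here. Judging from its call sites, it takes the loaded fastText vectors and the tokenizer's word_index and returns an embedding matrix restricted to the top num_words entries. A rough, hypothetical sketch, assuming gensim KeyedVectors and 300-dimensional vectors:

import numpy as np

def filter_embeddings(wv, word_index, num_words, dim=300):
    # Row i holds the pretrained vector of the word with tokenizer index i;
    # words missing from the fastText vocabulary keep a zero vector.
    embedding_matrix = np.zeros((num_words, dim))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        if word in wv:
            embedding_matrix[i] = wv[word]
    return embedding_matrix
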
Code Example #3
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]
    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
Code Example #4
File: train.py  Project: greatestgoat/nlp_knowledge
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Load the dataset.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Preprocess the dataset.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
Code Example #5
def main():
    # Set hyper-parameters.
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # Load the corpus.
    text = load_data(filepath='../chap04/data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocablary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x, y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
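
Note: the create_dataset call above, together with window_size, negative_samples and the binary_crossentropy loss, is the standard skip-gram-with-negative-sampling setup. The helper itself is not shown; a plausible sketch built on Keras' skipgrams utility, assuming vocab behaves like a fitted Tokenizer and text is one whitespace-separated string:

import numpy as np
from tensorflow.keras.preprocessing.sequence import skipgrams

def create_dataset(text, vocab, num_words, window_size, negative_samples):
    # Convert the corpus to word ids, then draw observed (target, context)
    # pairs labelled 1 and negatively sampled pairs labelled 0.
    sequence = vocab.texts_to_sequences([text])[0]
    pairs, labels = skipgrams(sequence,
                              vocabulary_size=num_words,
                              window_size=window_size,
                              negative_samples=negative_samples)
    pairs = np.array(pairs)
    x = [pairs[:, 0], pairs[:, 1]]  # two inputs: target ids and context ids
    y = np.array(labels, dtype='int32')
    return x, y
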
Code Example #6
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    # ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    texts = [
        "お聞きしたいと思います", "さっき の 答弁 全く 納得 できません",
        "全く 納得 い き ません", "ありがとうございました", "おはようございます",
        "よろしいでしょうか", "是非 よろしくお願いいたします",
        "もう少し 具体的に 教えて いただける と 助 か る んですけれども",
        "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",
        "正 式 な 要求 でいい んですか", "時間ですので まとめて ください",
        "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",
        "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね",
        "それは いつ ごろ でしょうか", "そのとおり です"
    ]
    for text in texts:
        decoded = api.predict(text=text)
        print('Input   : {}'.format(text))
        print('Response: {}'.format(decoded))

    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
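
Note: evaluate_bleu is external to these listings. Given the token-list references produced by y.split(' ')[1:-1] above, one plausible reconstruction uses NLTK's corpus_bleu; the sketch below is an assumption about its behaviour, not the original helper.

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def evaluate_bleu(x_test, y_test, api):
    # Decode every test source sentence and score the whole corpus;
    # each sentence has exactly one reference here.
    hypotheses = [api.predict(text=text).split(' ') for text in x_test]
    references = [[ref] for ref in y_test]
    smoothing = SmoothingFunction().method1
    return corpus_bleu(references, hypotheses, smoothing_function=smoothing)
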
Code Example #7
def main():
    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    # model_path = 'models/rnn_model.h5'
    # model_path = 'models/lstm_model.h5'
    # model_path = 'models/CNN_model.h5'
    model_path = 'models/lstm_iniemb_model.h5'
    num_words = 4000
    num_label = 2

    # Load the dataset.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Preprocess the dataset.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Word embeddings.
    wv = load_fasttext('data/cc.ja.300.vec')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model.
    # model = RNNModel(num_words, num_label, embeddings=None).build()
    model = LSTMModel(num_words, num_label, embeddings=wv).build()
    # model = CNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(
        precision_score(y_test, y_pred, average='binary')))
    print('recall: {:.4f}'.format(
        recall_score(y_test, y_pred, average='binary')))
    print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
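
Note: Code Example #7 selects an LSTMModel initialized with the fastText matrix. The class is defined elsewhere; below is a minimal sketch of what its build() might return, assuming a Keras Sequential model, a 300-dimensional embedding and a softmax output compatible with the sparse_categorical_crossentropy loss used above.

from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

class LSTMModel:
    def __init__(self, num_words, num_label, embeddings=None, emb_dim=300):
        self.num_words = num_words
        self.num_label = num_label
        self.embeddings = embeddings
        self.emb_dim = emb_dim

    def build(self):
        # Initialize the embedding layer from the pretrained matrix if given.
        if self.embeddings is not None:
            emb = Embedding(self.num_words, self.emb_dim, mask_zero=True,
                            embeddings_initializer=Constant(self.embeddings))
        else:
            emb = Embedding(self.num_words, self.emb_dim, mask_zero=True)
        model = Sequential([
            emb,
            LSTM(128),
            Dense(self.num_label, activation='softmax')
        ])
        return model
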
Code Example #8
def main():
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build a simple model.
    print(return_time(startime), "3. Build model ...")
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    print(return_time(startime), "4. Start training ...")
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    print(return_time(startime), "5. Evaluation")
    print("***********************************")
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))
        print()

    print(return_time(startime), "6. Calculating BLEU score ...")
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))

    print(return_time(startime), "7. Finished!")
Code Example #9
File: train.py  Project: greatestgoat/nlp_knowledge
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/simple_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Load the dataset.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocess the dataset.
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Prediction.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Code Example #10
def main():
    # Note: this excerpt omits the hyper-parameter block of the original file
    # (emb_dim, epochs, model_path, num_words, window_size, negative_samples);
    # see Code Example #5 for the same pipeline with those values set.
    text = load_data(filepath='data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocabulary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x,
              y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
Code Example #11
def main(args):
    print(args)
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/model_{}.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Preparing word embedding.
    if args.loadwv:
        print(return_time(startime), "3. Loading word embedding ...")
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        if os.path.exists(wv_path):
            wv = np.load(wv_path)
            print(return_time(startime), "Loaded word embedding successfully!")
        else:
            print(return_time(startime), "Word embedding file doesn't exist")
            exit()

    else:
        print(return_time(startime), "3. Preparing word embedding ...")
        wv = load_fasttext('data/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)
        # Saving word embedding.
        if args.savewv:
            wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
            np.save(wv_path, wv)
            print(return_time(startime), "Saved word embedding successfully!", wv_path)

    # Build models.
    models = [
        RNNModel(num_words, num_label, embeddings=None).build(),
        LSTMModel(num_words, num_label, embeddings=None).build(),
        CNNModel(num_words, num_label, embeddings=None).build(),
        RNNModel(num_words, num_label, embeddings=wv).build(),
        LSTMModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv, trainable=False).build()
    ]

    model_names = [
        "RNN-None",
        "LSTM-None",
        "CNN-None",
        "RNN-wv",
        "LSTM-wv",
        "CNN-wv",
        "CNN-wv-notrain"
    ]

    print(return_time(startime), "4. Start training ...")
    for i, model in enumerate(models):
        print("***********************************")
        print(return_time(startime), "Model:", model_names[i])

        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(model_names[i]), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.2,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(model_names[i]))
        api = InferenceAPI(model, vocab, preprocess_dataset)
        y_pred = api.predict_from_sequences(x_test)
        print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
        print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
        print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
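
Note: every classification example above evaluates through InferenceAPI.predict_from_sequences. A plausible sketch of that class follows, with names and details assumed rather than taken from the source projects.

import numpy as np

class InferenceAPI:
    def __init__(self, model, vocab, preprocess):
        self.model = model            # trained Keras model
        self.vocab = vocab            # fitted vocabulary / tokenizer
        self.preprocess = preprocess  # text preprocessing function

    def predict_from_sequences(self, sequences):
        # The test data is already indexed and padded, so the model can be
        # applied directly; return the argmax class id for each sample.
        probs = self.model.predict(sequences)
        return np.argmax(probs, axis=-1)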