Example #1
def train(embedding='bert', trial='trial', ckpt=None, verbose=False, plot=False):
    x_train = np.load('data/x_train_' + embedding + '.npy', allow_pickle=True)
    y_train = np.load('data/y_train_' + embedding + '.npy', allow_pickle=True)
    x_val = np.load('data/x_dev_' + embedding + '.npy', allow_pickle=True)
    y_val = np.load('data/y_dev_' + embedding + '.npy', allow_pickle=True)
    if verbose:
        print(y_train.shape)
        print(x_train.shape)

    model = lstm_model(_SEQ_SHAPE, load_weights=ckpt, verbose=verbose)
    
    save_best_model = ModelCheckpoint('models/' + trial + '_weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                                      monitor='val_loss', verbose=0, save_best_only=True, period=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=0)
    #tensorboard = TensorBoard(log_dir='logs', write_graph=True)

    out = model.fit(x_train, y_train, epochs=20,
                    callbacks=[save_best_model, early_stopping],  # tensorboard callback omitted above
                    batch_size=64, validation_data=(x_val, y_val),
                    verbose=2 if verbose else 0)
    if plot:
        # Summarize history for loss
        plt.figure()
        plt.plot(out.history['loss'])
        plt.plot(out.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()
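
All of these snippets call a project-local lstm_model helper that is not shown on this page. A minimal sketch of a definition compatible with Example #1, assuming _SEQ_SHAPE is a (timesteps, features) tuple and a five-class softmax matching the labels in Example #4; the layer sizes are illustrative only:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def lstm_model(seq_shape, load_weights=None, verbose=False):
    # Sketch only: one LSTM layer feeding a 5-way softmax classifier
    model = Sequential([
        LSTM(128, input_shape=seq_shape),
        Dense(5, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    if load_weights is not None:
        model.load_weights(load_weights)  # resume from a saved checkpoint
    if verbose:
        model.summary()
    return model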
Example #2
def train(sequence_length,
          image_shape,
          batch_size,
          nb_epoch):
    
    filepath = os.path.join('data', 'checkpoints', 'ConvLSTM.{epoch:03d}-{mse:.5f}.hdf5')
    
    # helper: save model 
    checkpointer = ModelCheckpoint(filepath=filepath,
                                   monitor='mse',
                                   verbose=2,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto')
    
    # helper: stop training when model converges
    early_stopper = EarlyStopping(monitor='mse',
                                  min_delta=0,
                                  patience=10,
                                  restore_best_weights=True)
    
    # Get the training data
    data = DataSet(
        sequence_length=sequence_length,
        image_shape=image_shape)
    
    # Get steps per epoch.
    # 70% of the data is assumed to be the training split
    steps_per_epoch = (len(data.data) * 0.70) // batch_size
    # 30% of the data is assumed to be the validation split
    validation_steps = (len(data.data) * 0.30) // batch_size

    # Data generators
    generator = data.frame_generator(batch_size, 'train', augment=True)
    val_generator = data.frame_generator(batch_size, 'test', augment=False)

    # Get the model
    model = lstm_model()

    # Train the model
    history = model.fit_generator(generator=generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=nb_epoch,
                                  verbose=0,
                                  callbacks=[early_stopper, checkpointer],
                                  validation_data=val_generator,
                                  validation_steps=validation_steps)

    # Close the GPU session (assumes a module-level TensorFlow `session`
    # was opened elsewhere; the name is not defined in this snippet)
    session.close()
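
fit_generator was deprecated in TensorFlow 2.1 and removed in 2.6; model.fit accepts generators directly. A sketch of the equivalent call using the snippet's own names:

history = model.fit(generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=nb_epoch,
                    verbose=0,
                    callbacks=[early_stopper, checkpointer],
                    validation_data=val_generator,
                    validation_steps=validation_steps)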
Example #3
def test(embedding='bert', ckpt=None, verbose=False):
    if ckpt is not None:
        latest_ckpt = ckpt
    else:
        list_of_files = glob.glob('models/*.hdf5')
        # picks the most recently *accessed* checkpoint; use os.path.getmtime
        # to select the most recently written one instead
        latest_ckpt = max(list_of_files, key=os.path.getatime)
    
    x_test = np.load('data/x_test_' + embedding + '.npy')
    y_test = np.load('data/y_test_' + embedding + '.npy')

    model = lstm_model(_SEQ_SHAPE, load_weights=latest_ckpt, verbose=verbose)

    scores = model.evaluate(x_test, y_test, verbose=0)
    return scores
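
model.evaluate returns the loss followed by the metrics in compile order. A hypothetical call, assuming the model was compiled with a single accuracy metric:

loss, accuracy = test(embedding='bert')
print(f'test loss {loss:.4f}, test accuracy {accuracy:.4f}')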
Example #4
def class_report(embedding='bert', ckpt=None, verbose=False):
    if ckpt is not None:
        latest_ckpt = ckpt
    else:
        list_of_files = glob.glob('models/*.hdf5')
        latest_ckpt = max(list_of_files, key=os.path.getatime)
        
    x_test = np.load('data/x_test_' + embedding + '.npy')
    y_test = np.load('data/y_test_' + embedding + '.npy')

    model = lstm_model(_SEQ_SHAPE, load_weights=latest_ckpt, verbose=verbose)
        
    # Confusion matrix and classification report
    Y_pred = model.predict(x_test, batch_size=64)
    y_pred = np.argmax(Y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)
    #print(y_true)
    print('Confusion Matrix')
    confusion = confusion_matrix(y_true, y_pred, labels=[*range(5)], normalize='true')
    print(confusion)
    print('Classification Report')
    print(classification_report(y_true, y_pred, labels=[*range(5)]))
    return confusion
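
The returned matrix is row-normalized, so each cell is a recall-style fraction. A sketch that renders it with scikit-learn's display helper (available since 0.22, the same version that introduced normalize='true'); the integer labels are placeholders for the real class names:

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

confusion = class_report(embedding='bert')
disp = ConfusionMatrixDisplay(confusion, display_labels=[*range(5)])
disp.plot(cmap='Blues', values_format='.2f')  # show two-decimal fractions
plt.title('Normalized confusion matrix')
plt.show()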
Example #5
def lstm_model(body_length, numb_epoch):
    prepro = Preprocessing()
    data = load_data()

    # Loading train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(train_stance_data, train_bodies_data)

    # Removing punctuation and stop words from the headline and body of train data
    train_headlines_cl = prepro.get_clean_data(train_headlines)
    train_bodies_cl = prepro.get_clean_data(train_bodies)
    train_stances_cl = prepro.get_clean_data(train_stances)

    # Convert labels to integer
    train_stances_in = prepro.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(test_stance_data, test_bodies_data, data_type="test")

    # Removing punctuation and stop words from the headline and body of test data
    test_headlines_cl = prepro.get_clean_data(test_headlines)
    test_bodies_cl = prepro.get_clean_data(test_bodies)

    # Set the tokenizer
    total_text = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(total_text)
    print('Number of Unique words: ' + str(len(token.word_index.keys())))

    # Convert headline and body to sequence
    train_headlines_seq = token.texts_to_sequences(train_headlines_cl)
    train_bodies_seq = token.texts_to_sequences(train_bodies_cl)
    word_index = token.word_index

    # Padding the headline and body
    train_headlines_seq = pad_sequences(train_headlines_seq, maxlen=MAX_HEADLINE_LENGTH)
    train_bodies_seq = pad_sequences(train_bodies_seq, maxlen=int(body_length))

    # One-hot encode the labels
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Splitting data to train and validation
    train_headlines_final, headlines_val, train_bodies_final, bodies_val, train_stances_final, stances_val = \
        train_test_split(train_headlines_seq, train_bodies_seq, train_stances_in, test_size=0.2, random_state=42)

    # Convert headline and body to sequence
    test_headlines_seq = token.texts_to_sequences(test_headlines_cl)
    test_bodies_seq = token.texts_to_sequences(test_bodies_cl)

    # Padding the headline and body
    test_headlines_seq = pad_sequences(test_headlines_seq, maxlen=MAX_HEADLINE_LENGTH)
    test_bodies_seq = pad_sequences(test_bodies_seq, maxlen=int(body_length))

    # Getting embedding index
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)

    print('Found %s word vectors.' % len(embeddings_index))

    # Getting embedding matrix
    embedding_matrix = models.get_embedding_matrix(embedding_dim=EMBEDDING_DIMENSION, embeddings_index=embeddings_index,
                                                   word_index=word_index)

    # Getting the model
    fake_nn = models.lstm_model(headline_length=MAX_HEADLINE_LENGTH, body_length=int(body_length),
                                embedding_dim=EMBEDDING_DIMENSION, word_index=word_index, embedding_matrix=embedding_matrix,
                                activation='relu',
                                drop_out=0.5, numb_layers=100, cells=200)

    fake_nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    # Fitting the model
    fake_hist = fake_nn.fit([train_headlines_final, train_bodies_final], train_stances_final, batch_size=128,
                            epochs=int(numb_epoch), shuffle=True, validation_data=([headlines_val, bodies_val], stances_val),
                            callbacks=[early_stopping, model_checkpoint])

    # Storing the training and validation accuracy and loss in file for plot
    lstm_data = []
    with open(os.path.join(OBJECT_DUMP, "lstm_seperate_headline_body_" + str(body_length) + ".txt"), 'wb') as bow_hist:
        lstm_data.append(fake_hist.history['acc'])
        lstm_data.append(fake_hist.history['val_acc'])
        lstm_data.append(fake_hist.history['loss'])
        lstm_data.append(fake_hist.history['val_loss'])
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for test data
    result = fake_nn.predict([test_headlines_seq, test_bodies_seq], batch_size=128)

    # Store the results in the result file
    result_str = prepro.convert_lable_string(result)
    with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file:
        test_stance = csv.DictReader(read_file)
        with io.open(RESULT_FILE + "_" + str(body_length) + ".csv", mode='w',
                     encoding='utf8') as write_file:
            writer = csv.DictWriter(write_file, fieldnames=['Headline', 'Body ID', 'Stance'])
            writer.writeheader()
            for sample, prediction in zip(test_stance, result_str):
                writer.writerow({'Body ID': sample['Body ID'], 'Headline': sample['Headline'], 'Stance': prediction})

    # Print the Accuracy, competition score and confusion matrix
    print_result("fnc-1-master/competition_test_stances.csv", RESULT_FILE + "_" + str(body_length) + ".csv")
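
The history is pickled as a four-element list in the order acc, val_acc, loss, val_loss. A sketch that reloads it for plotting, assuming OBJECT_DUMP and body_length match the values used during training:

import os
import pickle
import matplotlib.pyplot as plt

path = os.path.join(OBJECT_DUMP, "lstm_seperate_headline_body_" + str(body_length) + ".txt")
with open(path, 'rb') as bow_hist:
    acc, val_acc, loss, val_loss = pickle.load(bow_hist)

plt.plot(loss, label='train loss')
plt.plot(val_loss, label='validation loss')
plt.xlabel('epoch')
plt.legend()
plt.show()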
Example #6
def run(ycol=0, method='gru'):
    '''
    Run the main program.
    '''
    start = time.time()
    predict_win = 1
    # Data preprocessing
    if 'lstm' in method or 'gru' in method:
        timesteps = 1
    elif 'rnn' in method:
        timesteps = 8
    else:
        raise ValueError("method must contain 'lstm', 'gru', or 'rnn'")
    stock_win = dp.createWinData(price, ycol, timesteps, predict_win)
    train, test = dp.splitData(stock_win, ratio=0.9)
    scaler = StandardScaler()
    scaler.fit(train)
    train_norm = scaler.transform(train)
    test_norm = scaler.transform(test)
    #last_value = train[-1, -predict_win:]  # last value before the test set
    #np.random.shuffle(train_norm)  # shuffle the training data
    #y_tr = dp.transUpDown(train_norm[:,-predict_win:], last_value)
    y_tr = train_norm[:, -1]
    x_tr = train_norm[:, :-1]
    #y_te = dp.transUpDown(test_norm[:,-predict_win:], last_value)
    y_te = test_norm[:, -1]
    x_te = test_norm[:, :-1]
    x_tr = np.reshape(
        x_tr, (x_tr.shape[0], timesteps, int(x_tr.shape[1] / timesteps)))
    x_te = np.reshape(
        x_te, (x_te.shape[0], timesteps, int(x_te.shape[1] / timesteps)))
    y_tr = np.reshape(y_tr, (y_tr.shape[0], 1))
    y_te = np.reshape(y_te, (y_te.shape[0], 1))
    layers = [x_tr.shape[0], x_tr.shape[1], x_tr.shape[2], predict_win]
    # Build the model
    if 'lstm' in method:
        model = models.lstm_model(layers)
    elif 'rnn' in method:
        model = models.rnn_model(layers)
    elif 'gru' in method:
        model = models.gru_model(timesteps, 28)
    # Train the model
    earlystop = EarlyStopping(monitor='val_loss',
                              patience=10,
                              verbose=0,
                              mode='min')
    history = LossHistory()
    model.fit(x_tr,
              y_tr,
              epochs=200,
              batch_size=16,
              shuffle=True,
              verbose=1,
              validation_split=0.1,
              callbacks=[earlystop, history])
    score = model.evaluate(x_te, y_te, batch_size=16)
    print('test score:', score)
    #history.loss_plot('epoch')
    # Test the model
    #y_tr_pre = model.predict_classes(x_tr, batch_size = 32)
    y_te_pre = model.predict(x_te, batch_size=32)
    true = y_te * scaler.scale_[-1] + scaler.mean_[-1]
    predict = y_te_pre * scaler.scale_[-1] + scaler.mean_[-1]
    #y = np.concatenate((y_tr, y_te))
    #y = y.reshape(y.shape[0])
    #dp.plotCompare(y, y_tr_pre.reshape(y_tr_pre.shape[0]),
    #               y_te_pre.reshape(y_te_pre.shape[0]), scaler)
    dp.plotTestCompare(true, predict)
    dp.errorPercentage(true, predict)
    print('the stock:', ycol)
    print('the method:', method)
    print('timesteps:', timesteps)
    # Prediction results
    #print('train')
    #dp.plotUpDown(y_tr, y_tr_pre)
    #print('test')
    #dp.plotUpDown(y_te, y_te_pre)
    '''
    test_len = len(y_te)
    diff, diff_pre = [], []
    for i in range(test_len):
        if i == 0:
            diff.append((true[0] - last_value) * 100 / last_value)
            diff_pre.append((predict[0] - last_value) * 100 / last_value)
        else:
            diff.append((true[i] - true[i - 1]) * 100 / true[i - 1])
            diff_pre.append((predict[i] - true[i - 1]) * 100 / true[i - 1])
    plt.scatter(diff, diff_pre)
    plt.grid(True)
    plt.xlabel('diff')
    plt.ylabel('diff_pre')
    plt.title('Scatter Plot')
    plt.show()            
    diff_label = dp.transUpDown1(diff)
    diff_pre_label = dp.transUpDown1(diff_pre)
    dp.plotUpDown(diff_label, diff_pre_label)
    '''
    print('time cost:', time.time() - start)
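
The true/predict recovery (y_te * scaler.scale_[-1] + scaler.mean_[-1]) works because StandardScaler standardizes each column independently as (x - mean_) / scale_, so the target column can be restored from its own statistics alone. A self-contained check:

import numpy as np
from sklearn.preprocessing import StandardScaler

demo = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler = StandardScaler().fit(demo)
z = scaler.transform(demo)[:, -1]          # standardized target column
restored = z * scaler.scale_[-1] + scaler.mean_[-1]
print(np.allclose(restored, demo[:, -1]))  # True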
Example #7
TrueCount2 = secondWord.value_counts().index.tolist()[:10]
#FalseCount2 = secondWordFalse2.value_counts().index.tolist()[:10]

TrueCountNGram = []
for x in itertools.product(TrueCount, TrueCount2):
    k = ' '.join(x)
    TrueCountNGram.append(k)

numOfWords = 2000
tokenizerPos = Tokenizer(num_words=numOfWords)

gen = Generator(df1, TrueCount, TrueCountNGram, stop)
#genT = Generator(dfV)
X_input = next(gen.generate(6))
epochs = 5

model = lstm_model()

model.fit_generator(gen.generate(200),
                    epochs=epochs,
                    verbose=1,
                    steps_per_epoch=500)

model_yaml = model.to_yaml()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
Example #8
data_dir = os.path.join(init_path, 'datasets')
train_test_path = os.path.join(data_dir, train_test_name)
stopwords_path = os.path.join(data_dir, stopwords_name)

save_dir = os.path.join(init_path, 'saved_models')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
tokenizer_path = os.path.join(save_dir, tokenizer_name)

# Model
last_layer_options = m.get_last_layer_options(num_classes)
model = m.lstm_model(last_layer_options=last_layer_options,
                     num_features=max_features,
                     embedding_dim=embedding_dim,
                     sequence_len=sequence_len,
                     lstm_units=lstm_units,
                     dropout_rate=dropout_rate,
                     fc_units=fc_units)
model.summary()
model.compile(optimizer=optimizer, loss=last_layer_options[2], metrics=metric)

# Data
df_data = pd.read_csv(train_test_path, header=0)
print(f'Shape of the raw dataset: {df_data.shape}')
df_data['review'] = df_data['review'].astype('str')
print('Median, mean, and standard deviation of review length per category:')
df_data['len'] = df_data['review'].map(lambda x: len(x))
print(df_data[['cat', 'len']].groupby(by='cat').agg(
    ['median', 'mean', 'max', 'min', 'std']))
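
The snippet stops after the length statistics, which presumably inform the choice of sequence_len. A plausible next step (not shown in the source) that tokenizes and pads the reviews using the hyperparameters already defined above:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_data['review'])
x = pad_sequences(tokenizer.texts_to_sequences(df_data['review']),
                  maxlen=sequence_len)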
Example #9
from models import lstm_model
from models import train_model
from preprocess import load_data
from preprocess import load_fast_text_embedding
from preprocess import get_embed_weights

max_len = 50
embedding_dim = 300
tokenizer, x_train, y_train, x_test, y_test, vocab_size = load_data(
    'sarcasm_v2.csv')

embedding_index = load_fast_text_embedding('wiki-news-300d-1M-subword.vec')
embedding_matrix = get_embed_weights(embedding_index, tokenizer)
model = lstm_model(vocab_size, embedding_matrix)
train_model(model, x_train, y_train, x_test, y_test)

# half embeddings
# Validation Loss:1.128701367992565 	Validation Accuracy:0.6840490793889286
# Validation Accuracy:68.40% (+/- 0.00%)
# Train Loss:1.1293133928731907 	Train Accuracy:68.40490797546013
# Test Loss:1.110706667958593 	Test Accuracy:71.16564420834641
Example #10
if args == "1":
    print('hola2')
    # Preprocessing of data
    X_train, y_train, X_val, y_val = data_prep_task1(timesteps=60)

    # Hyperparameters
    lr = 0.001
    epochs = 100
    n_batches = 16

    # call model, compile and fit
    Model = lstm_model(40,
                       dropout=True,
                       dr=0.2,
                       n_batches=16,
                       input_size=X_train.shape[1],
                       input_dimension=1,
                       bidirectional=False)
    Model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    History = Model.fit(X_train,
                        y_train,
                        batch_size=n_batches,
                        epochs=epochs,
                        validation_data=(X_val, y_val),
                        verbose=1,
                        shuffle=False)

    # plot evaluation of results
    #plot_model(History)
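
A sketch of the commented-out evaluation step, plotting train vs. validation MAE from the History object (the key is 'mae' with metrics=['mae'] in TF 2.x; older Keras spells it 'mean_absolute_error'):

import matplotlib.pyplot as plt

plt.plot(History.history['mae'], label='train MAE')
plt.plot(History.history['val_mae'], label='validation MAE')
plt.xlabel('epoch')
plt.legend()
plt.show()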