def train(embedding='bert', trial='trial', ckpt=None, verbose=False, plot=False):
    x_train = np.load('data/x_train_' + embedding + '.npy', allow_pickle=True)
    y_train = np.load('data/y_train_' + embedding + '.npy', allow_pickle=True)
    x_val = np.load('data/x_dev_' + embedding + '.npy', allow_pickle=True)
    y_val = np.load('data/y_dev_' + embedding + '.npy', allow_pickle=True)

    if verbose:
        print(y_train.shape)
        print(x_train.shape)

    model = lstm_model(_SEQ_SHAPE, load_weights=ckpt, verbose=verbose)

    save_best_model = ModelCheckpoint(
        'models/' + trial + '_weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss', verbose=0, save_best_only=True, period=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01,
                                   patience=3, verbose=0)
    # tensorboard = TensorBoard(log_dir='logs', write_graph=True)

    out = model.fit(x_train, y_train,
                    epochs=20,
                    callbacks=[save_best_model, early_stopping],  # , tensorboard],
                    batch_size=64,
                    validation_data=(x_val, y_val),
                    verbose=2 if verbose else 0)

    if plot:
        # Summarize history for loss
        plt.figure()
        plt.plot(out.history['loss'])
        plt.plot(out.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()
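# A minimal usage sketch (not from the source): start a training run on the
# BERT-embedded data, print shapes, and plot the loss curves afterwards.
# The trial name 'bert_run1' is hypothetical; it only sets the checkpoint prefix.
if __name__ == '__main__':
    train(embedding='bert', trial='bert_run1', verbose=True, plot=True)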
def train(sequence_length, image_shape, batch_size, nb_epoch):
    filepath = os.path.join('data', 'checkpoints',
                            'ConvLSTM.{epoch:03d}-{mse:.5f}.hdf5')

    # Helper: save model checkpoints
    checkpointer = ModelCheckpoint(filepath=filepath, monitor='mse', verbose=2,
                                   save_best_only=True, save_weights_only=False,
                                   mode='auto')

    # Helper: stop training when the model converges
    early_stopper = EarlyStopping(monitor='mse', min_delta=0, patience=10,
                                  restore_best_weights=True)

    # Get the training data
    data = DataSet(sequence_length=sequence_length, image_shape=image_shape)

    # Steps per epoch, estimated from a 70/30 train-validation split
    steps_per_epoch = (len(data.data) * 0.70) // batch_size
    validation_steps = (len(data.data) * 0.30) // batch_size

    # Data generators
    generator = data.frame_generator(batch_size, 'train', augment=True)
    val_generator = data.frame_generator(batch_size, 'test', augment=False)

    # Get the model
    model = lstm_model()

    # Train the model
    history = model.fit_generator(generator=generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=nb_epoch,
                                  verbose=0,
                                  callbacks=[early_stopper, checkpointer],
                                  validation_data=val_generator,
                                  validation_steps=validation_steps)

    # Close GPU session
    session.close()
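# A usage sketch under assumed settings (sequence length, frame size, batch size,
# and epoch count below are illustrative values, not taken from the source):
train(sequence_length=40, image_shape=(80, 80, 3), batch_size=32, nb_epoch=100)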
def test(embedding='bert', ckpt=None, verbose=False):
    if ckpt is not None:
        latest_ckpt = ckpt
    else:
        list_of_files = glob.glob('models/*.hdf5')
        latest_ckpt = max(list_of_files, key=os.path.getatime)

    x_test = np.load('data/x_test_' + embedding + '.npy')
    y_test = np.load('data/y_test_' + embedding + '.npy')

    model = lstm_model(_SEQ_SHAPE, load_weights=latest_ckpt, verbose=verbose)
    scores = model.evaluate(x_test, y_test, verbose=0)
    return scores
def class_report(embedding='bert', ckpt=None, verbose=False):
    if ckpt is not None:
        latest_ckpt = ckpt
    else:
        list_of_files = glob.glob('models/*.hdf5')
        latest_ckpt = max(list_of_files, key=os.path.getatime)

    x_test = np.load('data/x_test_' + embedding + '.npy')
    y_test = np.load('data/y_test_' + embedding + '.npy')

    model = lstm_model(_SEQ_SHAPE, load_weights=latest_ckpt, verbose=verbose)

    # Confusion matrix and classification report
    Y_pred = model.predict(x_test, batch_size=64)
    y_pred = np.argmax(Y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)
    # print(y_true)

    print('Confusion Matrix')
    confusion = confusion_matrix(y_true, y_pred, labels=[*range(5)], normalize='true')
    print(confusion)

    print('Classification Report')
    print(classification_report(y_true, y_pred, labels=[*range(5)]))
    return confusion
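# A minimal evaluation sketch (assumed usage): with ckpt=None both helpers fall
# back to the most recently accessed checkpoint under models/*.hdf5.
scores = test(embedding='bert')
print('test scores:', scores)
confusion = class_report(embedding='bert')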
def lstm_model(body_length, numb_epoch):
    prepro = Preprocessing()
    data = load_data()

    # Loading train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(train_stance_data, train_bodies_data)

    # Removing punctuation and stop words from the headline and body of train data
    train_headlines_cl = prepro.get_clean_data(train_headlines)
    train_bodies_cl = prepro.get_clean_data(train_bodies)
    train_stances_cl = prepro.get_clean_data(train_stances)

    # Convert labels to integers
    train_stances_in = prepro.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(test_stance_data, test_bodies_data, data_type="test")

    # Removing punctuation and stop words from the headline and body of test data
    test_headlines_cl = prepro.get_clean_data(test_headlines)
    test_bodies_cl = prepro.get_clean_data(test_bodies)

    # Set up the tokenizer
    total_text = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(total_text)
    print('Number of unique words: ' + str(len(token.word_index.keys())))

    # Convert headline and body to sequences
    train_headlines_seq = token.texts_to_sequences(train_headlines_cl)
    train_bodies_seq = token.texts_to_sequences(train_bodies_cl)
    word_index = token.word_index

    # Pad the headline and body sequences
    train_headlines_seq = pad_sequences(train_headlines_seq, maxlen=MAX_HEADLINE_LENGTH)
    train_bodies_seq = pad_sequences(train_bodies_seq, maxlen=int(body_length))

    # One-hot encode the labels
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Splitting data into train and validation sets
    train_headlines_final, headlines_val, train_bodies_final, bodies_val, train_stances_final, stances_val = \
        train_test_split(train_headlines_seq, train_bodies_seq, train_stances_in,
                         test_size=0.2, random_state=42)

    # Convert headline and body of test data to sequences
    test_headlines_seq = token.texts_to_sequences(test_headlines_cl)
    test_bodies_seq = token.texts_to_sequences(test_bodies_cl)

    # Pad the headline and body sequences
    test_headlines_seq = pad_sequences(test_headlines_seq, maxlen=MAX_HEADLINE_LENGTH)
    test_bodies_seq = pad_sequences(test_bodies_seq, maxlen=int(body_length))

    # Getting the embedding index
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)
    print('Found %s word vectors.' % len(embeddings_index))

    # Getting the embedding matrix
    embedding_matrix = models.get_embedding_matrix(embedding_dim=EMBEDDING_DIMENSION,
                                                   embeddings_index=embeddings_index,
                                                   word_index=word_index)

    # Getting the model
    fake_nn = models.lstm_model(headline_length=MAX_HEADLINE_LENGTH,
                                body_length=int(body_length),
                                embedding_dim=EMBEDDING_DIMENSION,
                                word_index=word_index,
                                embedding_matrix=embedding_matrix,
                                activation='relu',
                                drop_out=0.5,
                                numb_layers=100,
                                cells=200)
    fake_nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    # Fitting the model
    fake_hist = fake_nn.fit([train_headlines_final, train_bodies_final], train_stances_final,
                            batch_size=128,
                            epochs=int(numb_epoch),
                            shuffle=True,
                            validation_data=([headlines_val, bodies_val], stances_val),
                            callbacks=[early_stopping, model_checkpoint])

    # Storing the training and validation accuracy and loss in a file for plotting
    lstm_data = []
    with open(os.path.join(OBJECT_DUMP, "lstm_seperate_headline_body_" + str(body_length) + ".txt"), 'wb') as bow_hist:
        lstm_data.append(fake_hist.history['acc'])
        lstm_data.append(fake_hist.history['val_acc'])
        lstm_data.append(fake_hist.history['loss'])
        lstm_data.append(fake_hist.history['val_loss'])
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for the test data
    result = fake_nn.predict([test_headlines_seq, test_bodies_seq], batch_size=128)

    # Store the results in the result file
    result_str = prepro.convert_lable_string(result)
    with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file:
        test_stance = csv.DictReader(read_file)
        with io.open(RESULT_FILE + "_" + str(body_length) + ".csv", mode='w', encoding='utf8') as write_file:
            writer = csv.DictWriter(write_file, fieldnames=['Headline', 'Body ID', 'Stance'])
            writer.writeheader()
            for sample, prediction in zip(test_stance, result_str):
                writer.writerow({'Body ID': sample['Body ID'],
                                 'Headline': sample['Headline'],
                                 'Stance': prediction})

    # Print the accuracy, competition score and confusion matrix
    print_result("fnc-1-master/competition_test_stances.csv",
                 RESULT_FILE + "_" + str(body_length) + ".csv")
def run(ycol=0, method='gru'):
    '''Run the main program.'''
    start = time.time()
    predict_win = 1

    # Data processing
    if 'lstm' in method or 'gru' in method:
        timesteps = 1
    elif 'rnn' in method:
        timesteps = 8
    stock_win = dp.createWinData(price, ycol, timesteps, predict_win)
    train, test = dp.splitData(stock_win, ratio=0.9)

    scaler = StandardScaler()
    scaler.fit(train)
    train_norm = scaler.transform(train)
    test_norm = scaler.transform(test)
    # last_value = train[-1, -predict_win:]  # last data point before the test set
    # np.random.shuffle(train_norm)  # shuffle the training data

    # y_tr = dp.transUpDown(train_norm[:, -predict_win:], last_value)
    y_tr = train_norm[:, -1]
    x_tr = train_norm[:, :-1]
    # y_te = dp.transUpDown(test_norm[:, -predict_win:], last_value)
    y_te = test_norm[:, -1]
    x_te = test_norm[:, :-1]

    x_tr = np.reshape(x_tr, (x_tr.shape[0], timesteps, int(x_tr.shape[1] / timesteps)))
    x_te = np.reshape(x_te, (x_te.shape[0], timesteps, int(x_te.shape[1] / timesteps)))
    y_tr = np.reshape(y_tr, (y_tr.shape[0], 1))
    y_te = np.reshape(y_te, (y_te.shape[0], 1))
    layers = [x_tr.shape[0], x_tr.shape[1], x_tr.shape[2], predict_win]

    # Build the model
    if 'lstm' in method:
        model = models.lstm_model(layers)
    elif 'rnn' in method:
        model = models.rnn_model(layers)
    elif 'gru' in method:
        model = models.gru_model(timesteps, 28)

    # Train the model
    earlystop = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
    history = LossHistory()
    model.fit(x_tr, y_tr, epochs=200, batch_size=16, shuffle=True, verbose=1,
              validation_split=0.1, callbacks=[earlystop, history])
    score = model.evaluate(x_te, y_te, batch_size=16)
    print('test score:', score)
    # history.loss_plot('epoch')

    # Evaluate on the test set
    # y_tr_pre = model.predict_classes(x_tr, batch_size=32)
    y_te_pre = model.predict(x_te, batch_size=32)
    true = y_te * scaler.scale_[-1] + scaler.mean_[-1]
    predict = y_te_pre * scaler.scale_[-1] + scaler.mean_[-1]
    # y = np.concatenate((y_tr, y_te))
    # y = y.reshape(y.shape[0])
    # dp.plotCompare(y, y_tr_pre.reshape(y_tr_pre.shape[0]),
    #                y_te_pre.reshape(y_te_pre.shape[0]), scaler)
    dp.plotTestCompare(true, predict)
    dp.errorPercentage(true, predict)
    print('the stock:', ycol)
    print('the method:', method)
    print('timesteps:', timesteps)

    # Prediction results
    # print('train')
    # dp.plotUpDown(y_tr, y_tr_pre)
    # print('test')
    # dp.plotUpDown(y_te, y_te_pre)
    '''
    test_len = len(y_te)
    diff, diff_pre = [], []
    for i in range(test_len):
        if i == 0:
            diff.append((true[0] - last_value) * 100 / last_value)
            diff_pre.append((predict[0] - last_value) * 100 / last_value)
        else:
            diff.append((true[i] - true[i - 1]) * 100 / true[i - 1])
            diff_pre.append((predict[i] - true[i - 1]) * 100 / true[i - 1])
    plt.scatter(diff, diff_pre)
    plt.grid(True)
    plt.xlabel('diff')
    plt.ylabel('diff_pre')
    plt.title('Scatter Plot')
    plt.show()
    diff_label = dp.transUpDown1(diff)
    diff_pre_label = dp.transUpDown1(diff_pre)
    dp.plotUpDown(diff_label, diff_pre_label)
    '''
    print('time cost:', time.time() - start)
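# A usage sketch based only on the signature above: run the GRU variant on the
# first stock column, or swap in the LSTM/RNN variants via the method string.
run(ycol=0, method='gru')
# run(ycol=0, method='lstm')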
TrueCount2 = secondWord.value_counts().index.tolist()[:10]
# FalseCount2 = secondWordFalse2.value_counts().index.tolist()[:10]

# Build bigram candidates by pairing the two most-frequent word lists
TrueCountNGram = []
for x in itertools.product(TrueCount, TrueCount2):
    k = ' '.join(x)
    TrueCountNGram.append(k)

numOfWords = 2000
tokenizerPos = Tokenizer(num_words=numOfWords)

gen = Generator(df1, TrueCount, TrueCountNGram, stop)
# genT = Generator(dfV)
X_input = gen.generate(6).__next__()

epochs = 5
model = lstm_model()
model.fit_generator(gen.generate(200), epochs=epochs, verbose=1, steps_per_epoch=500)

# Serialize the model architecture to YAML
model_yaml = model.to_yaml()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
# Serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
data_dir = os.path.join(init_path, 'datasets')
train_test_path = os.path.join(data_dir, train_test_name)
stopwords_path = os.path.join(data_dir, stopwords_name)

save_dir = os.path.join(init_path, 'saved_models')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
tokenizer_path = os.path.join(save_dir, tokenizer_name)

# Model
last_layer_options = m.get_last_layer_options(num_classes)
model = m.lstm_model(last_layer_options=last_layer_options,
                     num_features=max_features,
                     embedding_dim=embedding_dim,
                     sequence_len=sequence_len,
                     lstm_units=lstm_units,
                     dropout_rate=dropout_rate,
                     fc_units=fc_units)
model.summary()
model.compile(optimizer=optimizer, loss=last_layer_options[2], metrics=metric)

# Data
df_data = pd.read_csv(train_test_path, header=0)
print(f'Shape of the raw dataset: {df_data.shape}')
df_data['review'] = df_data['review'].astype('str')
print('Computing the median, mean, and standard deviation of review length per class:')
df_data['len'] = df_data['review'].map(lambda x: len(x))
print(df_data[['cat', 'len']].groupby(by='cat').agg(
    ['median', 'mean', 'max', 'min', 'std']))
from models import lstm_model
from models import train_model
from preprocess import load_data
from preprocess import load_fast_text_embedding
from preprocess import get_embed_weights

max_len = 50
embedding_dim = 300

tokenizer, x_train, y_train, x_test, y_test, vocab_size = load_data('sarcasm_v2.csv')
embedding_index = load_fast_text_embedding('wiki-news-300d-1M-subword.vec')
embedding_matrix = get_embed_weights(embedding_index, tokenizer)

model = lstm_model(vocab_size, embedding_matrix)
train_model(model, x_train, y_train, x_test, y_test)

# Half embedding dimension
# Validation Loss: 1.128701367992565  Validation Accuracy: 0.6840490793889286
# Validation Accuracy: 68.40% (+/- 0.00%)
# Train Loss: 1.1293133928731907  Train Accuracy: 68.40490797546013
# Test Loss: 1.110706667958593  Test Accuracy: 71.16564420834641
if args == "1": print('hola2') # Preprocessing of data X_train, y_train, X_val, y_val = data_prep_task1(timesteps=60) # Hyperparameters lr = 0.001 epochs = 100 n_batches = 16 # call model, compile and fit Model = lstm_model(40, dropout=True, dr=0.2, n_batches=16, input_size=X_train.shape[1], input_dimension=1, bidirectional=False) Model.compile(loss='mse', optimizer='adam', metrics=['mae']) History = Model.fit(X_train, y_train, batch_size=n_batches, epochs=epochs, validation_data=(X_val, y_val), verbose=1, shuffle=False) # plot evaluation of results #plot_model(History)