def run_train():
    from train import train_lstm
    train_lstm("training_set_size=37568_weeks=43.csv",
               lag=23, optim='rmsprop', epo=17, layer1=227, layer2=97)
def run_model(config, fold, fold_base=None):
    # Dispatch to the training routine that matches config['model'].
    if config['model'] == 'LSTM':
        return train_lstm(config, fold)
    if config['model'] == 'BiLSTM':
        return train_bilstm(config, fold)
    if config['model'] == 'NN':
        return train_net(config, fold)
    if config['model'] == 'linear':
        return train_linear(config, fold)
    if config['model'] == 'svm':
        return train_svm(config, fold)
    if config['model'] == 'random_forest':
        return train_random_forrest(config, fold)  # helper is spelled "forrest" in the project
    if config['model'] == 'baseline':
        return train_baseline(config, fold, fold_base)
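# A minimal usage sketch for the dispatcher above; the config keys shown here
# are assumptions for illustration, not the project's actual schema.
config = {'model': 'BiLSTM', 'epochs': 20, 'batch_size': 64}
result = run_model(config, fold=0)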
df = pd.read_csv(dataset_path)
test_case_name += '_' + '_'.join(map(str, [LSTM_COUNT, DROPOUT_VALUE, REGULARIZER, REG_VALUE]))

# Train the HAN model and persist its training history, weights, and architecture.
HAN_MODEL_History, HAN_MODEL, HAN_accuracy = model.train_han(
    df, word_index, test_case_name, LSTM_COUNT, DROPOUT_VALUE, REGULARIZER, REG_VALUE)
han_dataframe = pd.DataFrame(HAN_MODEL_History.history)
han_dataframe.to_csv(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.csv'))
HAN_MODEL.save(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.h5'))
with open(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.json'), "w") as j_file:
    j_file.write(HAN_MODEL.to_json())

# Train the LSTM model and persist the same artifacts.
LSTM_Model_History, LSTM_Model, LSTM_accuracy = model.train_lstm(df, word_index, test_case_name, LSTM_COUNT)
LSTM_Model.save(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.h5'))
lstm_dataframe = pd.DataFrame(LSTM_Model_History.history)
lstm_dataframe.to_csv(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.csv'))
with open(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.json'), "w") as j_file:
    j_file.write(LSTM_Model.to_json())

# Compare the two models' training curves.
plot_models([LSTM_Model_History, HAN_MODEL_History], ['LSTM', 'HAN'], 'val_loss', 'Epochs', 'Validation Loss', 'Validation_Loss')
plot_models([LSTM_Model_History, HAN_MODEL_History], ['LSTM', 'HAN'], 'categorical_accuracy', 'Epochs', 'Accuracy', 'Accuracy')
plot_models([LSTM_Model_History, HAN_MODEL_History], ['LSTM', 'HAN'], 'loss', 'Epochs', 'Loss', 'Loss')
plot_models([LSTM_Model_History, HAN_MODEL_History], ['LSTM', 'HAN'], 'val_categorical_accuracy', 'Epochs', 'Validation Accuracy', 'Validation_Accuracy')
# plot_model(LSTM_Model, to_file=os.path.join(MODEL_FOLDER, test_case_name + '_Model_LSTM.png'), show_shapes=True, show_layer_names=True)
# plot_model(HAN_MODEL, to_file=os.path.join(MODEL_FOLDER, test_case_name + '_Model_HAN.png'), show_shapes=True, show_layer_names=True)
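# `plot_models` is called above but not defined in this snippet. A minimal
# sketch of one plausible implementation, assuming Keras History objects and
# matplotlib; the signature is inferred from the call sites above, and saving
# into MODEL_FOLDER is an assumption.
import os
import matplotlib.pyplot as plt

def plot_models(histories, labels, metric, xlabel, ylabel, fname):
    plt.figure()
    for history, label in zip(histories, labels):
        plt.plot(history.history[metric], label=label)  # one curve per model
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(os.path.join(MODEL_FOLDER, fname + '.png'))
    plt.close()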
def run_model(name, context, conf, double_input, use_elmo=False, save_predictions=False, save_model=False):
    """
    Runs the given model 'name' for the given 'context' and agreement level 'conf'.
    If double_input is True, runs the combined model using context comment text.
    Optionally saves the trained model & its vocabulary, and predictions.

    Allowed names: lstm | bilstm | stacked_bilstm | cnn | dense_lstm | dense_bilstm |
                   dense_stacked_bilstm | dense_cnn | nli_cnn | bert | dense_bert

    If use_elmo=True, uses ELMo's pre-trained language model for embeddings.
    """
    if use_elmo:
        # The token indexer maps tokens to integers; ELMo's character indexer keeps
        # the mapping consistent with what was used in the original ELMo training.
        token_indexer = ELMoTokenCharactersIndexer()
    elif name == 'bert':
        global bert_token_indexer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL, do_lowercase=True)
    else:
        token_indexer = SingleIdTokenIndexer()

    if name == 'bert':  # BERT uses a special wordpiece tokenizer
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input,
                                                      tokenizer=tokenizer_bert,
                                                      token_indexers={"tokens": bert_token_indexer},
                                                      label_cols=LABEL_COLS)
    else:
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input,
                                                      tokenizer=tokenizer,
                                                      token_indexers={"tokens": token_indexer},
                                                      label_cols=LABEL_COLS)

    map_reply_id_pred_probability = {}
    n_epochs = []
    f1s, AUROCs, weighted_f1s, precision_s, recall_s, accuracies, AUPRCs = [], [], [], [], [], [], []

    for fold_number in range(1, 6):  # 5-fold cross validation
        train_fname = 'train_data_fold_' + str(fold_number) + '_OneHot.csv'
        val_fname = 'val_data_fold_' + str(fold_number) + '_OneHot.csv'
        test_fname = 'test_data_fold_' + str(fold_number) + '_OneHot.csv'

        train_dataset = reader.read(file_path=DATA_ROOT / conf / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / conf / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / conf / test_fname)

        print("\n#####################################################\n",
              double_input, context, conf, name,
              len(train_dataset), len(validation_dataset), len(test_dataset))

        # Train model:
        if name == 'lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=1, bidirectional=False, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=1, bidirectional=False,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=1, bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=1, bidirectional=True,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=2, bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=2, bidirectional=True,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'cnn':
            if context == 'reply_text':
                filter_sizes = (2, 3)  # kernels cannot be bigger than the shortest sentence
            else:
                filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                               num_filters=100, filter_sizes=filter_sizes,
                                               use_elmo=use_elmo, double_input=double_input)
        elif name == 'dense_cnn':
            if context == 'reply_text':
                filter_sizes = (2, 3)  # kernels cannot be bigger than the shortest sentence
            else:
                filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                               col_name=context, num_filters=100, filter_sizes=filter_sizes,
                                               use_elmo=use_elmo, double_input=double_input)
        elif name == 'nli_cnn':
            if not double_input:
                print("Error: NLI-inspired architecture only accepts double-input.")
                return [None] * 9
            filter_sizes = (2, 3)
            model, vocab, ep = train.train_nli(train_dataset, validation_dataset, BATCH_SIZE, use_elmo=use_elmo,
                                               num_filters=100, filter_sizes=filter_sizes)
        elif name == 'bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL, dense_vector=False,
                                                double_input=double_input)
        elif name == 'dense_bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL, dense_vector=True,
                                                col_name=context, double_input=double_input)
        else:
            sys.exit("'name' not valid")

        n_epochs.append(ep)  # keep track of the number of actual training epochs for each fold

        # Predict and evaluate the model on the test set. NOTE: preds has shape
        # (number of samples, 2) - the columns hold the probabilities for the two
        # classes in order ['yes_unp', 'not_unp'].
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE, use_gpu=False)
        f1, auroc, w_f1, precision, recall, acc, auprc = evaluate.compute_metrics(preds, test_dataset)

        if save_predictions:  # save predictions for error analysis
            replyid_pred = evaluate.map_id_prediction(preds, test_dataset)
            if set(replyid_pred.keys()).intersection(set(map_reply_id_pred_probability.keys())) != set():  # sanity check
                sys.exit("Error: There is overlap in Test IDs across folds.")
            map_reply_id_pred_probability.update(replyid_pred)

        if save_model:  # save the model weights and vocabulary
            with open('./tmp/' + name + '_model_conf_' + conf.split('-')[1] + '_fold_' + str(fold_number) + '.th', 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/" + name + "_vocabulary_" + conf.split('-')[1] + "_fold_" + str(fold_number))

        print("\nFold #{} | F1 = {} | AUROC = {} | AUPRC = {}".format(fold_number, f1, auroc, auprc))
        f1s.append(f1); AUROCs.append(auroc); weighted_f1s.append(w_f1)
        precision_s.append(precision); recall_s.append(recall)
        accuracies.append(acc); AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean()
    mean_auroc = np.array(AUROCs).mean()
    mean_weighted_f1 = np.array(weighted_f1s).mean()
    mean_precision = np.array(precision_s).mean()
    mean_recall = np.array(recall_s).mean()
    mean_accuracy = np.array(accuracies).mean()
    mean_auprc = np.array(AUPRCs).mean()

    print("Total predictions: {} | Save Predictions: {}".format(len(map_reply_id_pred_probability), save_predictions))
    return mean_f1, mean_auroc, mean_weighted_f1, mean_precision, mean_recall, mean_accuracy, mean_auprc, map_reply_id_pred_probability, n_epochs
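# A hypothetical invocation of the run_model above; the context column name and
# the conf directory value are assumptions for illustration, not the project's
# actual configuration.
results = run_model('bilstm', context='reply_text', conf='conf-0.8',
                    double_input=False, use_elmo=True, save_predictions=True)
mean_f1 = results[0]  # cross-validated F1, averaged over the 5 folds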
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default='/mnt/aoni04/katayama/DATA2020/')
    parser.add_argument('-m', '--mode', type=int, help='mode is 0(spec) or 1(LLD)')
    parser.add_argument('-o', '--out', type=str, default='./SPEC')
    parser.add_argument('-e', '--epoch', type=int, default=100)
    parser.add_argument('-r', '--resume', type=str, default=True)
    parser.add_argument('--hang', type=str, default=False)
    args = parser.parse_args()

    import datetime
    now = datetime.datetime.now()
    print('{0:%Y%m%d%H%M}'.format(now))
    out = os.path.join(args.out, '{0:%Y%m%d%H%M}'.format(now))
    os.makedirs(out, exist_ok=True)

    dense_flag = False
    train_id = 89
    if args.mode == 0:
        from SPEC.utils import setup
        df_list = setup(PATH=args.input, dense_flag=dense_flag)
    else:
        from LLD.utils import setup
        df_list, lld_list = setup(PATH=args.input, dense_flag=dense_flag)
        lld_train = lld_list[13:train_id]
        lld_val = lld_list[train_id:]
        lld_dict = {'train': lld_train, 'val': lld_val}

    # Keep each conversation as its own list entry instead of concatenating them.
    df_train = df_list[13:train_id]
    feature = []
    df_val = df_list[train_id:]
    feature_val = []
    df_dict = {'train': df_train, 'val': df_val}
    dataloaders_dict = {"train": feature, "val": feature_val}

    for phase in df_dict.keys():
        df = df_dict[phase]
        feature = dataloaders_dict[phase]
        for i in range(len(df)):
            if args.mode == 0:
                x = df[i].iloc[:, -512:-256].values
                x_b = df[i].iloc[:, -256:].values
            elif args.mode == 1:
                lld = lld_dict[phase]
                x = lld[i].iloc[:, :114].values
                x_b = lld[i].iloc[:, 114:].values
                x = x.reshape(-1, 10, 114)
                x_b = x_b.reshape(-1, 10, 114)
            u = hang_over(1.0 - df[i]['utter_A'].values, flag=args.hang)
            feature.append((x, u))
            u = hang_over(1.0 - df[i]['utter_B'].values, flag=args.hang)
            feature.append((x_b, u))

    net = TimeActionPredict(input_size=x.shape[-1], hidden_size=64, mode=args.mode)
    print('Model :', net.__class__.__name__)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    # Freeze everything except the fully-connected and LSTM layers.
    for name, param in net.named_parameters():
        if 'fc' in name or 'lstm' in name:
            param.requires_grad = True
            print("Gradient enabled; will be trained:", name)
        else:
            param.requires_grad = False
            print("Gradient disabled; will not be trained:", name)

    train_lstm(net=net, dataloaders_dict=dataloaders_dict, criterion=criterion,
               optimizer=optimizer, num_epochs=args.epoch, output=out, resume=args.resume)
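# `hang_over` is called above but not defined in this snippet. A minimal sketch
# of one plausible implementation, assuming it applies hangover smoothing: each
# active (non-zero) frame of the binary utterance signal keeps the label active
# for a few extra frames when `flag` is truthy. The frame count is a guess.
import numpy as np

def hang_over(u, flag=False, n_frames=10):
    if not flag:
        return u
    u = u.copy()
    for idx in np.flatnonzero(u > 0):
        u[idx:idx + n_frames + 1] = 1.0  # extend the active region
    return u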
def run_model(name, use_elmo=False, save_predictions=False, save_model=False):
    """
    Trains the given deep learning model on the train set, and evaluates it on the test set.

    Parameters
    ----------
    name: str
        Name of the deep learning model to be run: lstm | bilstm | stacked_bilstm | cnn | bert
    use_elmo: bool
        Use ELMo embeddings if True | GloVe embeddings if False
    save_predictions: bool
        If True, stores and returns the predicted probabilities mapped to sentence ID
    save_model: bool
        If True, saves the trained model along with its vocabulary

    Returns
    -------
    F1-score, Precision, Recall, Accuracy, Area Under Precision-Recall Curve on the test set;
    dictionary mapping predictions to ID, and number of training epochs for each fold.
    """
    # token_indexer maps tokens to integers; the special built-in indexers for ELMo and
    # BERT keep the mapping consistent with the original pre-trained models.
    if use_elmo:
        token_indexer = ELMoTokenCharactersIndexer()
    elif name == 'bert':
        global bert_token_indexer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL, do_lowercase=True)
    else:
        token_indexer = SingleIdTokenIndexer()

    if name == 'bert':  # BERT uses a special wordpiece tokenizer
        reader = data_reader.GeneralizationDatasetReader(tokenizer=tokenizer_bert,
                                                         token_indexers={"tokens": bert_token_indexer},
                                                         label_cols=LABEL_COLS)
    else:
        reader = data_reader.GeneralizationDatasetReader(tokenizer=tokenizer,
                                                         token_indexers={"tokens": token_indexer},
                                                         label_cols=LABEL_COLS)

    map_id_pred_probability = {}  # used if save_predictions is True
    f1s, precision_s, recall_s, accuracies, AUPRCs, n_epochs = [], [], [], [], [], []

    for fold_number in range(1, 4):  # 3-fold cross validation
        train_fname = 'train_data_fold_' + str(fold_number) + '.csv'
        val_fname = 'val_data_fold_' + str(fold_number) + '.csv'
        test_fname = 'test_data_fold_' + str(fold_number) + '.csv'

        train_dataset = reader.read(file_path=DATA_ROOT / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / test_fname)
        # print("\n##################################\n", name, len(train_dataset), len(validation_dataset), len(test_dataset))

        # Train the model:
        if name == 'lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                num_layers=1, bidirectional=False, use_elmo=use_elmo)
        elif name == 'bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                num_layers=1, bidirectional=True, use_elmo=use_elmo)
        elif name == 'stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                num_layers=2, bidirectional=True, use_elmo=use_elmo)
        elif name == 'cnn':
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE,
                                               num_filters=100, filter_sizes=(2, 3, 4, 5), use_elmo=use_elmo)
        elif name == 'bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL)
        else:
            sys.exit("'name' not valid")

        n_epochs.append(ep)  # keep track of the number of actual training epochs for each fold

        # Predict and evaluate the model on the test set. Note that 'preds' has shape
        # (number of samples, 2) - the columns hold the probabilities for the two
        # classes ['generalization', 'neutral'].
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE)
        f1, precision, recall, acc, auprc = evaluate.compute_metrics(preds, test_dataset)

        if save_predictions:
            id_pred = evaluate.map_id_prediction(preds, test_dataset)
            if set(id_pred.keys()).intersection(set(map_id_pred_probability.keys())) != set():  # sanity check
                sys.exit("Error: There is overlap in test set IDs across folds.")
            map_id_pred_probability.update(id_pred)

        if save_model:  # save the model weights and vocabulary
            with open('./tmp/' + name + '_model' + '_fold_' + str(fold_number) + '.th', 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/" + name + "_vocabulary_fold_" + str(fold_number))

        print("\nFold #{} | F1 = {}".format(fold_number, f1))
        f1s.append(f1)
        precision_s.append(precision)
        recall_s.append(recall)
        accuracies.append(acc)
        AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean()
    mean_precision = np.array(precision_s).mean()
    mean_recall = np.array(recall_s).mean()
    mean_accuracy = np.array(accuracies).mean()
    mean_auprc = np.array(AUPRCs).mean()

    print("Total # predictions: {} | Saving Predictions = {}".format(len(map_id_pred_probability), save_predictions))
    return mean_f1, mean_precision, mean_recall, mean_accuracy, mean_auprc, map_id_pred_probability, n_epochs
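# A hypothetical invocation of the 3-fold run above; the unpacking mirrors the
# return signature, and BATCH_SIZE, DATA_ROOT, etc. are assumed to be defined
# at module level as in the snippet.
(mean_f1, mean_precision, mean_recall, mean_accuracy,
 mean_auprc, id_to_prob, epochs_per_fold) = run_model('stacked_bilstm',
                                                      use_elmo=True,
                                                      save_predictions=True)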
reader = data_reader.NovelDatasetReader(scenario=case, augmentation=augmentation,
                                        tokenizer=tokenizer,
                                        token_indexers={"tokens": token_indexer})
train_dataset = reader.read(file_path='train')
test_dataset = reader.read(file_path='test')
print("Train: ", len(train_dataset), "| Test:", len(test_dataset))
print("\n#####################################################\n")

# Train model:
if name == 'lstm':
    model, vocab, ep = train.train_lstm(train_dataset, BATCH_SIZE, epochs=15,
                                        num_layers=1, bidirectional=False, use_elmo=use_elmo)
elif name == 'bilstm':
    model, vocab, ep = train.train_lstm(train_dataset, BATCH_SIZE, epochs=15,
                                        num_layers=1, bidirectional=True, use_elmo=use_elmo)
elif name == 'stacked_bilstm':
    model, vocab, ep = train.train_lstm(train_dataset, BATCH_SIZE, epochs=15,
                                        num_layers=2, bidirectional=True,
def loaddata():
    # ... earlier part of the function not shown in this snippet ...

    # Process the dev data.
    d_data_list_node = dp.datadeal('data/raw.clean.dev', is_traindata=False)
    # Process the test data.
    test_data_node = dp.datadeal('data/raw.clean.test', is_traindata=False)

    if args.out_word_v:
        args.word_embed = out_word_vec.add_word_v(data_v)
    args.embed_num = len(data_v)
    args.class_num = len(lab_v)

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        if attr == 'word_embed':
            continue
        print("\t{}={}".format(attr.upper(), value))

    return t_data_list_node, data_v, d_data_list_node, lab_v


if __name__ == "__main__":
    train_data_list_node, data_voc, dev_data_list_node, lab_voc = loaddata()
    lstm = model_LSTM.LSTM(args)
    try:
        train.train_lstm(train_data_list_node, data_voc, dev_data_list_node, lab_voc, lstm, args)
    except KeyboardInterrupt:
        print('\nStopped by user.')
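# If training should survive an interrupt, one option is to checkpoint the model
# inside the except block. A minimal sketch, assuming `lstm` is a torch.nn.Module;
# the save path and this behavior are assumptions, not the project's code.
import torch

try:
    train.train_lstm(train_data_list_node, data_voc, dev_data_list_node, lab_voc, lstm, args)
except KeyboardInterrupt:
    torch.save(lstm.state_dict(), 'lstm_interrupted.pt')  # keep partial progress
    print('\nStopped by user; checkpoint written to lstm_interrupted.pt')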