Beispiel #1
0
def run_train():
    """Launch one LSTM training run with a fixed, hard-coded configuration.

    The project-local ``train`` module is imported lazily, so it is only
    required when this function is actually called. Whatever ``train_lstm``
    returns is discarded; this function returns ``None``.
    """
    from train import train_lstm

    dataset_file = "training_set_size=37568_weeks=43.csv"
    # Keyword names are dictated by train_lstm's signature; their exact
    # meaning (e.g. ``epo`` presumably = epochs) should be confirmed there.
    settings = {
        'lag': 23,
        'optim': 'rmsprop',
        'epo': 17,
        'layer1': 227,
        'layer2': 97,
    }
    train_lstm(dataset_file, **settings)
def run_model(config, fold, fold_base=None):
    """Dispatch to the training routine selected by ``config['model']``.

    Parameters
    ----------
    config:
        Mapping that must contain the key ``'model'``; it is forwarded
        unchanged to the selected trainer.
    fold:
        Forwarded unchanged to the selected trainer.
    fold_base:
        Only forwarded to ``train_baseline`` (model ``'baseline'``);
        ignored by every other model.

    Returns
    -------
    Whatever the selected trainer returns, or ``None`` when
    ``config['model']`` is not one of the recognised names.
    """
    # Each entry is wrapped in a lambda so that a trainer is only looked up
    # and called when its model name is actually requested.
    trainers = {
        'LSTM': lambda: train_lstm(config, fold),
        'BiLSTM': lambda: train_bilstm(config, fold),
        'NN': lambda: train_net(config, fold),
        'linear': lambda: train_linear(config, fold),
        'svm': lambda: train_svm(config, fold),
        'random_forest': lambda: train_random_forrest(config, fold),
        'baseline': lambda: train_baseline(config, fold, fold_base),
    }

    selected = trainers.get(config['model'])
    if selected is None:
        # Unknown model name: fall through without training anything.
        return None
    return selected()
Beispiel #3
0
# Load the dataset both models are trained on.
df = pd.read_csv(dataset_path)

# Append the hyper-parameters to the run name so every artefact written
# below carries the configuration that produced it.
test_case_name += '_' + '_'.join(
    str(setting) for setting in (LSTM_COUNT, DROPOUT_VALUE, REGULARIZER, REG_VALUE))

# --- HAN: train, then persist history (CSV), weights (H5) and architecture (JSON).
HAN_MODEL_History, HAN_MODEL, HAN_accuracy = model.train_han(
    df, word_index, test_case_name,
    LSTM_COUNT, DROPOUT_VALUE, REGULARIZER, REG_VALUE)
han_dataframe = pd.DataFrame(HAN_MODEL_History.history)
han_dataframe.to_csv(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.csv'))
HAN_MODEL.save(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.h5'))
with open(os.path.join(MODEL_FOLDER, test_case_name + '_HAN.json'), "w") as j_file:
    j_file.write(HAN_MODEL.to_json())

# --- LSTM: train, then persist weights (H5), history (CSV) and architecture (JSON).
LSTM_Model_History, LSTM_Model, LSTM_accuracy = model.train_lstm(
    df, word_index, test_case_name, LSTM_COUNT)
LSTM_Model.save(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.h5'))
lstm_dataframe = pd.DataFrame(LSTM_Model_History.history)
lstm_dataframe.to_csv(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.csv'))
with open(os.path.join(MODEL_FOLDER, test_case_name + '_LSTM.json'), "w") as j_file:
    j_file.write(LSTM_Model.to_json())

# --- Comparison plots: one call per history key, LSTM vs. HAN.
# Each tuple is (history key, y-axis label, last plot_models argument).
for _metric, _y_label, _plot_name in (
        ('val_loss', 'Validation Loss', 'Validation_Loss'),
        ('categorical_accuracy', 'Accuracy', 'Accuracy'),
        ('loss', 'Loss', 'Loss'),
        ('val_categorical_accuracy', 'Validation Accuracy', 'Validation_Accuracy'),
):
    plot_models([LSTM_Model_History, HAN_MODEL_History], ['LSTM', 'HAN'],
                _metric, 'Epochs', _y_label, _plot_name)

# Disabled architecture diagrams.
# NOTE(review): both calls target the same '_Model_LSTM.png' file name, so
# re-enabling them as-is would let the HAN diagram overwrite the LSTM one.
# plot_model(LSTM_Model, to_file=os.path.join(MODEL_FOLDER,test_case_name+'_Model_LSTM.png'), show_shapes=True, show_layer_names=True)
# plot_model(HAN_MODEL, to_file=os.path.join(MODEL_FOLDER,test_case_name+'_Model_LSTM.png'), show_shapes=True, show_layer_names=True)
Beispiel #4
0
def run_model(name, context, conf, double_input, use_elmo=False, save_predictions=False, save_model=False):
    """
    Train and evaluate the model 'name' with 5-fold cross validation.

    For every fold the train/validation/test CSV files are read from
    DATA_ROOT / conf, the model is trained, predictions are made on the test
    split and the metrics are computed. The per-fold metrics are averaged.

    Parameters
    ----------
    name: str
        lstm | bilstm | stacked_bilstm | cnn | dense_lstm | dense_bilstm |
        dense_stacked_bilstm | dense_cnn | nli_cnn | bert | dense_bert
    context: str
        Passed to the dataset reader as main_input and, for the dense_* models,
        to the trainer as col_name. For the CNN models the value 'reply_text'
        selects filter sizes (2, 3); any other value selects (2,).
    conf: str
        Agreement level; used as the sub-directory of DATA_ROOT. When
        save_model is True it must contain a '-', because the part after the
        first '-' is embedded in the saved file names.
    double_input: bool
        If True, runs the combined model using context comment text.
        'nli_cnn' requires this to be True.
    use_elmo: bool
        If True, uses ELMo's pre-trained language model for embeddings.
        Ignored when name == 'bert', which always uses the BERT indexer.
    save_predictions: bool
        If True, collects the test predictions of every fold keyed by ID.
    save_model: bool
        If True, writes each fold's model weights and vocabulary under ./tmp/.

    Returns
    -------
    Tuple of mean F1, mean AUROC, mean weighted F1, mean precision, mean
    recall, mean accuracy, mean AUPRC, the ID -> prediction dictionary (empty
    unless save_predictions is True) and the list of training epochs per fold.
    If name == 'nli_cnn' without double input, an error is printed and a list
    of nine None values is returned instead.

    Exits through sys.exit when 'name' is not valid or when test IDs overlap
    across folds.
    """
    global bert_token_indexer

    # name -> (dense_vector, num_layers, bidirectional) for the LSTM family.
    lstm_variants = {
        'lstm': (False, 1, False),
        'dense_lstm': (True, 1, False),
        'bilstm': (False, 1, True),
        'dense_bilstm': (True, 1, True),
        'stacked_bilstm': (False, 2, True),
        'dense_stacked_bilstm': (True, 2, True),
    }
    # name -> dense_vector for the CNN and BERT families.
    cnn_variants = {'cnn': False, 'dense_cnn': True}
    bert_variants = {'bert': False, 'dense_bert': True}

    # Both checks are independent of the fold, so fail before any data is read.
    known_name = (name in lstm_variants or name in cnn_variants
                  or name in bert_variants or name == 'nli_cnn')
    if not known_name:
        sys.exit("'name' not valid")
    if name == 'nli_cnn' and not double_input:
        print("Error: NLI-inspired architecture only accepts double-input.")
        return [None]*9

    # The token indexer maps tokens to integers. 'bert' is checked first so that
    # its indexer is always created, even when use_elmo is True; otherwise the
    # reader below would reference a bert_token_indexer that was never assigned.
    # NOTE(review): 'dense_bert' is not treated like 'bert' here and gets the
    # generic tokenizer/indexer, as in the original code - confirm this is intended.
    if name == 'bert':  # BERT uses a special wordpiece tokenizer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL, do_lowercase=True)
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input,
                                                      tokenizer=tokenizer_bert,
                                                      token_indexers={"tokens": bert_token_indexer},
                                                      label_cols=LABEL_COLS)
    else:
        if use_elmo:
            # Keeps the mapping consistent with the original ELMo training.
            token_indexer = ELMoTokenCharactersIndexer()
        else:
            token_indexer = SingleIdTokenIndexer()
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input,
                                                      tokenizer=tokenizer,
                                                      token_indexers={"tokens": token_indexer},
                                                      label_cols=LABEL_COLS)

    map_reply_id_pred_probability = {}
    n_epochs = []
    f1s, AUROCs, weighted_f1s, precision_s, recall_s, accuracies, AUPRCs = [], [], [], [], [], [], []

    for fold_number in range(1, 6):  # 5-fold cross validation
        train_fname = 'train_data_fold_{}_OneHot.csv'.format(fold_number)
        val_fname = 'val_data_fold_{}_OneHot.csv'.format(fold_number)
        test_fname = 'test_data_fold_{}_OneHot.csv'.format(fold_number)

        train_dataset = reader.read(file_path=DATA_ROOT / conf / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / conf / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / conf / test_fname)
        print("\n#####################################################\n", double_input, context, conf, name, len(train_dataset), len(validation_dataset), len(test_dataset))

        # Train model. col_name is only passed for the dense_* variants so that
        # the trainer's own default applies otherwise.
        if name in lstm_variants:
            dense_vector, num_layers, bidirectional = lstm_variants[name]
            extra = {'col_name': context} if dense_vector else {}
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=dense_vector, num_layers=num_layers,
                                                bidirectional=bidirectional, use_elmo=use_elmo,
                                                double_input=double_input, **extra)
        elif name in cnn_variants:
            dense_vector = cnn_variants[name]
            extra = {'col_name': context} if dense_vector else {}
            # kernels can not be bigger than the shortest sentence
            filter_sizes = (2, 3) if context == 'reply_text' else (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE,
                                               dense_vector=dense_vector, num_filters=100,
                                               filter_sizes=filter_sizes, use_elmo=use_elmo,
                                               double_input=double_input, **extra)
        elif name == 'nli_cnn':
            model, vocab, ep = train.train_nli(train_dataset, validation_dataset, BATCH_SIZE, use_elmo=use_elmo,
                                               num_filters=100, filter_sizes=(2, 3))
        else:  # 'bert' or 'dense_bert' (all other names were rejected above)
            dense_vector = bert_variants[name]
            extra = {'col_name': context} if dense_vector else {}
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL, dense_vector=dense_vector,
                                                double_input=double_input, **extra)

        n_epochs.append(ep)  # keep track of number of actual training epochs for each fold

        # Predict and evaluate model on test set.
        # NOTE: preds is of shape (number of samples, 2) - the columns represent
        # the probabilities for the two classes in order ['yes_unp', 'not_unp']
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE, use_gpu=False)
        f1, auroc, w_f1, precision, recall, acc, auprc = evaluate.compute_metrics(preds, test_dataset)

        if save_predictions:  # save predictions for error analysis
            replyid_pred = evaluate.map_id_prediction(preds, test_dataset)
            # Sanity check: a test ID must not appear in more than one fold.
            if set(replyid_pred.keys()) & set(map_reply_id_pred_probability.keys()):
                sys.exit("Error: There is overlap in Test IDs across folds.")
            map_reply_id_pred_probability.update(replyid_pred)

        if save_model:  # save the model weights and vocabulary
            conf_tag = conf.split('-')[1]
            with open('./tmp/{}_model_conf_{}_fold_{}.th'.format(name, conf_tag, fold_number), 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/{}_vocabulary_{}_fold_{}".format(name, conf_tag, fold_number))

        print("\nFold #{} | F1 = {} | AUROC = {} | AUPRC = {}".format(fold_number, f1, auroc, auprc))

        f1s.append(f1)
        AUROCs.append(auroc)
        weighted_f1s.append(w_f1)
        precision_s.append(precision)
        recall_s.append(recall)
        accuracies.append(acc)
        AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean()
    mean_auroc = np.array(AUROCs).mean()
    mean_weighted_f1 = np.array(weighted_f1s).mean()
    mean_precision = np.array(precision_s).mean()
    mean_recall = np.array(recall_s).mean()
    mean_accuracy = np.array(accuracies).mean()
    mean_auprc = np.array(AUPRCs).mean()

    print("Total predictions: {} | Save Predictions: {}".format(len(map_reply_id_pred_probability), save_predictions))

    return mean_f1, mean_auroc, mean_weighted_f1, mean_precision, mean_recall, mean_accuracy, mean_auprc, map_reply_id_pred_probability, n_epochs
Beispiel #5
0
def _flag_value(text):
    """Turn a command-line string into a bool when it spells a boolean.

    argparse hands option values over as strings, so ``--hang False`` would
    otherwise arrive as the truthy string ``'False'``. Strings that do not
    spell a boolean are returned unchanged.
    """
    lowered = text.strip().lower()
    if lowered in ('true', 'yes', '1'):
        return True
    if lowered in ('false', 'no', '0'):
        return False
    return text


def main():
    """Command-line entry point: build the feature lists, the network and start training.

    Steps:
      1. Parse the arguments and create a time-stamped output directory
         below ``--out``.
      2. Load the per-conversation data frames with the ``setup`` function of
         ``SPEC.utils`` (mode 0) or ``LLD.utils`` (mode 1).
      3. Build, for the 'train' and 'val' phases, a list of ``(features, target)``
         tuples - two per conversation, one for speaker A and one for speaker B.
      4. Create a ``TimeActionPredict`` network, leave only parameters whose
         name contains 'fc' or 'lstm' trainable, and call ``train_lstm``.

    Raises
    ------
    ValueError
        If no conversation is available after slicing, because the network's
        input size is derived from the loaded features.
    """
    import datetime

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default='/mnt/aoni04/katayama/DATA2020/')
    # Only 0 and 1 are handled below; any other value (or omitting the option)
    # used to crash later with an unbound feature variable, so reject it here.
    parser.add_argument('-m', '--mode', type=int, choices=(0, 1), required=True,
                        help='mode is 0(spec) or 1(LLD)')
    parser.add_argument('-o', '--out', type=str, default='./SPEC')
    parser.add_argument('-e', '--epoch', type=int, default=100)
    # Boolean-looking values are converted so that e.g. "--resume False" is
    # really False; any other string is passed through untouched.
    parser.add_argument('-r', '--resume', type=_flag_value, default=True)
    parser.add_argument('--hang', type=_flag_value, default=False)

    args = parser.parse_args()

    stamp = '{0:%Y%m%d%H%M}'.format(datetime.datetime.now())
    print(stamp)
    out = os.path.join(args.out, stamp)
    os.makedirs(out, exist_ok=True)

    dense_flag = False
    # Conversations [13:train_id] are used for training and [train_id:] for
    # validation. NOTE(review): why the first 13 are skipped is not visible here.
    train_id = 89

    if args.mode == 0:
        from SPEC.utils import setup
        df_list = setup(PATH=args.input, dense_flag=dense_flag)
        lld_dict = None
    else:
        from LLD.utils import setup
        df_list, lld_list = setup(PATH=args.input, dense_flag=dense_flag)
        lld_dict = {'train': lld_list[13:train_id], 'val': lld_list[train_id:]}

    # Conversations are not concatenated; each phase keeps one entry per conversation.
    df_dict = {'train': df_list[13:train_id], 'val': df_list[train_id:]}
    train_samples = []
    val_samples = []
    dataloaders_dict = {"train": train_samples, "val": val_samples}

    for phase, frames in df_dict.items():
        samples = dataloaders_dict[phase]

        for i, frame in enumerate(frames):
            if args.mode == 0:
                # The last 512 columns are split into two halves of 256,
                # paired below with speaker A and speaker B respectively.
                x = frame.iloc[:, -512:-256].values
                x_b = frame.iloc[:, -256:].values
            else:
                lld = lld_dict[phase][i]
                # The first 114 columns go with speaker A, the remaining ones
                # with speaker B; both are reshaped to (-1, 10, 114).
                x = lld.iloc[:, :114].values.reshape(-1, 10, 114)
                x_b = lld.iloc[:, 114:].values.reshape(-1, 10, 114)
            u = hang_over(1.0 - frame['utter_A'].values, flag=args.hang)
            samples.append((x, u))
            u = hang_over(1.0 - frame['utter_B'].values, flag=args.hang)
            samples.append((x_b, u))

    if not train_samples and not val_samples:
        raise ValueError('no conversations were loaded from {!r}'.format(args.input))

    # The input size is taken from the last feature array that was built.
    net = TimeActionPredict(
        input_size=x.shape[-1],
        hidden_size=64,
        mode=args.mode
    )
    print('Model :', net.__class__.__name__)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    # Train only the parameters whose name contains 'fc' or 'lstm'; freeze the rest.
    for name, param in net.named_parameters():
        if 'fc' in name or 'lstm' in name:
            param.requires_grad = True
            print("勾配計算あり。学習する:", name)
        else:
            param.requires_grad = False
            # Message corrected: this branch disables gradient computation.
            print("勾配計算なし。学習しない:", name)

    train_lstm(net=net,
               dataloaders_dict=dataloaders_dict,
               criterion=criterion,
               optimizer=optimizer,
               num_epochs=args.epoch,
               output=out,
               resume=args.resume)
Beispiel #6
0
def run_model(name, use_elmo=False, save_predictions=False, save_model=False):
    """
    Trains the given deep learning model on train set, and evaluates on test set.

    Three folds are processed; for each one the train/validation/test CSV
    files are read from DATA_ROOT and the per-fold metrics are averaged.

    Parameters
    ----------
    name: str
        name of the deep learning model to be run: lstm | bilstm | stacked_bilstm | cnn | bert
    use_elmo: bool
        use ELMo embeddings if True | GloVe embeddings if False.
        Ignored when name == 'bert', which always uses the BERT indexer.
    save_predictions: bool
        If True, stores and returns the predicted probabilities mapped to sentence ID
    save_model: bool
        If True, saves the trained model along with its vocabulary under ./tmp/

    Returns
    -------
    F1-score, Precision, Recall, Accuracy, Area Under Precision-Recall Curve on the
    test set (each averaged over the folds); dictionary mapping predictions to ID
    (empty unless save_predictions is True), and number of training epochs for each fold.

    Exits through sys.exit when 'name' is not valid or when test set IDs
    overlap across folds.
    """
    global bert_token_indexer

    # name -> (num_layers, bidirectional) for the LSTM family.
    lstm_variants = {
        'lstm': (1, False),
        'bilstm': (1, True),
        'stacked_bilstm': (2, True),
    }

    # The name does not depend on the fold, so reject it before reading any data.
    if name not in lstm_variants and name not in ('cnn', 'bert'):
        sys.exit("'name' not valid")

    # token_indexer maps tokens to integers; using special built-in indexers for
    # ELMo and BERT to ensure mapping is consistent with the original models.
    # 'bert' is checked first so that its indexer is always created, even when
    # use_elmo is True; otherwise the reader below would reference a
    # bert_token_indexer that was never assigned.
    if name == 'bert':  # BERT uses a special wordpiece tokenizer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL,
                                                   do_lowercase=True)
        reader = data_reader.GeneralizationDatasetReader(
            tokenizer=tokenizer_bert,
            token_indexers={"tokens": bert_token_indexer},
            label_cols=LABEL_COLS)
    else:
        if use_elmo:
            token_indexer = ELMoTokenCharactersIndexer()
        else:
            token_indexer = SingleIdTokenIndexer()
        reader = data_reader.GeneralizationDatasetReader(
            tokenizer=tokenizer,
            token_indexers={"tokens": token_indexer},
            label_cols=LABEL_COLS)

    map_id_pred_probability = {}  # used if save_predictions is True
    f1s, precision_s, recall_s, accuracies, AUPRCs, n_epochs = [], [], [], [], [], []

    for fold_number in range(1, 4):  # 3-fold cross validation
        train_fname = 'train_data_fold_{}.csv'.format(fold_number)
        val_fname = 'val_data_fold_{}.csv'.format(fold_number)
        test_fname = 'test_data_fold_{}.csv'.format(fold_number)

        train_dataset = reader.read(file_path=DATA_ROOT / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / test_fname)

        # Train the model:
        if name in lstm_variants:
            num_layers, bidirectional = lstm_variants[name]
            model, vocab, ep = train.train_lstm(train_dataset,
                                                validation_dataset,
                                                BATCH_SIZE,
                                                num_layers=num_layers,
                                                bidirectional=bidirectional,
                                                use_elmo=use_elmo)
        elif name == 'cnn':
            model, vocab, ep = train.train_cnn(train_dataset,
                                               validation_dataset,
                                               BATCH_SIZE,
                                               num_filters=100,
                                               filter_sizes=(2, 3, 4, 5),
                                               use_elmo=use_elmo)
        else:  # 'bert' (all other names were rejected above)
            model, vocab, ep = train.train_bert(train_dataset,
                                                validation_dataset,
                                                BATCH_SIZE,
                                                pretrained_model=BERT_MODEL)

        n_epochs.append(ep)  # keep track of number of actual training epochs for each fold

        # Predict and evaluate the model on test set.
        # Note that 'preds' is of the shape (number of samples, 2) - the columns
        # represent the probabilities for the two classes ['generalization', 'neutral']
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE)
        f1, precision, recall, acc, auprc = evaluate.compute_metrics(
            preds, test_dataset)

        if save_predictions:
            id_pred = evaluate.map_id_prediction(preds, test_dataset)
            # Sanity check: a test ID must not appear in more than one fold.
            if set(id_pred.keys()) & set(map_id_pred_probability.keys()):
                sys.exit(
                    "Error: There is overlap in test set IDs across folds.")
            map_id_pred_probability.update(id_pred)

        if save_model:  # save the model weights and vocabulary
            with open('./tmp/{}_model_fold_{}.th'.format(name, fold_number), 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/{}_vocabulary_fold_{}".format(name, fold_number))

        print("\nFold #{} | F1 = {}".format(fold_number, f1))
        f1s.append(f1)
        precision_s.append(precision)
        recall_s.append(recall)
        accuracies.append(acc)
        AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean()
    mean_precision = np.array(precision_s).mean()
    mean_recall = np.array(recall_s).mean()
    mean_accuracy = np.array(accuracies).mean()
    mean_auprc = np.array(AUPRCs).mean()

    print("Total # predictions: {} | Saving Predictions = {}".format(
        len(map_id_pred_probability), save_predictions))

    return mean_f1, mean_precision, mean_recall, mean_accuracy, mean_auprc, map_id_pred_probability, n_epochs
    reader = data_reader.NovelDatasetReader(
        scenario=case,
        augmentation=augmentation,
        tokenizer=tokenizer,
        token_indexers={"tokens": token_indexer})
    train_dataset = reader.read(file_path='train')
    test_dataset = reader.read(file_path='test')
    print("Train: ", len(train_dataset), "| Test:", len(test_dataset))

    print("\n#####################################################\n")

    # Train model:
    if name == 'lstm':
        model, vocab, ep = train.train_lstm(train_dataset,
                                            BATCH_SIZE,
                                            epochs=15,
                                            num_layers=1,
                                            bidirectional=False,
                                            use_elmo=use_elmo)
    elif name == 'bilstm':
        model, vocab, ep = train.train_lstm(train_dataset,
                                            BATCH_SIZE,
                                            epochs=15,
                                            num_layers=1,
                                            bidirectional=True,
                                            use_elmo=use_elmo)
    elif name == 'stacked_bilstm':
        model, vocab, ep = train.train_lstm(train_dataset,
                                            BATCH_SIZE,
                                            epochs=15,
                                            num_layers=2,
                                            bidirectional=True,
Beispiel #8
0
    # 处理dev数据
    d_data_list_node = dp.datadeal('data/raw.clean.dev', is_traindata=False)
    # 处理test数据
    test_data_node = dp.datadeal('data/raw.clean.test', is_traindata=False)

    if args.out_word_v:
        args.word_embed = out_word_vec.add_word_v(data_v)

    args.embed_num = len(data_v)
    args.class_num = len(lab_v)

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        if attr == 'word_embed':
            continue
        print("\t{}={}".format(attr.upper(), value))

    return t_data_list_node, data_v, d_data_list_node, lab_v


if __name__ == "__main__":
    # Script entry point: load the data first, then build the model from the
    # module-level ``args`` and hand everything to the training routine.
    loaded = loaddata()
    train_data_list_node, data_voc, dev_data_list_node, lab_voc = loaded
    lstm = model_LSTM.LSTM(args)

    training_inputs = (train_data_list_node, data_voc, dev_data_list_node,
                       lab_voc, lstm, args)
    try:
        train.train_lstm(*training_inputs)
    except KeyboardInterrupt:
        # Ctrl-C ends training with a short notice instead of a traceback.
        print('\nstop by human!!!')