def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(0) best_dev = -10 # data.HP_iteration = 1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole sample_loss += loss.data[0] total_loss += loss.data[0] if end % 500 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print( " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) sys.stdout.flush() sample_loss = 0 loss.backward() optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) # continue speed, acc, p, r, f, _, _ = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f)) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc)) if current_score > best_dev: if data.seg: print("Exceed previous best f score:", best_dev) else: print("Exceed previous best acc score:", best_dev) model_name = data.model_dir + '.' 
+ str(idx) + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score # ## decode test speed, acc, p, r, f, _, _ = evaluate(data, model, "test") test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc, p, r, f)) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc)) gc.collect()
def train(data, decode_data, args):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr,
                              momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_score = -10
    best_epoch = 0
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                      % (end, temp_cost, sample_loss, right_token, whole_token,
                         (right_token + 0.) / whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
              % (end, temp_cost, sample_loss, right_token, whole_token,
                 (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
              % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        dev_finish = time.time()
        if data.eval_type == "CONLLU":
            lookup = processing.dump_into_lookup(data.dev_gold)
        # save the model
        model_name = data.model_dir + ".model"
        torch.save(model.state_dict(), model_name)
        # decode the saved model
        decode_data.load(decode_data.dset_dir)
        decode_data.read_config(args.decode)
        decode_data.generate_instance('raw')
        decode_results, pred_scores, probs, _, _ = load_model_decode(decode_data, 'raw')
        decode_data.write_decoded_results2(decode_results, probs, 'raw')
        test_finish = time.time()
        gc.collect()
        dev_enc = l.Encoding(data.encoding, data.postag_type)
        dict_encoded, all_sent, text = dev_enc.encode(data.dev_gold)
        processing.write_to_conllu(dict_encoded, data.dev_enc_dep2label, 0)
        diction, words = dev_enc.decode(decode_data.output_nn, data.encoding, all_sent)
        processing.write_to_conllu(diction, "final.conllu", text)
        # EVALUATE AND SAVE THE MODEL WITH THE HIGHEST UAS SCORE ON THE DEV SET
        if data.eval_type == "CONLLU":
            processing.merge_lookup("final.conllu", lookup)
        score = -10.0
        if data.eval_type == 'CONLL':
            call("perl eval07.pl -p -q -g " + data.dev_gold
                 + " -s final.conllu | grep Unlabeled | cut -d ' ' -f 12 > score.txt",
                 shell=True)
            with open('score.txt') as f:
                score = float(f.read())
        elif data.eval_type == 'CONLLU':
            subparser = argparse.ArgumentParser()
            subparser.add_argument("gold_file", type=str,
                                   help="Name of the CoNLL-U file with the gold data.")
            subparser.add_argument("system_file", type=str,
                                   help="Name of the CoNLL-U file with the predicted data.")
            subparser.add_argument("--verbose", "-v", default=False, action="store_true",
                                   help="Print all metrics.")
            subparser.add_argument("--counts", "-c", default=False, action="store_true",
                                   help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
            subargs = subparser.parse_args([data.dev_gold, "final.conllu"])
            # Evaluate
            evaluation = conll18.evaluate_wrapper(subargs)
            score = 100 * evaluation["UAS"].f1
        else:
            print('Invalid option for --eval-type')
            exit(1)
        if score > best_score:
            best_score = score
            print("Exceeds. New best score: " + repr(score))
            best_epoch = idx
            best_model_name = data.model_dir + "_best.model"
            torch.save(model.state_dict(), best_model_name)
        print("Best score so far " + repr(best_score))
        print("Best epoch " + repr(best_epoch))
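
# predict_check is the other helper these loops assume: token-level accuracy
# of the decoded tag sequence against gold, counting only unmasked
# (non-padding) positions. A sketch under that assumption:
import numpy as np

def predict_check(pred_variable, gold_variable, mask_variable):
    pred = pred_variable.cpu().data.numpy()
    gold = gold_variable.cpu().data.numpy()
    mask = mask_variable.cpu().data.numpy()
    # Count positions where the prediction matches gold, restricted to real tokens.
    right_token = np.sum((pred == gold) * mask)
    whole_token = mask.sum()
    return right_token, whole_token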
def train(data): print "Training model..." data.show_data_summary() save_data_name = data.model_dir +".dset" data.save(save_data_name) model = SeqModel(data) if data.pretrained_model is not None: model_dict = model.state_dict() #We load the weights for the layers that we have pretrained (e.g. for language modeling) pretrained_dict = torch.load(data.pretrained_model) pretrained_dict = {k: v for k, v in pretrained_dict.items() if data.pretrained_part == data.PRETRAINED_ALL or (data.pretrained_part == data.PRETRAINED_LSTMS and "hidden2tagList" not in k)} # We overwrite entries in the existing state dict model_dict.update(pretrained_dict) # We load the new state dict model.load_state_dict(model_dict) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": #optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum) optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s"%(data.optimizer)) exit(0) best_dev = -10 range_valid_tasks = range(data.HP_tasks) for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" %(idx,data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 sample_loss = {idtask: 0 for idtask in range(data.HP_tasks)} right_token = {idtask: 0 for idtask in range(data.HP_tasks)} whole_token = {idtask: 0 for idtask in range(data.HP_tasks)} random.shuffle(data.train_Ids) #We get the indexes where are the samples of each (shuffled) treebank if data.disjoint: treebank_indexes = {} for idxsample, sample in enumerate(data.train_Ids): if sample[-1] not in treebank_indexes: treebank_indexes[sample[-1]] = [] treebank_indexes[sample[-1]].append(idxsample) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num//batch_size+1 if data.disjoint: tb_idxs = {tb:(0,batch_size) for tb in treebank_indexes} for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num if data.disjoint: eligible_treebanks = [t for t in treebank_indexes if tb_idxs[t][0] < len(treebank_indexes[t]) and idx < data.ignore_after_epoch[t] ] if eligible_treebanks == []: break tb = random.choice(eligible_treebanks) range_valid_tasks = data.dataset_ids[tb] idx_init, idx_end = tb_idxs[tb] train_idxs = treebank_indexes[tb][idx_init:idx_end] instance = [data.train_Ids[idx_ins] for idx_ins in train_idxs] #data.train_Ids[train_idxs] tb_idxs[tb] = (idx_end, idx_end+batch_size) else: instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, inference=False) instance_count += 1 loss, losses, tag_seq = 
model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, range_valid_tasks, inference=False) log=True for idtask in range_valid_tasks: right, whole = predict_check(tag_seq[idtask], batch_label[idtask], mask) sample_loss[idtask]+= losses[idtask].data[0] right_token[idtask]+=right whole_token[idtask]+=whole if end%500 == 0 and log: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time log = False if sample_loss[idtask] > 1e8 or str(sample_loss) == "nan": print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." exit(0) sys.stdout.flush() for aux_idtask in range(data.HP_tasks): if whole_token[aux_idtask] == 0: print ("Task %d (no samples found)"%(aux_idtask)) else: if data.inv_dataset_ids[aux_idtask] in eligible_treebanks: print("Task %d %s Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(aux_idtask,data.inv_dataset_ids[aux_idtask],end, temp_cost, sample_loss[aux_idtask], right_token[aux_idtask], whole_token[aux_idtask],(right_token[aux_idtask]+0.)/whole_token[aux_idtask])) else: print("Task %d %s does not contain more samples; loss: %4f"%(aux_idtask,data.inv_dataset_ids[aux_idtask], losses[aux_idtask].data[0])) sample_loss[aux_idtask] = 0 total_loss += loss.data[0] loss.backward() optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start for aux_idtask in range(data.HP_tasks): if whole_token[aux_idtask] == 0: print ("Task %d (no samples found)"%(aux_idtask)) else: name_tb = data.inv_dataset_ids[aux_idtask] print("Task %d %s Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(aux_idtask,name_tb,len(treebank_indexes[name_tb]), temp_cost, sample_loss[aux_idtask], right_token[aux_idtask], whole_token[aux_idtask],(right_token[aux_idtask]+0.)/whole_token[aux_idtask])) sample_loss[aux_idtask] = 0 epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx,epoch_cost, train_num/epoch_cost, total_loss)) if total_loss > 1e8 or str(total_loss) == "nan": print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." 
exit(0) summary = evaluate(data,model, "dev", False, False) dev_finish = time.time() dev_cost = dev_finish - epoch_finish current_scores = [] for idtask in xrange(0, data.HP_tasks): speed,acc,p,r,f,pred_labels,_,valid_indexes = summary[idtask] if data.seg: current_score = f current_scores.append(f) print("Task %d Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(idtask, dev_cost, speed, acc, p, r, f)) else: current_score = acc current_scores.append(acc) print("Task %d Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(idtask, dev_cost, speed, acc)) pred_results_tasks = [] pred_scores_tasks = [] pred_las_tasks = [] valid_indexes = None for idtask in xrange(data.HP_tasks): speed, acc, p, r, f, pred_results, pred_scores, pred_indexes = summary[idtask] pred_results_tasks.append(pred_results) pred_scores_tasks.append(pred_scores_tasks) if idtask in data.task_metric and data.task_metric[idtask] == "LAS": pred_las_tasks.append(pred_results) valid_indexes = pred_indexes with tempfile.NamedTemporaryFile() as f_decode_mt: with tempfile.NamedTemporaryFile() as f_decode_st: # If we are learning multiple task we move it as a sequence labeling if data.HP_main_tasks > 1: data.decode_dir = f_decode_mt.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_las_tasks, 'dev', indexes=valid_indexes) split_char = "{}" else: if data.decode_dir is None: data.decode_dir = f_decode_st.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_las_tasks, 'dev', indexes=valid_indexes) split_char = "@" output_nn = open(data.decode_dir) tmp = tempfile.NamedTemporaryFile().name if data.offset: decode_dependencies.decode_combined_tasks(output_nn, tmp, split_char) else: print("decoding single task") decode_dependencies.decode(output_nn, tmp, split_char) current_score = decode_dependencies.evaluate_dependencies(data.gold_dev_dep, tmp) print "Current Score (from LAS)", current_score, "Previous best dev (from LAS)", best_dev if current_score > best_dev: if data.seg: print "Exceed previous best f score:", best_dev else: print "Exceed previous best acc score:", best_dev model_name = data.model_dir +".model" print "Overwritting model to", model_name torch.save(model.state_dict(), model_name) best_dev = current_score else: print("sofar the best "+repr(best_dev)) if data.HP_tasks_weight_decays: print "Updating the weights using linear weight decay. ", print "The old weights were", data.HP_tasks_weights, data.HP_tasks_weights =[max(weight-decay,0) for weight,decay in zip(data.HP_tasks_weights, data.HP_tasks_weight_decays)] print ". The new weights are", data.HP_tasks_weights model.set_tasks_weights(data.HP_tasks_weights) gc.collect()
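
# Every train() variant in this file repeats the same if/elif optimizer
# dispatch. A possible table-driven refactor (illustrative only, not part of
# any of the original repositories):
def build_optimizer(model, data):
    make = {
        "sgd": lambda params: optim.SGD(params, lr=data.HP_lr,
                                        momentum=data.HP_momentum,
                                        weight_decay=data.HP_l2),
        "adagrad": lambda params: optim.Adagrad(params, lr=data.HP_lr,
                                                weight_decay=data.HP_l2),
        "adadelta": lambda params: optim.Adadelta(params, lr=data.HP_lr,
                                                  weight_decay=data.HP_l2),
        "rmsprop": lambda params: optim.RMSprop(params, lr=data.HP_lr,
                                                weight_decay=data.HP_l2),
        "adam": lambda params: optim.Adam(params, lr=data.HP_lr,
                                          weight_decay=data.HP_l2),
    }
    name = data.optimizer.lower()
    if name not in make:
        raise ValueError("Optimizer illegal: %s" % data.optimizer)
    return make[name](model.parameters())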
epoch = config_dic.get("ner_epoch") for epoch_i in range(epoch): print("Epoch: %s/%s" % (epoch_i, epoch)) lr_scheduler.step() #print(f"Learning Rate: {lr_scheduler.rate()}") print(f"Learning Rate: {lr_scheduler.get_lr()}") random_ids = list(range(len(train_word_documents))) random.shuffle(random_ids) ##################### Batch Initialize ############################ total_loss, batch_ave_loss, right_token, total_token = 0, 0, 0, 0 batch_size = config_dic.get("ner_batch_size") batch_steps = len(train_word_documents) // batch_size + 1 seq_model.train() seq_model.zero_grad() optimizer.zero_grad() for batch_i in tqdm(range(batch_steps)): #lr_scheduler.step() start_time = time.time() batch_ids = random_ids[batch_i * batch_size:(batch_i + 1) * batch_size] batch_word_documents = [train_word_documents[i] for i in batch_ids] batch_label_documents = [ train_label_documents[i] for i in batch_ids ] word_features = get_word_features(batch_word_documents, word_dic, config_dic.get("gpu")) #char_features = get_char_features(batch_word_documents, char_dic, config_dic.get("gpu")) sw_features_list = []
def train(train_data):
    print("Training model...")
    train_data.show_data_summary()
    save_data_name = train_data.init_dir + ".init"
    train_data.save(save_data_name)
    model = SeqModel(train_data)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    if train_data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=train_data.HP_lr,
                              momentum=train_data.HP_momentum, weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=train_data.HP_lr,
                                  weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=train_data.HP_lr,
                                   weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=train_data.HP_lr,
                                  weight_decay=train_data.HP_l2)
    elif train_data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=train_data.HP_lr,
                               weight_decay=train_data.HP_l2)
    else:
        print("Optimizer illegal: %s" % train_data.optimizer)
        exit(1)
    best_dev = -10
    dev_f = []
    test_f = []
    best_epoch = 0
    for idx in range(train_data.HP_iteration):
        epoch_start = time.time()
        print("Epoch: %s/%s" % (idx, train_data.HP_iteration))
        if train_data.optimizer.lower() == "sgd":
            optimizer = lr_decay(optimizer, idx, train_data.HP_lr_decay, train_data.HP_lr)
        random.shuffle(train_data.ner_1_train_idx)
        random.shuffle(train_data.ner_2_train_idx)
        random.shuffle(train_data.lm_1_idx)
        random.shuffle(train_data.lm_2_idx)
        model.train()
        model.zero_grad()
        ner_1_loss = 0
        ner_2_loss = 0
        lm_1_perplexity = 0
        lm_2_perplexity = 0
        ner_1_batch_size = train_data.HP_batch_size
        batch_nums = len(train_data.ner_1_train_idx) // ner_1_batch_size + 1
        ner_2_batch_size = len(train_data.ner_2_train_idx) // batch_nums
        lm_1_batch_size = len(train_data.lm_1_idx) // batch_nums
        lm_2_batch_size = len(train_data.lm_2_idx) // batch_nums
        print("batch size: ", ner_1_batch_size, ner_2_batch_size,
              lm_1_batch_size, lm_2_batch_size)
        for batch_id in range(batch_nums):
            # Python slices clamp at the sequence end, so the conditional upper
            # bounds of the original code are unnecessary.
            ner_1_data = train_data.ner_1_train_idx[batch_id * ner_1_batch_size:(batch_id + 1) * ner_1_batch_size]
            ner_2_data = train_data.ner_2_train_idx[batch_id * ner_2_batch_size:(batch_id + 1) * ner_2_batch_size]
            lm_1_data = train_data.lm_1_idx[batch_id * lm_1_batch_size:(batch_id + 1) * lm_1_batch_size]
            lm_2_data = train_data.lm_2_idx[batch_id * lm_2_batch_size:(batch_id + 1) * lm_2_batch_size]
            if not ner_1_data:
                # guard against an empty final batch
                continue
            ner_1_batch_data = batchify_with_label(ner_1_data, train_data.HP_gpu)
            if train_data.mode == 'supervised':
                ner_2_batch_data = batchify_with_label(ner_2_data, train_data.HP_gpu)
            lm_1_batch_data = batchify_with_label(lm_1_data, train_data.HP_gpu)
            lm_2_batch_data = batchify_with_label(lm_2_data, train_data.HP_gpu)
            losses = []
            perplexities = []
            # batch layout: word_seq_tensor, word_seq_lengths, word_seq_recover,
            # char_seq_tensor, char_seq_lengths, char_seq_recover,
            # label_seq_tensor, lm_seq_tensor, mask
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('ner1', ner_1_batch_data[0], ner_1_batch_data[1], ner_1_batch_data[3],
                           ner_1_batch_data[4], ner_1_batch_data[5], ner_1_batch_data[6],
                           ner_1_batch_data[7], ner_1_batch_data[8])
            losses.append(loss)
            if train_data.mode == 'supervised':
                loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                    model.loss('ner2', ner_2_batch_data[0], ner_2_batch_data[1], ner_2_batch_data[3],
                               ner_2_batch_data[4], ner_2_batch_data[5], ner_2_batch_data[6],
                               ner_2_batch_data[7], ner_2_batch_data[8])
                losses.append(loss)
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('lm1', lm_1_batch_data[0], lm_1_batch_data[1], lm_1_batch_data[3],
                           lm_1_batch_data[4], lm_1_batch_data[5], lm_1_batch_data[6],
                           lm_1_batch_data[7], lm_1_batch_data[8])
            losses.append(loss)
            perplexities.append(perplexity)
            loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = \
                model.loss('lm2', lm_2_batch_data[0], lm_2_batch_data[1], lm_2_batch_data[3],
                           lm_2_batch_data[4], lm_2_batch_data[5], lm_2_batch_data[6],
                           lm_2_batch_data[7], lm_2_batch_data[8])
            losses.append(loss)
            perplexities.append(perplexity)
            model_loss = 0
            loss_rate = [0.8, 1, 0.5, 0.5] if train_data.mode == 'supervised' else [1, 1, 1]
            for loss_id in range(len(losses)):
                model_loss += losses[loss_id] * loss_rate[loss_id]
            model_loss.backward()
            optimizer.step()
            model.zero_grad()
            ner_1_loss += losses[0].item()
            if train_data.mode == 'supervised':
                # In unsupervised mode losses has only three entries
                # (ner1, lm1, lm2), so losses[1] would be the lm1 loss.
                ner_2_loss += losses[1].item()
            lm_1_perplexity += perplexities[0].item()
            lm_2_perplexity += perplexities[1].item()
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs." % (idx, epoch_cost))
        print("ner 1 total loss: %s" % ner_1_loss)
        if train_data.mode == 'supervised':
            print("ner 2 total loss: %s" % ner_2_loss)
        print("lm 1 perplexity: %.4f" % math.exp(lm_1_perplexity / batch_nums))
        print("lm 2 perplexity: %.4f" % math.exp(lm_2_perplexity / batch_nums))
        if ner_1_loss > 1e8 or str(ner_1_loss) == "nan" \
                or ner_2_loss > 1e8 or str(ner_2_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        evaluate('ner1', train_data.ner_1_dev_idx, train_data.label_alphabet_ner_1,
                 train_data, model)
        if train_data.mode == 'supervised':
            p, r, f = evaluate('ner2', train_data.ner_2_dev_idx,
                               train_data.label_alphabet_ner_2, train_data, model)
        else:
            p, r, f = evaluate('ner2', train_data.ner_2_dev_idx,
                               train_data.label_alphabet_ner_1, train_data, model)
        dev_f.append(f)
        if f > best_dev:
            best_epoch = idx
            print("Exceed previous best f score:", best_dev)
            model_name = train_data.model_dir + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = f
        evaluate('ner1', train_data.ner_1_test_idx, train_data.label_alphabet_ner_1,
                 train_data, model)
        if train_data.mode == 'supervised':
            p, r, f = evaluate('ner2', train_data.ner_2_test_idx,
                               train_data.label_alphabet_ner_2, train_data, model)
        else:
            p, r, f = evaluate('ner2', train_data.ner_2_test_idx,
                               train_data.label_alphabet_ner_1, train_data, model)
        test_f.append(f)
    print("the best dev score is in epoch %s, dev:%.4f, test:%.4f"
          % (best_epoch, dev_f[best_epoch], test_f[best_epoch]))
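
# The batchify_with_label helpers differ across the variants in this file
# (most also return feature, character and LM tensors). A minimal
# word-level-only sketch of the common core, assuming each instance is
# (word_ids, ..., label_ids): pad to the longest sentence, sort by length for
# pack_padded_sequence, and build a padding mask plus a recover index that
# undoes the sort.
def batchify_words_sketch(instances, gpu):
    words = [ins[0] for ins in instances]
    labels = [ins[-1] for ins in instances]
    lengths = torch.LongTensor([len(w) for w in words])
    max_len = int(lengths.max())
    word_t = torch.zeros(len(words), max_len, dtype=torch.long)
    label_t = torch.zeros(len(words), max_len, dtype=torch.long)
    mask = torch.zeros(len(words), max_len, dtype=torch.bool)
    for i, (w, lab) in enumerate(zip(words, labels)):
        n = int(lengths[i])
        word_t[i, :n] = torch.LongTensor(w)
        label_t[i, :n] = torch.LongTensor(lab)
        mask[i, :n] = True
    # Sort by descending length; recover maps sorted rows back to input order.
    lengths, perm = lengths.sort(0, descending=True)
    word_t, label_t, mask = word_t[perm], label_t[perm], mask[perm]
    _, recover = perm.sort(0)
    if gpu:
        word_t, label_t, mask, lengths, recover = (
            word_t.cuda(), label_t.cuda(), mask.cuda(), lengths.cuda(), recover.cuda())
    return word_t, lengths, recover, label_t, mask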
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("--------pytorch total params--------") print(pytorch_total_params) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev = -10 best_test = -10 best_epoch = -1 no_imprv_epoch = 0 # data.HP_iteration = 1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) # print (self.train_Ids) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue # label_instance = [[i for i in range(0, data.label_alphabet_size + 1)] for _ in range(len(instance))] batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, input_label_seq_tensor = batchify_with_label( instance, data.HP_gpu, data.label_alphabet_size) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, input_label_seq_tensor) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole sample_loss += loss.data[0] total_loss += loss.data[0] if end % 500 == 0: # temp_time = time.time() # temp_cost = temp_time - temp_start # temp_start = temp_time # print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) if sample_loss > 1e8 or str(sample_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." ) exit(1) sys.stdout.flush() sample_loss = 0 loss.backward() if data.whether_clip_grad: from torch.nn.utils import clip_grad_norm clip_grad_norm(model.parameters(), data.clip_grad) optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) 
/ whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) print("totalloss:", total_loss) if total_loss > 1e8 or str(total_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." ) # exit(1) # continue speed, acc, p, r, f, _, _ = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f)) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc)) # ## decode test speed, acc_test, p, r, f_test, _, _ = evaluate(data, model, "test") test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc_test, p, r, f_test)) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc_test)) if current_score > best_dev: if data.seg: best_test = f_test print("Exceed previous best f score:", best_dev) else: best_test = acc_test print("Exceed previous best acc score:", best_dev) best_epoch = idx # model_name = data.model_dir +'.'+ str(idx) + ".model" # print("Save current best model in file:", model_name) # torch.save(model.state_dict(), model_name) best_dev = current_score no_imprv_epoch = 0 else: # early stop no_imprv_epoch += 1 if no_imprv_epoch >= 10: print("early stop") print("Current best f score in dev", best_dev) print("Current best f score in test", best_test) break if data.seg: print("Current best f score in dev", best_dev) print("Current best f score in test", best_test) else: print("Current best acc score in dev", best_dev) print("Current best acc score in test", best_test) gc.collect()
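
# The clip branch above sits between loss.backward() and optimizer.step(),
# the only place gradient clipping has any effect. For reference,
# clip_grad_norm_ (in-place, PyTorch >= 0.4) replaced the deprecated
# clip_grad_norm used by the original code. A hedged helper:
from torch.nn.utils import clip_grad_norm_

def clipped_step(model, optimizer, loss, max_norm):
    loss.backward()
    # Rescale all gradients so their global L2 norm is at most max_norm.
    clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    model.zero_grad()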
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) for name, param in model.named_parameters(): if param.requires_grad: print(name) if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev_f1 = -1 test_f1 = [] best_epoch = 0 # start training for idx in range(data.HP_iteration): epoch_start = time.time() print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 total_perplexity_1 = 0 total_perplexity_2 = 0 total_loss_1 = 0 total_loss_2 = 0 total_loss_3 = 0 total_loss_4 = 0 random.shuffle(data.source_train_idx) random.shuffle(data.target_train_idx) random.shuffle(data.source_lm_idx) random.shuffle(data.target_lm_idx) model.train() model.zero_grad() batch_size_1 = data.HP_batch_size train_num_1 = len(data.source_train_idx) train_num_2 = len(data.target_train_idx) train_num_3 = len(data.source_lm_idx) train_num_4 = len(data.target_lm_idx) batch_num = train_num_1 // batch_size_1 + 1 batch_size_2 = train_num_2 // batch_num batch_size_3 = train_num_3 // batch_num batch_size_4 = train_num_4 // batch_num for batch_id in range(batch_num): instance_1 = data.source_train_idx[batch_id * batch_size_1: (batch_id + 1) * batch_size_1 if ((batch_id + 1) * batch_size_1) < train_num_1 else train_num_1] instance_2 = data.target_train_idx[batch_id * batch_size_2: (batch_id + 1) * batch_size_2 if ((batch_id + 1) * batch_size_2) < train_num_2 else train_num_2] instance_3 = data.source_lm_idx[batch_id * batch_size_3: (batch_id + 1) * batch_size_3 if ((batch_id + 1) * batch_size_3) < train_num_3 else train_num_3] instance_4 = data.target_lm_idx[batch_id * batch_size_4: (batch_id + 1) * batch_size_4 if ((batch_id + 1) * batch_size_4) < train_num_4 else train_num_4] if not instance_1 or not instance_2: continue # NER batch_word_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, \ batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label(instance_1, data.HP_gpu) batch_word_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, \ batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label(instance_2, data.HP_gpu) # LM batch_word_3, batch_wordlen_3, batch_wordrecover_3, batch_char_3, batch_charlen_3, \ batch_charrecover_3, batch_label_3, lm_seq_tensor_3, mask_3 = batchify_with_label(instance_3 + instance_1, data.HP_gpu) batch_word_4, batch_wordlen_4, batch_wordrecover_4, batch_char_4, batch_charlen_4, \ batch_charrecover_4, batch_label_4, lm_seq_tensor_4, mask_4 = batchify_with_label(instance_4 + instance_2, data.HP_gpu) batch_word = [batch_word_1, batch_word_2, batch_word_3, batch_word_4] batch_wordlen = [batch_wordlen_1, batch_wordlen_2, batch_wordlen_3, 
batch_wordlen_4] batch_char = [batch_char_1, batch_char_2, batch_char_3, batch_char_4] batch_charlen = [batch_charlen_1, batch_charlen_2, batch_charlen_3, batch_charlen_4] batch_charrecover = [batch_charrecover_1, batch_charrecover_2, batch_charrecover_3, batch_charrecover_4] batch_label = [batch_label_1, batch_label_2, batch_label_3, batch_label_4] lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2, lm_seq_tensor_3, lm_seq_tensor_4] mask = [mask_1, mask_2, mask_3, mask_4] instance_count += 1 loss_ = [] perplexity_ = [] # source language model loss, perplexity, tag_seq = model.loss('model1', batch_word[2], batch_wordlen[2], batch_char[2], batch_charlen[2], batch_charrecover[2], batch_label[2], lm_seq_tensor[2], mask[2]) loss_.append(loss) perplexity_.append(perplexity) # source NER loss, perplexity, tag_seq = model.loss('model2', batch_word[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) loss_.append(loss) # target language model loss, perplexity, tag_seq = model.loss('model3', batch_word[3], batch_wordlen[3], batch_char[3], batch_charlen[3], batch_charrecover[3], batch_label[3], lm_seq_tensor[3], mask[3]) loss_.append(loss) perplexity_.append(perplexity) loss = 0 model_num = len(loss_) for loss_id in range(model_num): loss += loss_[loss_id] loss.backward() optimizer.step() model.zero_grad() total_loss_1 += loss_[0].data[0] total_loss_2 += loss_[1].data[0] total_loss_3 += loss_[2].data[0] total_perplexity_1 += perplexity_[0].data[0] total_perplexity_2 += perplexity_[1].data[0] epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start source_lm_perplexity = math.exp(total_perplexity_1 / batch_num) target_lm_perplexity = math.exp(total_perplexity_2 / batch_num) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % ( idx, epoch_cost, train_num_1 / epoch_cost, total_loss_2)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total perplexity: %.4f" % ( idx, epoch_cost, train_num_3 / epoch_cost, source_lm_perplexity)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % ( idx, epoch_cost, train_num_2 / epoch_cost, total_loss_4)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total perplexity: %.4f" % ( idx, epoch_cost, train_num_4 / epoch_cost, target_lm_perplexity)) if total_loss_1 > 1e8 or str(total_loss_1) == "nan" or total_loss_2 > 1e8 or str( total_loss_2) == "nan" or total_loss_3 > 1e8 or str(total_loss_3) == "nan" or total_loss_4 > 1e8 or str( total_loss_4) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! 
EXIT....") exit(1) # dev-test speed, acc, p, r, f, _, _ = evaluate(data, model, "dev-test") test_f1.append(f[1]) dev_finish = time.time() dev_cost = dev_finish - epoch_finish current_score = f[0] print("Dev-Source: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % ( dev_cost, speed, acc[0], p[0], r[0], f[0])) print("Test-Target: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % ( dev_cost, speed, acc[1], p[1], r[1], f[1])) if current_score > best_dev_f1: best_epoch = idx print("Exceed previous best f score:", best_dev_f1) model_name = data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev_f1 = current_score if current_score > 0.72: print("change optim sgd:") optimizer = optim.SGD(model.parameters(), lr=0.015, momentum=data.HP_momentum, weight_decay=data.HP_l2) print("The best Source-domain dev f-score: %.4f, Target-domain f-score: %.4f" % (best_dev_f1, test_f1[best_epoch]))
def train(data): print "Training model..." model = SeqModel(data) print "model:{}".format(model) if data.gpu: model.cuda() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.lr, momentum=data.momentum, weight_decay=data.l2) if data.use_mapping: optimizer_wc = optim.SGD(model.word_hidden.wordrep.w.parameters(), lr=data.lr, momentum=data.momentum, weight_decay=data.l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.lr, weight_decay=data.l2) if data.use_mapping: optimizer_wc = optim.Adam(model.word_hidden.wordrep.w.parameters(), lr=data.lr, weight_decay=data.l2) else: print("Optimizer illegal: %s , use sgd or adam." % data.optimizer) exit(0) best_dev = -10 best_dev_epoch = -1 best_test = -10 best_test_epoch = -1 # start training for idx in range(data.iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx + 1, data.iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.lr_decay, data.lr) if data.use_mapping: optimizer_wc = lr_decay(optimizer_wc, idx, data.lr_decay, data.lr) instance_count = 0 sample_id = 0 # sample_loss = 0 sample_mapping_loss = 0 total_loss = 0 total_mapping_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) # set model in train mode model.train() model.zero_grad() batch_size = data.batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, batch_trans, trans_seq_lengths, trans_seq_recover, mask = batchify_with_label( instance, data.gpu) instance_count += 1 loss, tag_seq, wc_loss = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, batch_trans, trans_seq_lengths, trans_seq_recover) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole #sample_loss += loss.data[0] sample_loss += loss.data.item() if data.use_mapping: sample_mapping_loss += wc_loss.data[0] #total_loss += loss.data[0] total_loss += loss.data.item() if data.use_mapping: total_mapping_loss += wc_loss.data[0] if batch_id % data.show_loss_per_batch == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time if data.use_mapping: print( " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (batch_id, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) else: print( " Instance: %s; Time: %.2fs; loss: %.4f;mapping_loss: %.4f; acc: %s/%s=%.4f" % (batch_id, temp_cost, sample_loss, sample_mapping_loss, right_token, whole_token, (right_token + 0.) 
/ whole_token)) sys.stdout.flush() sample_loss = 0 sample_mapping_loss = 0 if data.use_trans and data.use_mapping: for param in model.word_hidden.wordrep.w.parameters(): param.requires_grad = False loss.backward(retain_graph=True) if data.clip != None: torch.nn.utils.clip_grad_norm(model.parameters(), data.clip) optimizer.step() model.zero_grad() for param in model.word_hidden.wordrep.w.parameters(): param.requires_grad = True wc_loss.backward() optimizer_wc.step() model.zero_grad() else: loss.backward() # torch.nn.utils.clip_grad_norm(model.parameters(), data.clip) optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start if data.use_mapping: print( " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (batch_id, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) else: print( " Instance: %s; Time: %.2fs; loss: %.4f;mapping_loss: %.4f; acc: %s/%s=%.4f" % (batch_id, temp_cost, sample_loss, sample_mapping_loss, right_token, whole_token, (right_token + 0.) / whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start if data.use_mapping: print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s,total mapping loss: %s" % (idx + 1, epoch_cost, train_num / epoch_cost, total_loss, total_mapping_loss)) else: print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx + 1, epoch_cost, train_num / epoch_cost, total_loss)) # continue speed, acc, p, r, f, _, _ = evaluate(data, model, "dev", data.nbest) dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f)) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc)) if current_score > best_dev: if data.seg: print "Exceed previous best f score:", best_dev else: print "Exceed previous best acc score:", best_dev if data.save_model: model_name = data.model_dir + data.state_training_name + '.' + str( current_score)[2:-1] print "Save current best model in file:", model_name torch.save(model.state_dict(), model_name) best_dev = current_score best_dev_epoch = idx # ## decode test speed, acc, p, r, f, _, _ = evaluate(data, model, "test", data.nbest) if f > best_test: best_test = f best_test_epoch = idx test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc, p, r, f)) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc)) print('best_dev_score: %.4f, best_dev_epoch:%d' % (best_dev, best_dev_epoch)) print('best_test_score: %.4f, best_test_epoch:%d' % (best_test, best_test_epoch)) gc.collect()
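
# The mapping branch above freezes the mapping layer while the main optimizer
# steps on the tagging loss, then unfreezes it for its own optimizer step. The
# toggle can be factored out (an illustrative refactor, not the repo's API):
def set_requires_grad(module, flag):
    # Enable or disable gradient tracking for every parameter of a sub-module.
    for param in module.parameters():
        param.requires_grad = flag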
def train(data):
    ''' initialize model and train '''
    print('Training model...')
    data.show_data_summary()
    # save dset
    save_data_name = data.model_dir + '.dset'
    data.save(save_data_name)
    # save exportable model architecture (for deployment)
    data.save_export(data.model_dir + '.xpt')
    model = SeqModel(data)
    if data.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr,
                              momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print('Optimizer illegal: {}'.format(data.optimizer))
        exit(1)
    best_dev = -10
    ## start training
    for idx in range(data.iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch {}/{}".format(idx, data.iteration))
        if data.optimizer.lower() == 'sgd':
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.batch_size
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            instance_texts = data.train_texts[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, volatile_flag=False, label_flag=True)
            # print(batch_char.size())
            # print(batch_char.max())
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print('     Instance {}; Time {:.2f}s; loss {:.4f}; acc {}/{}={:.4f}'.format(
                    end, temp_cost, sample_loss, right_token, whole_token,
                    (right_token + 0.) / whole_token))
                if sample_loss > 1e8 or str(sample_loss) == 'nan':
                    print('ERROR: LOSS EXPLOSION (>1e8) ! Please set adapted parameters and structure! EXIT ...')
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print('     Instance {}; Time {:.2f}s; loss {:.4f}; acc {}/{}={:.4f}'.format(
            end, temp_cost, sample_loss, right_token, whole_token,
            (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print('  Epoch: {} training finished. Time: {:.2f}s; speed: {:.2f}st/s; total loss: {}'.format(
            idx, epoch_cost, train_num / epoch_cost, total_loss))
        if total_loss > 1e8 or str(total_loss) == 'nan':
            print('ERROR: LOSS EXPLOSION (>1e8) ! Please set adapted parameters and structure! EXIT ...')
            exit(1)
        # continue
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        # saving dev results for model analysis
        dev_res = (speed, acc, p, r, f)
        if data.seg:
            current_score = f
            print("Dev: time: {:.2f}s, speed {:.2f}st/s; acc: {:.4f}, p: {:.4f}, r: {:.4f}, f: {:.4f}".format(
                dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: {:.2f}s, speed {:.2f}st/s; acc: {:.4f}".format(dev_cost, speed, acc))
        # decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, 'test')
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: {:.2f}s, speed {:.2f}st/s; acc: {:.4f}, p: {:.4f}, r: {:.4f}, f: {:.4f}".format(
                test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: {:.2f}s, speed {:.2f}st/s; acc: {:.4f}".format(test_cost, speed, acc))
        if current_score > best_dev:
            if data.seg:
                print('Exceed previous best f score:', best_dev)
            else:
                print('Exceed previous best acc score:', best_dev)
            model_name = data.model_dir + '.' + str(idx) + '.model'
            print('Save current best model in file:', model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
            path2info = data.model_dir + '.infos'
            save_infos(data, dev_res, path2info)
        gc.collect()
    print('Training done!')
    return best_dev
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) for name, param in model.named_parameters(): if param.requires_grad: print(name) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev = -10 dev_f = [] test_f = [] perplexity_1 = [] perplexity_2 = [] best_epoch = 0 # data.HP_iteration = 1 LM_data = data.train_Ids_2 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 total_perplexity_1 = 0 total_perplexity_2 = 0 total_loss_1 = 0 total_loss_2 = 0 total_loss_3 = 0 total_loss_4 = 0 random.shuffle(data.train_Ids_1) random.shuffle(data.train_Ids_2) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 ###co-train for 4 models train_num_1 = len(data.train_Ids_1) train_num_2 = len(data.train_Ids_2) train_num_3 = len(LM_data) total_batch_1 = train_num_1 // batch_size + 1 batch_size_2 = train_num_2 // total_batch_1 l_batch_num_2 = train_num_2 - total_batch_1 * batch_size_2 start_2 = end_2 = 0 for batch_id in range(total_batch_1): start = batch_id * batch_size end = (batch_id + 1) * batch_size start_2 = end_2 if batch_id < l_batch_num_2: end_2 = start_2 + (batch_size_2 + 1) else: end_2 = start_2 + batch_size_2 if end > train_num_1: end = train_num_1 if end_2 > train_num_2: end_2 = train_num_2 instance_1 = data.train_Ids_1[start:end] instance_2 = data.train_Ids_2[start_2:end_2] if not instance_1 or not instance_2: continue #seq label batch_word_1, batch_features_1, batch_wordlen_1, batch_wordrecover_1, batch_char_1, batch_charlen_1, batch_charrecover_1, batch_label_1, lm_seq_tensor_1, mask_1 = batchify_with_label( instance_1, data.HP_gpu) batch_word_2, batch_features_2, batch_wordlen_2, batch_wordrecover_2, batch_char_2, batch_charlen_2, batch_charrecover_2, batch_label_2, lm_seq_tensor_2, mask_2 = batchify_with_label( instance_2, data.HP_gpu) batch_word = [batch_word_1, batch_word_2] batch_features = [batch_features_1, batch_features_2] batch_wordlen = [batch_wordlen_1, batch_wordlen_2] batch_char = [batch_char_1, batch_char_2] batch_charlen = [batch_charlen_1, batch_charlen_2] batch_charrecover = [batch_charrecover_1, batch_charrecover_2] batch_label = [batch_label_1, batch_label_2] lm_seq_tensor = [lm_seq_tensor_1, lm_seq_tensor_2] mask = [mask_1, mask_2] instance_count += 1 loss_ = [] perplexity_ = [] # LM 1 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model1', batch_word[0], batch_features[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) 
loss_.append(loss) perplexity_.append(perplexity) #seq label 1 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model2', batch_word[0], batch_features[0], batch_wordlen[0], batch_char[0], batch_charlen[0], batch_charrecover[0], batch_label[0], lm_seq_tensor[0], mask[0]) loss_.append(loss) # LM 2 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model3', batch_word[1], batch_features[1], batch_wordlen[1], batch_char[1], batch_charlen[1], batch_charrecover[1], batch_label[1], lm_seq_tensor[1], mask[1]) loss_.append(loss) perplexity_.append(perplexity) #seq label 2 loss, perplexity, tag_seq_forward, tag_seq_backward, tag_seq = model.loss( 'model4', batch_word[1], batch_features[1], batch_wordlen[1], batch_char[1], batch_charlen[1], batch_charrecover[1], batch_label[1], lm_seq_tensor[1], mask[1]) loss_.append(loss) loss_rate = [1.0, 1.0, 1.0, 2.0] loss = 0 model_num = len(loss_) for loss_id in range(model_num): loss += loss_[loss_id] * loss_rate[loss_id] loss.backward() optimizer.step() model.zero_grad() total_loss_1 += loss_[0].data[0] total_loss_2 += loss_[1].data[0] total_loss_3 += loss_[2].data[0] total_loss_4 += loss_[3].data[0] total_perplexity_1 += perplexity_[0].data[0] total_perplexity_2 += perplexity_[1].data[0] epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start LM_perplex_1 = math.exp(total_perplexity_1 / total_batch_1) LM_perplex_2 = math.exp(total_perplexity_2 / total_batch_1) perplexity_1.append(LM_perplex_1) perplexity_2.append(LM_perplex_2) print("Epoch: %s training finished. Time: %.2fs" % (idx, epoch_cost)) print("Epoch: %s training finished. Time: %.2fs, total loss: %s" % (idx, epoch_cost, total_loss_2)) print("totalloss:", total_loss_2) print( "Epoch: %s training finished. Time: %.2fs, total perplexity: %.4f" % (idx, epoch_cost, LM_perplex_1)) print("Epoch: %s training finished. Time: %.2fs, total loss: %s" % (idx, epoch_cost, total_loss_4)) print("totalloss:", total_loss_4) print( "Epoch: %s training finished. 
Time: %.2fs, total perplexity: %.4f" % (idx, epoch_cost, LM_perplex_2)) speed, acc, p, r, f, _, _ = evaluate(data, model, "dev") dev_f.append(f[1]) dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f[1] print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc[0], p[0], r[0], f[0])) print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc[1], p[1], r[1], f[1])) else: current_score = acc[1] print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc[0])) print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc[1])) if current_score > best_dev: best_epoch = idx if data.seg: print("Exceed previous best f score:", best_dev) else: print("Exceed previous best acc score:", best_dev) # model_name = data.model_dir +'.'+ str(idx) + ".model" model_name = data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score # ## decode test speed, acc, p, r, f, _, _ = evaluate(data, model, "test") test_f.append(f[1]) test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc[0], p[0], r[0], f[0])) print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc[1], p[1], r[1], f[1])) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc[0])) print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc[1])) gc.collect() print("the best dev score is in epoch %s, dev:%.4f, test:%.4f" % (best_epoch, dev_f[best_epoch], test_f[best_epoch])) with open('data/fscore_13PC.txt', 'w') as ft: ft.write('dev f scores:\n') for t in dev_f: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('test f scores:\n') for t in test_f: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('LM 1 perplexity:\n') for t in perplexity_1: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') ft.write('LM 2 perplexity:\n') for t in perplexity_2: ft.write(str(round(t, 6))) ft.write(' ') if data.task_emb_save_dir is not None: with open('data/task_emb.txt', 'w') as ft: for task, i in data.task_alphabet.iteritems(): ft.write(task) ft.write(' ') for t in model.word_hidden.LSTM_param_generator.task_emb_vocab.weight.data[ i]: ft.write(str(round(t, 6))) ft.write(' ') ft.write('\n') if data.domain_emb_save_dir is not None: with open('data/domain_emb.txt', 'w') as fd: for domain, i in data.domain_alphabet.iteritems(): fd.write(domain) fd.write(' ') for t in model.word_hidden.LSTM_param_generator.domain_emb_vocab.weight.data[ i]: fd.write(str(round(t, 6))) fd.write(' ') fd.write('\n')
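
# The perplexities written above are exp(mean per-batch LM loss). As a worked
# example: a summed LM loss of 1384.0 over 200 batches gives
# exp(1384.0 / 200) = exp(6.92) ~= 1012.3.
def epoch_perplexity(total_lm_loss, num_batches):
    return math.exp(total_lm_loss / num_batches)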
def pipeline(data, ner_dir, re_dir):
    seq_model = SeqModel(data)
    seq_wordseq = WordSequence(data, False, True, True, data.use_char)
    classify_wordseq = WordSequence(data, True, False, True, False)
    classify_model = ClassifyModel(data)
    if torch.cuda.is_available():
        classify_model = classify_model.cuda(data.HP_gpu)
    iter_parameter = itertools.chain(
        *map(list, [seq_wordseq.parameters(), seq_model.parameters()]))
    seq_optimizer = optim.Adam(iter_parameter, lr=opt.ner_lr, weight_decay=data.HP_l2)
    iter_parameter = itertools.chain(
        *map(list, [classify_wordseq.parameters(), classify_model.parameters()]))
    classify_optimizer = optim.Adam(iter_parameter, lr=opt.re_lr, weight_decay=data.HP_l2)
    if not data.tune_wordemb:
        my_utils.freeze_net(seq_wordseq.wordrep.word_embedding)
        my_utils.freeze_net(classify_wordseq.wordrep.word_embedding)
    # split relation training instances into positive and negative examples
    re_X_positive = []
    re_Y_positive = []
    re_X_negative = []
    re_Y_negative = []
    relation_vocab = data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']]
    my_collate = my_utils.sorted_collate1
    for i in range(len(data.re_train_X)):
        x = data.re_train_X[i]
        y = data.re_train_Y[i]
        if y != relation_vocab.get_index("</unk>"):
            re_X_positive.append(x)
            re_Y_positive.append(y)
        else:
            re_X_negative.append(x)
            re_Y_negative.append(y)
    re_test_loader = DataLoader(
        my_utils.RelationDataset(data.re_test_X, data.re_test_Y),
        data.HP_batch_size, shuffle=False, collate_fn=my_collate)
    best_ner_score = -1
    best_re_score = -1
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        seq_wordseq.train()
        seq_wordseq.zero_grad()
        seq_model.train()
        seq_model.zero_grad()
        classify_wordseq.train()
        classify_wordseq.zero_grad()
        classify_model.train()
        classify_model.zero_grad()
        batch_size = data.HP_batch_size
        random.shuffle(data.train_Ids)
        ner_train_num = len(data.train_Ids)
        ner_total_batch = ner_train_num // batch_size + 1
        re_train_loader, re_train_iter = makeRelationDataset(
            re_X_positive, re_Y_positive, re_X_negative, re_Y_negative,
            data.unk_ratio, True, my_collate, data.HP_batch_size)
        re_total_batch = len(re_train_loader)
        total_batch = max(ner_total_batch, re_total_batch)
        min_batch = min(ner_total_batch, re_total_batch)
        # interleave NER and RE updates within one epoch
        for batch_id in range(total_batch):
            if batch_id < ner_total_batch:
                start = batch_id * batch_size
                end = (batch_id + 1) * batch_size
                if end > ner_train_num:
                    end = ner_train_num
                instance = data.train_Ids[start:end]
                batch_word, batch_features, batch_wordlen, batch_wordrecover, \
                    batch_char, batch_charlen, batch_charrecover, batch_label, mask, \
                    batch_permute_label = batchify_with_label(instance, data.HP_gpu)
                hidden = seq_wordseq.forward(batch_word, batch_features, batch_wordlen,
                                             batch_char, batch_charlen,
                                             batch_charrecover, None, None)
                hidden_adv = None
                loss, tag_seq = seq_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_label, mask)
                loss.backward()
                seq_optimizer.step()
                seq_wordseq.zero_grad()
                seq_model.zero_grad()
            if batch_id < re_total_batch:
                [batch_word, batch_features, batch_wordlen, batch_wordrecover,
                 batch_char, batch_charlen, batch_charrecover,
                 position1_seq_tensor, position2_seq_tensor, e1_token, e1_length,
                 e2_token, e2_length, e1_type, e2_type, tok_num_betw, et_num], \
                    [targets, targets_permute] = my_utils.endless_get_next_batch_without_rebatch1(
                        re_train_loader, re_train_iter)
                hidden = classify_wordseq.forward(batch_word, batch_features, batch_wordlen,
                                                  batch_char, batch_charlen, batch_charrecover,
                                                  position1_seq_tensor, position2_seq_tensor)
                hidden_adv = None
                loss, pred = classify_model.neg_log_likelihood_loss(
                    hidden, hidden_adv, batch_wordlen, e1_token, e1_length,
                    e2_token, e2_length, e1_type, e2_type, tok_num_betw,
                    et_num, targets)
                loss.backward()
                classify_optimizer.step()
                classify_wordseq.zero_grad()
                classify_model.zero_grad()
        epoch_finish = time.time()
        print("epoch: %s training finished. Time: %.2fs" % (idx, epoch_finish - epoch_start))
        # _, _, _, _, f, _, _ = ner.evaluate(data, seq_wordseq, seq_model, "test")
        ner_score = ner.evaluate1(data, seq_wordseq, seq_model, "test")
        print("ner evaluate: f: %.4f" % (ner_score))
        re_score = relation_extraction.evaluate(classify_wordseq, classify_model, re_test_loader)
        print("re evaluate: f: %.4f" % (re_score))
        if ner_score + re_score > best_ner_score + best_re_score:
            print("new best score: ner: %.4f , re: %.4f" % (ner_score, re_score))
            best_ner_score = ner_score
            best_re_score = re_score
            torch.save(seq_wordseq.state_dict(), os.path.join(ner_dir, 'wordseq.pkl'))
            torch.save(seq_model.state_dict(), os.path.join(ner_dir, 'model.pkl'))
            torch.save(classify_wordseq.state_dict(), os.path.join(re_dir, 'wordseq.pkl'))
            torch.save(classify_model.state_dict(), os.path.join(re_dir, 'model.pkl'))
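# my_utils.freeze_net() above (and the freeze_net/unfreeze_net methods used on
# WordSequence later in this file) are defined elsewhere. A minimal sketch of
# what such helpers usually do, assuming they only need to stop or restore
# gradient flow through a module -- the names here are illustrative, not the
# actual my_utils API:
def freeze_net_sketch(module):
    """Disable gradient updates for every parameter of `module`."""
    for p in module.parameters():
        p.requires_grad = False

def unfreeze_net_sketch(module):
    """Re-enable gradient updates for every parameter of `module`."""
    for p in module.parameters():
        p.requires_grad = True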
def train(data): print "Training model..." data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) model = SeqModel(data) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(0) best_dev = -10 for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 sample_loss = {idtask: 0 for idtask in range(data.HP_tasks)} right_token = {idtask: 0 for idtask in range(data.HP_tasks)} whole_token = {idtask: 0 for idtask in range(data.HP_tasks)} random.shuffle(data.train_Ids) model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, inference=False) instance_count += 1 loss, losses, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, inference=False) for idtask in range(data.HP_tasks): right, whole = predict_check(tag_seq[idtask], batch_label[idtask], mask) sample_loss[idtask] += losses[idtask].data[0] right_token[idtask] += right whole_token[idtask] += whole if end % 500 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print( " Instance: %s; Task %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, idtask, temp_cost, sample_loss[idtask], right_token[idtask], whole_token[idtask], (right_token[idtask] + 0.) / whole_token[idtask])) if sample_loss[idtask] > 1e8 or str(sample_loss) == "nan": print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." exit(0) sys.stdout.flush() sample_loss[idtask] = 0 if end % 500 == 0: print "--------------------------------------------------------------------------" total_loss += loss.data[0] loss.backward() optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start for idtask in range(data.HP_tasks): print( " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss[idtask], right_token[idtask], whole_token[idtask], (right_token[idtask] + 0.) / whole_token[idtask])) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. 
Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) print "totalloss:", total_loss if total_loss > 1e8 or str(total_loss) == "nan": print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." exit(0) summary = evaluate(data, model, "dev", False, False) dev_finish = time.time() dev_cost = dev_finish - epoch_finish current_scores = [] for idtask in xrange(0, data.HP_tasks): speed, acc, p, r, f, pred_labels, _ = summary[idtask] if data.seg: current_score = f current_scores.append(f) print( "Task %d Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (idtask, dev_cost, speed, acc, p, r, f)) else: current_score = acc current_scores.append(acc) print("Task %d Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (idtask, dev_cost, speed, acc)) pred_results_tasks = [] pred_scores_tasks = [] for idtask in xrange(data.HP_tasks): speed, acc, p, r, f, pred_results, pred_scores = summary[idtask] pred_results_tasks.append(pred_results) pred_scores_tasks.append(pred_scores_tasks) # EVALUATING ON DEV SET FOR CHOOSING THE BEST MODEL # MULTITASK LEARNING OF BOTH CONSTITUENCY AND DEPENDENCY PARSING if data.dependency_parsing and data.constituency_parsing: # CONSTITUENCY PARSING with tempfile.NamedTemporaryFile() as f_decode_mt: with tempfile.NamedTemporaryFile() as f_decode_st: if len(data.index_of_main_tasks) > 1: data.decode_dir = f_decode_mt.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') # transform between @ and {} rebuild.rebuild_tree(data.decode_dir, decoded_st_dir) else: if data.decode_dir is None: data.decode_dir = f_decode_st.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') # evaluate the output comparing to the gold command = [ "PYTHONPATH=" + data.cons2label, "python", data.evaluate, " --input ", decoded_st_dir, " --gold ", data.gold_dev_cons, " --evalb ", data.evalb, ">", f_decode_mt.name ] os.system(" ".join(command)) current_score_cons = float([ l for l in f_decode_mt.read().split("\n") if l.startswith("Bracketing FMeasure") ][0].split("=")[1]) print(current_score_cons) # DEPENDENCY PARSING with tempfile.NamedTemporaryFile() as f_decode_mt: with tempfile.NamedTemporaryFile() as f_decode_st: if len(data.index_of_main_tasks) > 1: data.decode_dir = f_decode_mt.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') else: print("else") if data.decode_dir is None: data.decode_dir = f_decode_st.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') output_nn = open(data.decode_dir) tmp = tempfile.NamedTemporaryFile().name decodeDependencies.decode(output_nn, tmp, data.language) current_score_depen = float( decodeDependencies.evaluateDependencies( data.gold_dev_dep, tmp)) print(current_score_depen) # SINGLE OR MULTITASK CONSTITUENCY PARSING elif data.constituency_parsing: with tempfile.NamedTemporaryFile() as f_decode_mt: with tempfile.NamedTemporaryFile() as f_decode_st: if len(data.index_of_main_tasks) > 1: data.decode_dir = f_decode_mt.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') # transform between @ and {} rebuild.rebuild_tree(data.decode_dir, decoded_st_dir) else: if data.decode_dir is None: data.decode_dir = f_decode_st.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') command = [ "PYTHONPATH=" + data.cons2label, "python", data.evaluate, " 
--input ", decoded_st_dir, " --gold ", data.gold_dev_cons, " --evalb ", data.evalb, ">", f_decode_mt.name ] os.system(" ".join(command)) current_score = float([ l for l in f_decode_mt.read().split("\n") if l.startswith("Bracketing FMeasure") ][0].split("=")[1]) print "Current Score (from EVALB)", current_score, "Previous best dev (from EVALB)", best_dev # SINGLE OR MULTITASK DEPENDENCY PARSING elif data.dependency_parsing: with tempfile.NamedTemporaryFile() as f_decode_mt: with tempfile.NamedTemporaryFile() as f_decode_st: # If we are learning multiple task we move it as a sequence # labeling if len(data.index_of_main_tasks) > 1: data.decode_dir = f_decode_mt.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') else: if data.decode_dir is None: data.decode_dir = f_decode_st.name decoded_st_dir = f_decode_st.name data.write_decoded_results(pred_results_tasks, 'dev') output_nn = open(data.decode_dir) tmp = tempfile.NamedTemporaryFile().name decodeDependencies.decode(output_nn, tmp, data.language) current_score = decodeDependencies.evaluateDependencies( data.gold_dev_dep, tmp) print "Current Score (from LAS)", current_score, "Previous best dev (from LAS)", best_dev else: current_score = sum(current_scores) / len(current_scores) print "Current Score", current_score, "Previous best dev", best_dev # SAVE THE BEST MODEL # by default save model with highest harmonic mean when parsing both # dependency and constituency trees if data.dependency_parsing and data.constituency_parsing: harmonic_mean = (2 * current_score_cons * current_score_depen) / \ (current_score_cons + current_score_depen) if harmonic_mean > best_dev: print("New harmonic mean " + repr(harmonic_mean)) print("Exceed previous best harmonic mean score: " + repr(best_dev) + " LAS " + repr(current_score_depen) + " F1 " + repr(current_score_cons)) model_name = data.model_dir + ".model" print "Overwritting model to", model_name torch.save(model.state_dict(), model_name) best_dev = harmonic_mean else: print("sofar the best " + repr(best_dev)) else: if current_score > best_dev: if data.seg: print "Exceed previous best f score:", best_dev else: print "Exceed previous best acc score:", best_dev model_name = data.model_dir + ".model" print "Overwritting model to", model_name torch.save(model.state_dict(), model_name) best_dev = current_score else: print("sofar the best " + repr(best_dev)) summary = evaluate(data, model, "test", False) test_finish = time.time() test_cost = test_finish - dev_finish for idtask in xrange(0, data.HP_tasks): speed, acc, p, r, f, _, _ = summary[idtask] if data.seg: current_score = f print( "Task %d Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (idtask, test_cost, speed, acc, p, r, f)) else: current_score = acc print("Task %d Test: time: %.2fs speed: %.2fst/s; acc: %.4f" % (idtask, test_cost, speed, acc)) gc.collect()
def train(data, model_file):
    print("Training model...")
    model = SeqModel(data)
    wordseq = WordSequence(data, False, True, data.use_char)
    if opt.self_adv == 'grad':
        wordseq_adv = WordSequence(data, False, True, data.use_char)
    elif opt.self_adv == 'label':
        wordseq_adv = WordSequence(data, False, True, data.use_char)
        model_adv = SeqModel(data)
    else:
        wordseq_adv = None
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr,
                              momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        if opt.self_adv == 'grad':
            iter_parameter = itertools.chain(
                *map(list, [wordseq.parameters(), wordseq_adv.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
        elif opt.self_adv == 'label':
            iter_parameter = itertools.chain(
                *map(list, [wordseq.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
            iter_parameter = itertools.chain(
                *map(list, [wordseq_adv.parameters(), model_adv.parameters()]))
            optimizer_adv = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
        else:
            iter_parameter = itertools.chain(
                *map(list, [wordseq.parameters(), model.parameters()]))
            optimizer = optim.Adam(iter_parameter, lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(0)
    best_dev = -10
    if not data.tune_wordemb:
        my_utils.freeze_net(wordseq.wordrep.word_embedding)
        if opt.self_adv != 'no':
            my_utils.freeze_net(wordseq_adv.wordrep.word_embedding)
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("epoch: %s/%s" % (idx, data.HP_iteration))
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set models in train mode
        wordseq.train()
        wordseq.zero_grad()
        if opt.self_adv == 'grad':
            wordseq_adv.train()
            wordseq_adv.zero_grad()
        elif opt.self_adv == 'label':
            wordseq_adv.train()
            wordseq_adv.zero_grad()
            model_adv.train()
            model_adv.zero_grad()
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, \
                batch_char, batch_charlen, batch_charrecover, batch_label, mask, \
                batch_permute_label = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            if opt.self_adv == 'grad':
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen,
                                         batch_char, batch_charlen, batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen,
                                                 batch_char, batch_charlen, batch_charrecover, None, None)
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                my_utils.reverse_grad(wordseq_adv)
                optimizer.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model.zero_grad()
            elif opt.self_adv == 'label':
                # step 1: update the main encoder/tagger with the adversarial encoder frozen
                wordseq.unfreeze_net()
                wordseq_adv.freeze_net()
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen,
                                         batch_char, batch_charlen, batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen,
                                                 batch_char, batch_charlen, batch_charrecover, None, None)
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                optimizer.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model.zero_grad()
                # step 2: update the adversarial encoder/tagger on permuted labels
                wordseq.freeze_net()
                wordseq_adv.unfreeze_net()
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen,
                                         batch_char, batch_charlen, batch_charrecover, None, None)
                hidden_adv = wordseq_adv.forward(batch_word, batch_features, batch_wordlen,
                                                 batch_char, batch_charlen, batch_charrecover, None, None)
                loss_adv, _ = model_adv.neg_log_likelihood_loss(hidden, hidden_adv,
                                                                batch_permute_label, mask)
                loss_adv.backward()
                optimizer_adv.step()
                wordseq.zero_grad()
                wordseq_adv.zero_grad()
                model_adv.zero_grad()
            else:
                hidden = wordseq.forward(batch_word, batch_features, batch_wordlen,
                                         batch_char, batch_charlen, batch_charrecover, None, None)
                hidden_adv = None
                loss, tag_seq = model.neg_log_likelihood_loss(hidden, hidden_adv, batch_label, mask)
                loss.backward()
                optimizer.step()
                wordseq.zero_grad()
                model.zero_grad()
            # right, whole = predict_check(tag_seq, batch_label, mask)
            # right_token += right
            # whole_token += whole
            sample_loss += loss.item()
            total_loss += loss.item()
            if end % 500 == 0:
                # temp_time = time.time()
                # temp_cost = temp_time - temp_start
                # temp_start = temp_time
                # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (
                #     end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                    exit(0)
                sys.stdout.flush()
                sample_loss = 0
        # temp_time = time.time()
        # temp_cost = temp_time - temp_start
        # print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (
        #     end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
              % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(0)
        # continue
        # NOTE: this evaluates the "test" split but reports and selects on it as
        # if it were dev; with a proper dev set the argument should be "dev"
        speed, acc, p, r, f, _, _ = evaluate(data, wordseq, model, "test")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                  % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc))
        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            torch.save(wordseq.state_dict(), os.path.join(model_file, 'wordseq.pkl'))
            if opt.self_adv == 'grad':
                torch.save(wordseq_adv.state_dict(), os.path.join(model_file, 'wordseq_adv.pkl'))
            elif opt.self_adv == 'label':
                torch.save(wordseq_adv.state_dict(), os.path.join(model_file, 'wordseq_adv.pkl'))
                torch.save(model_adv.state_dict(), os.path.join(model_file, 'model_adv.pkl'))
            model_name = os.path.join(model_file, 'model.pkl')
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        gc.collect()
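# my_utils.reverse_grad() above is the gradient-reversal step of the 'grad'
# self-adversarial mode; its definition is not in this file. A minimal sketch,
# assuming it simply flips the sign of the accumulated gradients so the
# adversarial encoder is pushed in the opposite direction of the task loss
# (illustrative name, not the actual my_utils API):
def reverse_grad_sketch(module):
    for p in module.parameters():
        if p.grad is not None:
            p.grad = -p.grad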
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir +".dset" data.save(save_data_name) model = SeqModel(data) loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s"%(data.optimizer)) exit(0) best_dev = -10 # data.HP_iteration = 1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" %(idx,data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole sample_loss += loss.data[0] total_loss += loss.data[0] if end%500 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) sys.stdout.flush() sample_loss = 0 loss.backward() optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print("Epoch: %s training finished. 
Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss)) # continue speed, acc, p, r, f, _,_ = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f)) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc)) if current_score > best_dev: if data.seg: print("Exceed previous best f score:", best_dev) else: print("Exceed previous best acc score:", best_dev) model_name = data.model_dir +'.'+ str(idx) + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score # ## decode test speed, acc, p, r, f, _,_ = evaluate(data, model, "test") test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f)) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc)) gc.collect()
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + "/data.dset" if data.save_model: data.save(save_data_name) batch_size = data.HP_batch_size train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 model = SeqModel(data) pytorch_total_params = sum(p.numel() for p in model.parameters()) print(model) print("pytorch total params: %d" % pytorch_total_params) ## model 1 optimizer lr_detail1 = [ { "params": filter(lambda p: p.requires_grad, model.mcmodel.parameters()), "lr": data.HP_lr }, ] if data.optimizer.lower() == "sgd": optimizer = optim.SGD(lr_detail1, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(lr_detail1, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(lr_detail1, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(lr_detail1, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(lr_detail1, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) ## model 2 optimizer optimizer2 = AdamW(model.get_m2_params(), lr=data.HP_lr2, weight_decay=data.HP_l2) t_total = total_batch * data.HP_iteration warmup_step = int(data.warmup_step * t_total) scheduler2 = WarmupLinearSchedule(optimizer2, warmup_step, t_total) best_dev = -10 best_test = -10 max_test = -10 max_test_epoch = -1 max_dev_epoch = -1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() print("\n ###### Epoch: %s/%s ######" % (idx, data.HP_iteration)) # print (self.train_Ids) if data.optimizer.lower() == "sgd": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) sample_loss = 0 total_loss = 0 random.shuffle(data.train_Ids) model.train() model.zero_grad() for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True) loss, tag_seqs = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) sample_loss += loss.item() total_loss += loss.item() if end % 500 == 0: if sample_loss > 1e8 or str(sample_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." ) exit(1) sys.stdout.flush() sample_loss = 0 loss.backward() clip_grad_norm_(model.parameters(), data.clip_grad) optimizer.step() optimizer2.step() scheduler2.step() model.zero_grad() epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. Time: %.2f s, speed: %.2f doc/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) if total_loss > 1e8 or str(total_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." 
) exit(1) # dev dev_score, _ = evaluate(data, model, "dev") # test test_score, _ = evaluate(data, model, "test") if max_test < test_score: max_test_epoch = idx max_test = max(test_score, max_test) if dev_score > best_dev: print("Exceed previous best dev score") best_test = test_score best_dev = dev_score max_dev_epoch = idx if data.save_model: model_name = data.model_dir + "/best_model.ckpt" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) print( "Score summary: max dev (%d): %.4f, test: %.4f; max test (%d): %.4f" % (max_dev_epoch, best_dev, best_test, max_test_epoch, max_test)) gc.collect()
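# WarmupLinearSchedule above comes from the older pytorch-transformers
# package. If that dependency is unavailable, an equivalent schedule can be
# sketched with torch.optim.lr_scheduler.LambdaLR -- linear warmup from 0 over
# `warmup_step` steps, then linear decay to 0 at `t_total` (illustrative, not
# the exact upstream class):
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear_schedule_sketch(optimizer, warmup_step, t_total):
    def lr_lambda(step):
        if step < warmup_step:
            return float(step) / float(max(1, warmup_step))
        return max(0.0, float(t_total - step) / float(max(1, t_total - warmup_step)))
    return LambdaLR(optimizer, lr_lambda)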
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" # 存储data数据 data.save(save_data_name) model = SeqModel(data) # check to load pretrained model if data.use_crf: pretrain_model_path = os.path.join('model_snapshot', 'lan_crf.model') else: pretrain_model_path = os.path.join('model_snapshot', 'lan.model') if data.use_pre_trained_model and os.path.exists(pretrain_model_path): model.load_state_dict(torch.load(pretrain_model_path)) print("load pretrained model success:%s" % pretrain_model_path) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("--------pytorch total params--------") print(pytorch_total_params) optimizer = None if data.optimizer.lower() == "sgd": optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev = -10 best_test = -10 no_imprv_epoch = 0 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) # print (self.train_Ids) # every 5 epoch decay learning rate if idx % 5 == 0: optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 total_loss = 0 ## set model in train model model.train() model.zero_grad() start = 0 end = start + data.HP_batch_size train_epochs = [] while end <= len(data.train_Ids): train_epochs.append((start, end)) start = end end = end + data.HP_batch_size if end > len(data.train_Ids) > start: train_epochs.append((start, len(data.train_Ids))) for sample_id, (start, end) in enumerate(train_epochs): instance = data.train_Ids[start: end] sample_loss = 0 batch_word, batch_word_len, _, batch_word_recover, batch_label, mask, input_label_seq_tensor = batchify_with_label( instance, data.HP_gpu, data) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_word_len, batch_label, mask, input_label_seq_tensor) sample_loss += loss.item() total_loss += loss.item() print("Epoch:%s,no_imprv_epoch:%s,Instance: %s" % ( idx, no_imprv_epoch, sample_id)) right, whole = predict_check(tag_seq, batch_label, mask, data.use_crf) print(" loss: %.4f, acc: %s/%s=%.4f" % ( loss.item(), right, whole, (right + 0.) / whole * 100)) if sample_loss > 1e8 or str(sample_loss) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....") exit(1) sys.stdout.flush() loss.backward() if data.whether_clip_grad: nn.utils.clip_grad_norm_(model.parameters(), data.clip_grad) optimizer.step() model.zero_grad() # break epoch_finish = time.time() if total_loss > 1e8 or str(total_loss) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! 
EXIT....") exit(1) speed, acc, report, f_value, \ ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f_value # current_score = sent_f1 print("Dev: time: %.2fs, speed: %.2fst/s;\n" "acc: %.4f, f_value: %.4f\n" "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n" "current f1:%.4f" % ( dev_cost, speed, acc, f_value, ner_acc, ner_p, ner_r, ner_f, current_score )) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % ( dev_cost, speed, acc)) # ## decode test speed, acc, report, f_value, \ ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "test") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: print("Test: time: %.2fs, speed: %.2fst/s;\n" "acc: %.4f, f_value: %.4f\n" "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n" "current f1:%.4f" % ( dev_cost, speed, acc, f_value, ner_acc, ner_p, ner_r, ner_f, current_score )) else: print("Test: time: %.2fs speed: %.2fst/s; acc: %.4f" % ( dev_cost, speed, acc)) if current_score > best_dev: if data.seg: best_test = f_value # best_test = sent_f1 print("Exceed previous best avg f score:", best_dev) else: best_test = acc print("Exceed previous best acc score:", best_dev) if data.use_crf: result_file = "result_crf.txt" model_name = data.model_dir + "_crf.model" else: result_file = "result.txt" model_name = data.model_dir + ".model" with open(result_file, 'w', encoding='utf-8') as w: w.write( "Save current best model in file:%s, iteration:%s/%s, best_test_f_score:%.5f\n" "ner:\n" " precision:%.5f, recall:%.5f, f1_score:%.5f\n" "%s\n\n" % ( model_name, idx, data.HP_iteration, best_test, ner_p, ner_r, ner_f, report)) print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score no_imprv_epoch = 0 else: # early stop no_imprv_epoch += 1 if no_imprv_epoch >= 10: print("early stop") print("Current best f score in dev", best_dev) print("Current best f score in test", best_test) break if data.seg: print("Current best f score in dev", best_dev) print("Current best f score in test", best_test) else: print("Current best acc score in dev", best_dev) print("Current best acc score in test", best_test) gc.collect()