import os
import time

import fitlog
import torch

# preprocess, load_data, JointEncoderModel, Optim, batch_filter, batch_spliter,
# eval_model, write_joint_data and postprocess are project-local helpers
# defined elsewhere in this repo.


def main():
    # ====== preprocess ====== #
    args = preprocess()
    # ====== Loading dataset ====== #
    train_data, dev_data, test_data, joint_vocabs, parsing_vocabs = load_data(
        args.joint_input, args.parsing_input, args.batch_size,
        args.accum_steps, args.shuffle, args.num_workers, args.drop_last)
    # cross_labels_idx = generate_cross_labels_idx(vocabs['labels'])

    # ======= Preparing Model ======= #
    print("\nModel Preparing starts...")
    model = JointEncoderModel(
        joint_vocabs,
        parsing_vocabs,
        # cross_labels_idx,
        # Embedding
        args.subword, args.use_pos_tag, args.bert_path, args.transliterate,
        args.d_model, args.partition, args.pos_tag_emb_dropout,
        args.position_emb_dropout, args.bert_emb_dropout, args.emb_dropout,
        # Encoder
        args.layer_num, args.hidden_dropout, args.attention_dropout,
        args.dim_ff, args.nhead, args.kqv_dim,
        # classifier
        args.label_hidden,
        # loss
        args.lambda_scaler, args.alpha_scaler,
        args.language, args.device).cuda()
    # print(model, end='\n\n\n')
    optimizer = Optim(model, args.optim, args.lr, args.lr_fine_tune,
                      args.warmup_steps, args.lr_decay_factor,
                      args.weight_decay, args.clip_grad,
                      args.clip_grad_max_norm)
    optimizer.zero_grad()
    # if args.freeze_bert:
    #     optimizer.set_freeze_by_idxs(
    #         [str(num) for num in range(0, config.freeze_bert_layers)], True)
    #     optimizer.free_embeddings()
    #     optimizer.freeze_pooler()
    #     print('freeze model of BERT %d layers' % config.freeze_bert_layers)

    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    steps, loss_value, total_batch_size = 1, 0., 0
    best_dev, best_test = None, None
    patience = args.patience
    for epoch_i in range(1, args.epoch + 1):
        for batch_i, insts in enumerate(train_data, start=1):
            model.train()
            # Drop over-long sentences, then split the batch into chunks
            # that fit in GPU memory.
            insts, batch_size, max_len = batch_filter(
                insts, args.language, args.DATASET_MAX_SNT_LENGTH)
            insts_list = batch_spliter(insts, max_len,
                                       args.BATCH_MAX_SNT_LENGTH)
            total_batch_size += batch_size
            for insts in insts_list:
                loss = model(insts)
                if loss.item() > 0.:
                    loss.backward()
                    loss_value += loss.item()
            assert not isinstance(loss_value, torch.Tensor), 'GPU memory leak'
            if batch_i == args.accum_steps and not args.debug:
                args.visual_logger.visual_histogram(
                    model, steps // args.accum_steps)
            if steps % args.accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            if steps % (args.accum_steps * args.log_interval) == 0:
                print('[%d/%d], [%d/%d] Loss: %.05f' %
                      (epoch_i, args.epoch, batch_i // args.accum_steps,
                       len(train_data) // args.accum_steps,
                       loss_value / total_batch_size),
                      flush=True)
                visual_dic = {
                    'loss/train': loss_value,
                    'lr': optimizer.get_lr()[0]
                }
                if args.clip_grad:
                    visual_dic['norm'] = optimizer.get_dynamic_gard_norm()
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                loss_value, total_batch_size = 0., 0
                torch.cuda.empty_cache()
            if steps % (args.accum_steps * args.eval_interval) == 0:
                print('model evaluating starts...', flush=True)
                joint_fscore_dev, res_data_dev = eval_model(
                    model, dev_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'dev')
                joint_fscore_test, res_data_test = eval_model(
                    model, test_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'test')
                visual_dic = {
                    'F/parsing_dev': joint_fscore_dev.parsing_f,
                    'F/parsing_test': joint_fscore_test.parsing_f,
                    'F/ner_dev': joint_fscore_dev.ner_f,
                    'F/ner_test': joint_fscore_test.ner_f
                }
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                if best_dev is None or \
                        joint_fscore_dev.parsing_f > best_dev.parsing_f:
                    best_dev, best_test = joint_fscore_dev, joint_fscore_test
                    fitlog.add_best_metric({
                        'parsing_f_dev': best_dev.parsing_f,
                        'ner_f_test': best_test.ner_f
                    })
                    patience = args.patience
                    write_joint_data(args.save_path, res_data_dev, 'dev')
                    write_joint_data(args.save_path, res_data_test, 'test')
                    if args.save:
                        torch.save(
                            model.pack_state_dict(),
                            os.path.join(args.save_path,
                                         args.name + '.best.model.pt'))
                print('best performance:\ndev: %s\ntest: %s' %
                      (best_dev, best_test))
                print('model evaluating ends...', flush=True)
                del res_data_dev, res_data_test
                if args.debug:
                    exit(0)
            steps += 1
        if args.early_stop:
            patience -= 1
            if patience < 0:
                print('early stop')
                break

    # ====== postprocess ====== #
    postprocess(args, start)
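
# --------------------------------------------------------------------------- #
# A minimal sketch of the gradient-accumulation pattern the loop above relies
# on: backward() runs per micro-batch, while step()/zero_grad() fire once every
# accum_steps. The linear model, random data, and plain SGD are stand-ins; the
# real script wraps all of this in its project-local Optim class and handles
# loss normalization itself.
def grad_accumulation_sketch(accum_steps=4, micro_batches=16):
    model = torch.nn.Linear(8, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    optimizer.zero_grad()
    for step in range(1, micro_batches + 1):
        x, y = torch.randn(3, 8), torch.randint(0, 2, (3,))
        loss = torch.nn.functional.cross_entropy(model(x), y)
        # Dividing by accum_steps keeps the effective update comparable to
        # one large batch; gradients accumulate across micro-batches.
        (loss / accum_steps).backward()
        if step % accum_steps == 0:
            optimizer.step()  # one parameter update per accum_steps batches
            optimizer.zero_grad()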
import os
import sys
import time

import torch

# parse_args, load_data, load_pretrained_embeddings, ParaNNTranSegmentor,
# cal_preformance, eval_model, Optim and VisualLogger are project-local
# helpers defined elsewhere in this repo.


def main():
    config = parse_args()
    # ========= Loading Dataset ========= #
    print(config)
    print("Loading dataset starts...")
    train_data, dev_data, test_data, train_dataset = load_data(config)
    print('\n\n', end='')

    # ========= Preparing Model ========= #
    print("Preparing Model starts...")
    if config.use_cuda and torch.cuda.is_available():
        config.device = torch.device('cuda:' + str(config.cuda_id))
        print('The model will be trained on cuda: %d.\n' %
              config.device.index)
    else:
        config.device = torch.device('cpu')
        print('GPU is not available; using CPU by default.\n')
    pretra_char_embed, pretra_bichar_embed = load_pretrained_embeddings(
        train_dataset, config)
    model = ParaNNTranSegmentor(
        pretra_char_embed, train_dataset.get_char_vocab_size(),
        config.char_embed_dim, config.char_embed_dim_no_static,
        config.char_embed_max_norm, pretra_bichar_embed,
        train_dataset.get_bichar_vocab_size(), config.bichar_embed_dim,
        config.bichar_embed_dim_no_static, config.bichar_embed_max_norm,
        config.dropout_embed, config.encoder_embed_dim,
        config.dropout_encoder_embed, config.encoder_lstm_hid_size,
        config.dropout_encoder_hid, config.subword_lstm_hid_size,
        config.word_lstm_hid_size, config.device)
    if config.use_cuda and torch.cuda.is_available():
        model.to(config.device)
    print(model, end='\n\n\n')
    criterion = torch.nn.CrossEntropyLoss(reduction='sum').to(config.device)
    optimizer = Optim(config.opti_name, config.learning_rate,
                      config.weight_decay, model, config)
    visual_logger = VisualLogger(config.visual_logger_path)

    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    total_loss, golds_words, pred_words, seg_words, chars, cor_chars, steps = \
        0.0, 0, 0, 0, 0, 0, 1
    best_perf = [0, 0, 0., 0.]  # (epoch_idx, batch_idx, F_dev, F_test)
    for epoch_i in range(config.epoch):
        for batch_i, (insts, golds) in enumerate(train_data):
            insts = list(map(lambda x: x.to(config.device), insts))
            golds = golds.to(config.device)
            model.train()
            optimizer.zero_grad()
            pred = model(insts, golds)
            loss, golds_word, pred_word, seg_word, char, cor_char = \
                cal_preformance(pred, golds, criterion, config.device)
            total_loss += loss.item()
            golds_words += golds_word
            pred_words += pred_word
            seg_words += seg_word
            chars += char
            cor_chars += cor_char
            loss.backward()
            optimizer.step()
            if steps % config.logInterval == 0:
                avg_loss = total_loss / chars
                P = seg_words / pred_words
                R = seg_words / golds_words
                F = (2 * P * R) / (P + R)
                print('[%d/%d], [%d/%d] Loss: %.05f, F: %.05f, P: %.05f, '
                      'R: %.05f' % (epoch_i + 1, config.epoch, batch_i + 1,
                                    len(train_data), avg_loss, F, P, R))
                sys.stdout.flush()
                scal = {
                    'Loss': avg_loss,
                    'F': F,
                    'P': P,
                    'R': R,
                    'lr': optimizer.get_lr()[0]
                }
                visual_logger.visual_scalars(scal, steps, 'train')
                total_loss, golds_words, pred_words, seg_words, chars, cor_chars = \
                    0.0, 0, 0, 0, 0, 0
            if steps % config.valInterval == 0:
                F_dev, F_test = eval_model(model, criterion, dev_data,
                                           test_data, config.device,
                                           visual_logger, steps)
                if F_dev > best_perf[2]:
                    best_perf = [epoch_i + 1, batch_i + 1, F_dev, F_test]
                print('best performance: [%d/%d], [%d/%d], F_dev: %.05f, '
                      'F_test: %.05f.' % (best_perf[0], config.epoch,
                                          best_perf[1], len(train_data),
                                          best_perf[2], best_perf[3]))
                sys.stdout.flush()
            if steps % config.visuParaInterval == 1:
                visual_logger.visual_histogram(model, steps)
            if steps % config.saveInterval == 0:
                if not os.path.exists(config.save_path):
                    os.mkdir(config.save_path)
                filename = '%d.model' % steps
                modelpath = os.path.join(config.save_path, filename)
                torch.save(model, modelpath)
            steps += 1

    exe_time = time.time() - start
    print('Execution time: %dh:%dm:%ds.' %
          (exe_time / 3600, (exe_time / 60) % 60, exe_time % 60))
    visual_logger.close()
    print('Training ends.')
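
# --------------------------------------------------------------------------- #
# The log above derives word-level P/R/F from three running counts. Assuming
# cal_preformance returns (loss, #gold words, #predicted words, #correctly
# segmented words, #chars, #correct chars) -- which matches how the counts
# are accumulated and consumed -- the metric reduces to this sketch:
def word_prf(seg_words, pred_words, golds_words):
    P = seg_words / pred_words   # correct words / predicted words
    R = seg_words / golds_words  # correct words / gold words
    F = 2 * P * R / (P + R)      # harmonic mean of P and R
    return P, R, F

# e.g. 95 correct words out of 100 predicted and 98 gold:
# word_prf(95, 100, 98) -> (0.95, 0.9694, 0.9596)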
import os
import sys
import time

import torch

# parse_args, set_seed, load_data, Bert3Gram, cal_preformance, eval_model,
# Optim and VisualLogger are project-local helpers defined elsewhere in
# this repo.


def main():
    config = parse_args()
    set_seed(config.seed)
    # ========= Loading Dataset ========= #
    print(config)
    print("Loading dataset starts...")
    train_data, dev_data, test_data, train_dataset = load_data(config)
    print('\n\n', end='')

    # ========= Preparing Model ========= #
    print("Preparing Model starts...")
    if config.use_cuda and torch.cuda.is_available():
        config.device = torch.device('cuda:' + str(config.cuda_id))
        print('The model will be trained on cuda: %d.\n' %
              config.device.index)
    else:
        config.device = torch.device('cpu')
        print('GPU is not available; using CPU by default.\n')
    model = Bert3Gram(config.device, config.cache_3gram_path)
    if config.use_cuda and torch.cuda.is_available():
        model.to(config.device)
    print(model, end='\n\n\n')
    criterion = torch.nn.CrossEntropyLoss(reduction='sum').to(config.device)
    optimizer = Optim(model, config)
    visual_logger = VisualLogger(config.visual_logger_path)

    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    total_loss, golds_words, pred_words, seg_words, chars, cor_chars, steps = \
        0.0, 0, 0, 0, 0, 0, 1
    best_perf = [0, 0, 0., 0.]  # (epoch_idx, batch_idx, F_dev, F_test)
    if config.freeze_bert:
        optimizer.set_freeze_by_idxs(
            [str(num) for num in range(0, config.freeze_bert_layers)], True)
        optimizer.free_embeddings()
        optimizer.freeze_pooler()
    for epoch_i in range(config.epoch):
        for batch_i, (insts, golds) in enumerate(train_data):
            golds = golds.to(config.device)
            model.train()
            pred = model(insts, golds)
            loss, golds_word, pred_word, seg_word, char, cor_char = \
                cal_preformance(pred, golds, criterion, config.device)
            total_loss += loss.item()
            golds_words += golds_word
            pred_words += pred_word
            seg_words += seg_word
            chars += char
            cor_chars += cor_char
            loss.backward()
            if steps % config.accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                torch.cuda.empty_cache()
            if steps % config.logInterval == 0:
                avg_loss = total_loss / chars
                P = seg_words / pred_words
                R = seg_words / golds_words
                F = (2 * P * R) / (P + R)
                print('[%d/%d], [%d/%d] Loss: %.05f, F: %.05f, P: %.05f, '
                      'R: %.05f' % (epoch_i + 1, config.epoch, batch_i + 1,
                                    len(train_data), avg_loss, F, P, R))
                sys.stdout.flush()
                scal = {
                    'Loss': avg_loss,
                    'F': F,
                    'P': P,
                    'R': R,
                    'lr': optimizer.get_lr()[0]
                }
                visual_logger.visual_scalars(scal, steps, 'train')
                total_loss, golds_words, pred_words, seg_words, chars, cor_chars = \
                    0.0, 0, 0, 0, 0, 0
            if steps % config.valInterval == 0:
                F_dev, F_test = eval_model(model, criterion, dev_data,
                                           test_data, config.device,
                                           visual_logger, steps)
                if F_dev > best_perf[2]:
                    best_perf = [epoch_i + 1, batch_i + 1, F_dev, F_test]
                print('best performance: [%d/%d], [%d/%d], F_dev: %.05f, '
                      'F_test: %.05f.' % (best_perf[0], config.epoch,
                                          best_perf[1], len(train_data),
                                          best_perf[2], best_perf[3]))
                sys.stdout.flush()
                optimizer.zero_grad()
                torch.cuda.empty_cache()
                # torch.save(model.pack_state_dict(),
                #            os.path.join(config.save_path, 'cnn.pt'))
            if steps % config.visuParaInterval == 1:
                visual_logger.visual_histogram(model, steps)
            if steps % config.saveInterval == 0:
                if not os.path.exists(config.save_path):
                    os.mkdir(config.save_path)
                filename = '%d.model' % steps
                modelpath = os.path.join(config.save_path, filename)
                torch.save(model, modelpath)
            steps += 1

    exe_time = time.time() - start
    print('Execution time: %dh:%dm:%ds.' %
          (exe_time / 3600, (exe_time / 60) % 60, exe_time % 60))
    visual_logger.close()
    print('Training ends.')
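
# --------------------------------------------------------------------------- #
# set_freeze_by_idxs / free_embeddings / freeze_pooler are project-local Optim
# methods. A plausible minimal equivalent, assuming HuggingFace-style parameter
# names ('...encoder.layer.3...', '...pooler...'): freeze the first num_layers
# encoder blocks and the pooler while leaving the embeddings trainable.
def freeze_bert_layers_sketch(model, num_layers):
    frozen_prefixes = tuple('encoder.layer.%d.' % i
                            for i in range(num_layers))
    for name, param in model.named_parameters():
        if any(key in name for key in frozen_prefixes) or 'pooler' in name:
            param.requires_grad_(False)  # excluded from gradient updates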
import os
import time

import torch

# preprocess, load_data, PretrainModel, Optim, batch_filter, batch_spliter,
# eval_model and postprocess are project-local helpers defined elsewhere in
# this repo.


def main():
    # ====== preprocess ====== #
    args = preprocess()
    # ====== Loading dataset ====== #
    train_data, dev_data, subtree_vocab, token_vocab = load_data(
        args.input, args.batch_size, args.language, args.subword, args.debug)

    # ======= Preparing Model ======= #
    print("\nModel Preparing starts...")
    model = PretrainModel(
        subtree_vocab,
        token_vocab,
        # Embedding
        args.subword, args.bert, args.transliterate, args.d_model,
        args.partition, args.position_emb_dropout, args.bert_emb_dropout,
        args.emb_dropout,
        # Encoder
        args.layer_num, args.hidden_dropout, args.attention_dropout,
        args.dim_ff, args.nhead, args.kqv_dim,
        # classifier
        args.label_hidden,
        args.language, args.device).cuda()
    # print(model, end='\n\n\n')
    optimizer = Optim(model, args.optim, args.lr, args.lr_fine_tune,
                      args.warmup_steps, args.lr_decay_factor,
                      args.weight_decay, args.clip_grad,
                      args.clip_grad_max_norm)
    optimizer.zero_grad()
    # if args.freeze_bert:
    #     optimizer.set_freeze_by_idxs(
    #         [str(num) for num in range(0, config.freeze_bert_layers)], True)
    #     optimizer.free_embeddings()
    #     optimizer.freeze_pooler()
    #     print('freeze model of BERT %d layers' % config.freeze_bert_layers)

    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    steps, loss_value, total_batch_size = 1, 0., 0
    best_dev = 0.
    patience = args.patience
    total_subtree, tp_subtree = 0, 0
    total_head, tp_head = 0, 0
    total_mask_lm, tp_mask_lm = 0, 0
    for epoch_i in range(1, args.epoch + 1):
        for batch_i, insts in enumerate(train_data, start=1):
            model.train()
            insts, batch_size, max_len = batch_filter(
                insts, args.DATASET_MAX_SNT_LENGTH)
            insts_list = batch_spliter(insts, max_len,
                                       args.BATCH_MAX_SNT_LENGTH)
            total_batch_size += batch_size
            for insts in insts_list:
                # The model returns the loss plus total/true-positive counts
                # for the subtree, head, and masked-LM objectives.
                loss, b_s, b_s_tp, b_h, b_h_tp, b_m, b_m_tp = model(insts)
                total_subtree += b_s
                tp_subtree += b_s_tp
                total_head += b_h
                tp_head += b_h_tp
                total_mask_lm += b_m
                tp_mask_lm += b_m_tp
                if loss.item() > 0.:
                    loss.backward()
                    loss_value += loss.item()
            assert not isinstance(loss_value, torch.Tensor), 'GPU memory leak'
            if steps % args.accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            if steps % (args.accum_steps * args.log_interval) == 0:
                print(
                    '[%d/%d], [%d/%d] Loss: %.05f, subtree_acc: %.03f, '
                    'head_acc: %.03f, mask_lm: %.03f, total_acc: %.03f' %
                    (epoch_i, args.epoch, batch_i // args.accum_steps,
                     len(train_data) // args.accum_steps,
                     loss_value / total_batch_size,
                     tp_subtree / total_subtree * 100,
                     tp_head / total_head * 100,
                     tp_mask_lm / total_mask_lm * 100,
                     (tp_subtree + tp_head + tp_mask_lm) /
                     (total_subtree + total_head + total_mask_lm) * 100),
                    flush=True)
                loss_value, total_batch_size = 0., 0
                total_subtree, tp_subtree = 0, 0
                total_head, tp_head = 0, 0
                total_mask_lm, tp_mask_lm = 0, 0
                torch.cuda.empty_cache()
            if steps % (args.accum_steps * args.eval_interval) == 0:
                patience -= 1
                print('model evaluating starts...', flush=True)
                dev_acc = eval_model(model, dev_data,
                                     args.DATASET_MAX_SNT_LENGTH,
                                     args.BATCH_MAX_SNT_LENGTH, 'dev')
                if best_dev < dev_acc:
                    best_dev = dev_acc
                    patience = args.patience
                    model.save_models(
                        os.path.join(args.save_path, 'best.model/'))
                print('best performance: ACC: %.03f' % best_dev)
                print('model evaluating ends...', flush=True)
            if args.early_stop and patience < 0:
                break
            if steps % (args.accum_steps * args.save_interval) == 0:
                model.save_models(
                    os.path.join(args.save_path,
                                 str(steps // args.accum_steps) +
                                 '.steps.model/'))
            steps += 1
        if args.early_stop and patience < 0:
            print('early stop')
            break

    # ====== postprocess ====== #
    postprocess(args, start)
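
# --------------------------------------------------------------------------- #
# Toy trace of the patience counter used above: each evaluation costs one unit
# of patience and only a new best dev score refills it. The dev scores here
# are made up purely for illustration.
def patience_sketch(dev_scores=(0.52, 0.61, 0.60, 0.59, 0.58, 0.57), budget=3):
    patience, best_dev = budget, 0.
    for eval_i, dev_acc in enumerate(dev_scores, start=1):
        patience -= 1
        if dev_acc > best_dev:
            best_dev, patience = dev_acc, budget  # improvement resets patience
        if patience < 0:
            print('early stop at eval %d (best dev %.2f)' % (eval_i, best_dev))
            break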