def main(): args = load_args() ss = load_server_socket() if args.need_score_traces and args.beam_size <= 1: raise ValueError( "Score trace is only available for beam search with beam size > 1." ) if args.max_tgt_length >= args.max_seq_length - 2: raise ValueError("Maximum tgt length exceeds max seq length - 2.") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer.max_len = args.max_seq_length pair_num_relation = 0 bi_uni_pipeline = [] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seqDecoder( list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, max_tgt_length=args.max_tgt_length, new_segment_ids=args.new_segment_ids, mode="s2s", num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"]) def _get_token_id_set(s): r = None if s: w_list = [] for w in s.split('|'): if w.startswith('[') and w.endswith(']'): w_list.append(w.upper()) else: w_list.append(w) r = set(tokenizer.convert_tokens_to_ids(w_list)) return r forbid_ignore_set = _get_token_id_set(args.forbid_ignore_word) not_predict_set = _get_token_id_set(args.not_predict_token) print(args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=pair_num_relation, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, sos_id=sos_word_id, forbid_duplicate_ngrams=args.forbid_duplicate_ngrams, forbid_ignore_set=forbid_ignore_set, not_predict_set=not_predict_set, ngram_size=args.ngram_size, min_len=args.min_len, mode=args.mode, max_position_embeddings=args.max_seq_length, ffn_type=args.ffn_type, num_qkv=args.num_qkv, seg_emb=args.seg_emb, pos_shift=args.pos_shift) del model_recover if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) torch.cuda.empty_cache() model.eval() max_src_length = args.max_seq_length - 2 - args.max_tgt_length DONE_SIGNAL = 42 # bytescode of asterisk '*' while True: ss.listen(100) print("Waiting Connection:") cs, addr = ss.accept() data = bytes() while True: recv = cs.recv(1024) if 0 < len(recv) and DONE_SIGNAL == recv[-1]: data += recv[:len(recv) - 1] break data += recv print("Connection with:", addr) print("Received:", len(data)) input_lines = [ x.strip() for x in data.decode('utf-8').splitlines() ] if args.subset > 0: logger.info("Decoding subset: %d", args.subset) input_lines = input_lines[:args.subset] data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer input_lines = [ data_tokenizer.tokenize(x)[:max_src_length] for x in input_lines ] input_lines = list(enumerate(input_lines)) output_lines = [""] * len(input_lines) score_trace_list = [None] * len(input_lines) next_i = 0 while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf_id = [x[0] for x in _chunk] buf = [x[1] for x in _chunk] next_i += args.batch_size max_a_len = max([len(x) for x in buf]) instances = [] for instance in [(x, max_a_len) for x in buf]: for proc in bi_uni_pipeline: instances.append(proc(instance)) with torch.no_grad(): batch = seq2seq_loader.batch_list_to_batch_tensors( instances) batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, token_type_ids, position_ids, input_mask, mask_qkv, task_idx = batch traces = model(input_ids, token_type_ids, position_ids, input_mask, task_idx=task_idx, mask_qkv=mask_qkv) if args.beam_size > 1: traces = {k: v.tolist() for k, v in traces.items()} output_ids = traces['pred_seq'] else: output_ids = traces.tolist() qg_result = [] for i in range(len(buf)): w_ids = output_ids[i] output_buf = tokenizer.convert_ids_to_tokens(w_ids) output_tokens = [] for t in output_buf: if t in ("[SEP]", "[PAD]"): break output_tokens.append(t) output_sequence = ' '.join(detokenize(output_tokens)) qg_result.append(output_sequence) if args.need_score_traces: score_trace_list[buf_id[i]] = { 'scores': traces['scores'][i], 'wids': traces['wids'][i], 'ptrs': traces['ptrs'][i] } cs.sendall(ascii_print('\n'.join(qg_result))) cs.close() if args.output_file: fn_out = args.output_file else: fn_out = model_recover_path + '.' + args.split with open(fn_out, "w", encoding="utf-8") as fout: for l in output_lines: fout.write(l) fout.write("\n") if args.need_score_traces: with open(fn_out + ".trace.pickle", "wb") as fout_trace: pickle.dump( { "version": 0.0, "num_samples": len(input_lines) }, fout_trace) for x in score_trace_list: pickle.dump(x, fout_trace)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("segm_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) model = build_model(cfg, jpu=cfg.MODEL.JPU, lateral=cfg.MODEL.LATERAL) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision) amp_opt_level = 'O1' if use_mixed_precision else 'O0' model = amp.initialize(model, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, find_unused_parameters=True, ) output_dir = cfg.OUTPUT_DIR checkpointer = Checkpointer(model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) dataset_name = cfg.DATA.DATASET if cfg.OUTPUT_DIR: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) data_loader_val = make_data_loader(cfg, stage='test', is_distributed=distributed) inference( model, data_loader_val, dataset_name=dataset_name, device=cfg.MODEL.DEVICE, output_folder=output_folder, ) synchronize()
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") # decoding parameters parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument("--input_file", type=str, help="Input file") parser.add_argument('--subset', type=int, default=0, help="Decode a subset of the input dataset.") parser.add_argument("--output_file", type=str, help="output file") parser.add_argument("--split", type=str, default="", help="Data split (train/val/test).") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Ignore the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=None, type=int) parser.add_argument('--need_score_traces', action='store_true') parser.add_argument('--ngram_size', type=int, default=3) parser.add_argument('--mode', default="s2s", choices=["s2s", "l2r", "both"]) parser.add_argument('--max_tgt_length', type=int, default=128, help="maximum length of target sequence") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") parser.add_argument('--not_predict_token', type=str, default=None, help="Do not predict the tokens during decoding.") args = parser.parse_args() if args.need_score_traces and args.beam_size <= 1: raise ValueError( "Score trace is only available for beam search with beam size > 1." ) if args.max_tgt_length >= args.max_seq_length - 2: raise ValueError("Maximum tgt length exceeds max seq length - 2.") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer.max_len = args.max_seq_length pair_num_relation = 0 bi_uni_pipeline = [] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seqDecoder( list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, max_tgt_length=args.max_tgt_length, new_segment_ids=args.new_segment_ids, mode="s2s", num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"]) def _get_token_id_set(s): r = None if s: w_list = [] for w in s.split('|'): if w.startswith('[') and w.endswith(']'): w_list.append(w.upper()) else: w_list.append(w) r = set(tokenizer.convert_tokens_to_ids(w_list)) return r forbid_ignore_set = _get_token_id_set(args.forbid_ignore_word) not_predict_set = _get_token_id_set(args.not_predict_token) print(args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=pair_num_relation, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, sos_id=sos_word_id, forbid_duplicate_ngrams=args.forbid_duplicate_ngrams, forbid_ignore_set=forbid_ignore_set, not_predict_set=not_predict_set, ngram_size=args.ngram_size, min_len=args.min_len, mode=args.mode, max_position_embeddings=args.max_seq_length, ffn_type=args.ffn_type, num_qkv=args.num_qkv, seg_emb=args.seg_emb, pos_shift=args.pos_shift) del model_recover if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) torch.cuda.empty_cache() model.eval() next_i = 0 max_src_length = args.max_seq_length - 2 - args.max_tgt_length with open(args.input_file, encoding="utf-8") as fin: input_lines = [x.strip() for x in fin.readlines()] if args.subset > 0: logger.info("Decoding subset: %d", args.subset) input_lines = input_lines[:args.subset] data_tokenizer = tokenizer input_lines = [ data_tokenizer.tokenize(x)[:max_src_length] for x in input_lines ] input_lines = sorted(list(enumerate(input_lines)), key=lambda x: -len(x[1])) output_lines = [""] * len(input_lines) score_trace_list = [None] * len(input_lines) total_batch = math.ceil(len(input_lines) / args.batch_size) with tqdm(total=total_batch) as pbar: while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf_id = [x[0] for x in _chunk] buf = [x[1] for x in _chunk] next_i += args.batch_size max_a_len = max([len(x) for x in buf]) instances = [] for instance in [(x, max_a_len) for x in buf]: for proc in bi_uni_pipeline: instances.append(proc(instance)) with torch.no_grad(): batch = seq2seq_loader.batch_list_to_batch_tensors( instances) batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, token_type_ids, position_ids, input_mask, mask_qkv, task_idx = batch traces = model(input_ids, token_type_ids, position_ids, input_mask, task_idx=task_idx, mask_qkv=mask_qkv) if args.beam_size > 1: traces = {k: v.tolist() for k, v in traces.items()} output_ids = traces['pred_seq'] else: output_ids = traces.tolist() for i in range(len(buf)): w_ids = output_ids[i] output_buf = tokenizer.convert_ids_to_tokens(w_ids) output_tokens = [] for t in output_buf: if t in ("[SEP]", "[PAD]"): break output_tokens.append(t) output_sequence = ' '.join(detokenize(output_tokens)) output_lines[buf_id[i]] = output_sequence if args.need_score_traces: score_trace_list[buf_id[i]] = { 'scores': traces['scores'][i], 'wids': traces['wids'][i], 'ptrs': traces['ptrs'][i] } pbar.update(1) if args.output_file: fn_out = args.output_file else: fn_out = model_recover_path + '.' + args.split with open(fn_out, "w", encoding="utf-8") as fout: for l in output_lines: fout.write(l) fout.write("\n") if args.need_score_traces: with open(fn_out + ".trace.pickle", "wb") as fout_trace: pickle.dump({ "version": 0.0, "num_samples": len(input_lines) }, fout_trace) for x in score_trace_list: pickle.dump(x, fout_trace)
def predict(): """ Predicts file directory with network specified by files to output path """ import numpy as np import torch from tqdm import tqdm import os from matplotlib import pyplot as plt from ..utils import Config from ..layer import HomogeneousShapeLayer from ..networks import SingleShapeNetwork from shapedata.single_shape import SingleShapeDataProcessing, \ SingleShapeSingleImage2D from shapedata.io import pts_exporter import argparse parser = argparse.ArgumentParser() parser.add_argument("-v", "--visualize", action="store_true", help="If Flag is specified, results will be plotted") parser.add_argument("-d", "--in_path", type=str, help="Input Data Dir") parser.add_argument("-s", "--out_path", default="./outputs", type=str, help="Output Data Dir") parser.add_argument("-w", "--weight_file", type=str, help="Model Weights") parser.add_argument("-c", "--config_file", type=str, help="Configuration") args = parser.parse_args() config = Config() config_dict = config(os.path.abspath(args.config_file)) try: net = torch.jit.load(os.path.abspath(args.weight_file)) net.eval() net.cpu() except RuntimeError: net_layer = HomogeneousShapeLayer if config_dict["training"].pop("mixed_prec", False): try: from apex import amp amp.init() except: pass shapes = np.load( os.path.abspath(config_dict["layer"].pop("pca_path")) )["shapes"][:config_dict["layer"].pop("num_shape_params") + 1] net = SingleShapeNetwork(net_layer, { "shapes": shapes, **config_dict["layer"] }, img_size=config_dict["data"]["img_size"], **config_dict["network"]) state = torch.load(os.path.abspath(args.weight_file)) try: net.load_state_dict(state["state_dict"]["model"]) except KeyError: try: net.load_state_dict(state["model"]) except KeyError: net.load_state_dict(state) net = net.to("cpu") net = net.eval() data = SingleShapeDataProcessing._get_files(os.path.abspath(args.in_path), extensions=[".png", ".jpg"]) def process_sample(sample, img_size, net, device, crop=0.1): lmk_bounds = sample.get_landmark_bounds(sample.lmk) min_y, min_x, max_y, max_x = lmk_bounds range_x = max_x - min_x range_y = max_y - min_y max_range = max(range_x, range_y) * (1 + crop) center_x = min_x + range_x / 2 center_y = min_y + range_y / 2 tmp = sample.crop(center_y - max_range / 2, center_x - max_range / 2, center_y + max_range / 2, center_x + max_range / 2) img_tensor = torch.from_numpy(tmp.to_grayscale().resize( (img_size, img_size)).img.transpose(2, 0, 1)).to( torch.float).unsqueeze(0).to(device) pred = net(img_tensor).cpu().numpy()[0] pred = pred * np.array([max_range / img_size, max_range / img_size]) pred = pred + np.asarray( [center_y - max_range / 2, center_x - max_range / 2]) return pred device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with torch.no_grad(): if torch.cuda.is_available(): net = net.cuda() if args.visualize: pred_path = os.path.join(os.path.abspath(args.out_path), "pred") vis_path = os.path.join(os.path.abspath(args.out_path), "visualization") os.makedirs(vis_path, exist_ok=True) else: pred_path = os.path.abspath(args.out_path) os.makedirs(pred_path, exist_ok=True) for idx, file in enumerate(tqdm(data)): _data = SingleShapeSingleImage2D.from_files(file) pred = process_sample(_data, img_size=config_dict["data"]["img_size"], net=net, device=device) fname = os.path.split(_data.img_file)[-1].rsplit(".", 1)[0] if args.visualize: view_kwargs = {} if _data.is_gray: view_kwargs["cmap"] = "gray" fig = _data.view(True, **view_kwargs) plt.gca().scatter(pred[:, 1], pred[:, 0], s=5, c="C1") plt.gca().legend(["GT", "Pred"]) plt.gcf().savefig(os.path.join(vis_path, fname + ".png")) plt.close() _data.save(pred_path, fname, "PTS") pts_exporter(pred, os.path.join(pred_path, fname + "_pred.pts"))
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--dev_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--dev_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--ks_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--ks_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--ks_dev_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--ks_dev_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--predict_input_file", default=None, type=str, help="predict_input_file") parser.add_argument("--predict_output_file", default=None, type=str, help="predict_output_file") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") parser.add_argument("--predict_bleu", default=0.5, type=float, help="The Predicted Bleu for KS Predict ") parser.add_argument("--train_vae", action='store_true', help="Whether to train vae.") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run ks predict.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. " "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) handler = logging.FileHandler(os.path.join(args.log_dir, "train.log"), encoding='UTF-8') handler.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) console = logging.StreamHandler() console.setLevel(logging.DEBUG) logger.addHandler(handler) logger.addHandler(console) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() print("Loading QKR Train Dataset", args.data_dir) bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join(args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join(args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') train_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) print("Loading KS Train Dataset", args.data_dir) ks_fn_src = os.path.join(args.data_dir, args.ks_src_file) ks_fn_tgt = os.path.join(args.data_dir, args.ks_tgt_file) ks_train_dataset = seq2seq_loader.Seq2SeqDataset( ks_fn_src, ks_fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: ks_train_sampler = RandomSampler(ks_train_dataset, replacement=False) _batch_size = args.train_batch_size else: ks_train_sampler = DistributedSampler(ks_train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() ks_train_dataloader = torch.utils.data.DataLoader( ks_train_dataset, batch_size=_batch_size, sampler=ks_train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) logger.info("Loading QKR Eval Dataset from {}".format(args.data_dir)) fn_src = os.path.join(args.data_dir, args.dev_src_file) fn_tgt = os.path.join(args.data_dir, args.dev_tgt_file) dev_reddit_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.eval_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: dev_reddit_sampler = RandomSampler(dev_reddit_dataset, replacement=False) _batch_size = args.eval_batch_size else: dev_reddit_sampler = DistributedSampler(dev_reddit_dataset) _batch_size = args.eval_batch_size // dist.get_world_size() dev_reddit_dataloader = torch.utils.data.DataLoader( dev_reddit_dataset, batch_size=_batch_size, sampler=dev_reddit_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) logger.info("Loading KS Eval Dataset from {}".format(args.data_dir)) ks_dev_fn_src = os.path.join(args.data_dir, args.ks_dev_src_file) ks_dev_fn_tgt = os.path.join(args.data_dir, args.ks_dev_tgt_file) ks_dev_reddit_dataset = seq2seq_loader.Seq2SeqDataset( ks_dev_fn_src, ks_dev_fn_tgt, args.eval_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: ks_dev_reddit_sampler = RandomSampler(ks_dev_reddit_dataset, replacement=False) _batch_size = args.eval_batch_size else: ks_dev_reddit_sampler = DistributedSampler(ks_dev_reddit_dataset) _batch_size = args.eval_batch_size // dist.get_world_size() ks_dev_reddit_dataloader = torch.utils.data.DataLoader( ks_dev_reddit_dataset, batch_size=_batch_size, sampler=ks_dev_reddit_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size) t_total = int( len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) global_step = 0 else: if recover_step: logger.info(" ** ** * Recover model: %d ** ** * ", recover_step) model_recover = torch.load(os.path.join( args.output_dir, "model.{0}.bin".format(recover_step)), map_location='cpu') # recover_step == number of epochs global_step = math.floor(recover_step * t_total / args.num_train_epochs) elif args.model_recover_path: logger.info(" ** ** * Recover model: %s ** ** * ", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if recover_step: logger.info(" ** ** * Recover optimizer: %d ** ** * ", recover_step) optim_recover = torch.load(os.path.join( args.output_dir, "optim.{0}.bin".format(recover_step)), map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info( " ** ** * Recover optimizer: dynamic_loss_scale ** ** * ") optimizer.dynamic_loss_scale = True logger.info(" ** ** * CUDA.empty_cache() ** ** * ") torch.cuda.empty_cache() if args.do_train: KL_weight = 0.0 logger.info(" ** ** * Running training ** ** * ") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs) + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) step = 0 for batch, ks_batch in zip(train_dataloader, ks_train_dataloader): batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels, train_vae=args.train_vae) if args.train_vae: masked_lm_loss, next_sentence_loss, KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_loss = KL_loss.mean() else: masked_lm_loss, next_sentence_loss, _ = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_weight += 1.0 / float(len(ks_train_dataloader)) if args.train_vae: loss = masked_lm_loss + next_sentence_loss + KL_weight * KL_loss else: loss = masked_lm_loss + next_sentence_loss logger.info("In{}step, masked_lm_loss:{}".format( step, masked_lm_loss)) logger.info("In{}step, KL_weight:{}".format(step, KL_weight)) #logger.info("In{}step, KL_loss:{}".format(step, KL_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if random.randint(0, 0) == 0: ks_batch = [ t.to(device) if t is not None else None for t in ks_batch ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = ks_batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels, train_ks=True, train_vae=args.train_vae) if args.train_vae: ks_loss, KS_KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. ks_loss = ks_loss.mean() KS_KL_loss = KS_KL_loss.mean() loss = ks_loss + KL_weight * KS_KL_loss else: ks_loss, _ = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. ks_loss = ks_loss.mean() loss = ks_loss logger.info("In{}step, ks_loss:{}".format(step, ks_loss)) #logger.info("In{}step, KS_KL_loss:{}".format(step, KS_KL_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step / t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 step += 1 if (step + 1) % 200 == 0: logger.info(" ** ** * Running QKR evaling ** ** * ") logger.info(" Batch size = %d", args.eval_batch_size) if args.local_rank != -1: train_sampler.set_epoch(i_epoch) dev_iter_bar = tqdm(dev_reddit_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) total_lm_loss = 0 total_kl_loss = 0 for qkr_dev_step, batch in enumerate(dev_iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels, train_vae=args.train_vae) masked_lm_loss, next_sentence_loss, KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. # loss = loss.mean() masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_loss = KL_loss.mean() # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) dev_iter_bar.set_description('Iter (loss=%5.3f)' % masked_lm_loss.item()) total_lm_loss += masked_lm_loss.item() total_kl_loss += KL_loss.item() # ensure that accumlated gradients are normalized total_mean_lm_loss = total_lm_loss / (qkr_dev_step + 1) total_mean_kl_loss = total_kl_loss / (qkr_dev_step + 1) logger.info("** ** * Evaling mean loss ** ** * ") logger.info("In{}epoch,dev_lm_loss:{}".format( i_epoch, total_mean_lm_loss)) logger.info("In{}epoch,dev_kl_loss:{}".format( i_epoch, total_mean_kl_loss)) logger.info("******************************************* ") logger.info(" ** ** * Running KS evaling ** ** * ") logger.info(" Batch size = %d", args.eval_batch_size) ks_dev_iter_bar = tqdm(ks_dev_reddit_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) total_ks_loss = 0 total_ks_kl_loss = 0 for ks_dev_step, batch in enumerate(ks_dev_iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels, train_ks=True, train_vae=args.train_vae) ks_loss, KS_KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. # loss = loss.mean() ks_loss = ks_loss.mean() KS_KL_loss = KS_KL_loss.mean() # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) ks_dev_iter_bar.set_description( 'Iter (loss=%5.3f)' % ks_loss.item()) total_ks_loss += ks_loss.item() total_ks_kl_loss += KS_KL_loss.item() total_mean_ks_loss = total_ks_loss / (ks_dev_step + 1) total_mean_ks_kl_loss = total_ks_kl_loss / (ks_dev_step + 1) logger.info("** ** * Evaling mean loss ** ** * ") logger.info("In{}epoch,dev_ks_loss:{}".format( i_epoch, total_mean_ks_loss)) logger.info("In{}epoch,dev_ks_kl_loss:{}".format( i_epoch, total_mean_ks_kl_loss)) total_mean_loss = total_mean_lm_loss + total_mean_kl_loss + total_mean_ks_loss + total_mean_ks_kl_loss logger.info("In{}epoch,dev_loss:{}".format( i_epoch, total_mean_loss)) logger.info("******************************************* ") # Save a trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * " ) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{}_{}_{}.bin".format( i_epoch, step, round(total_mean_loss, 4))) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.bin") torch.save(optimizer.state_dict(), output_optim_file) logger.info(" ** ** * CUDA.empty_cache() ** ** * ") torch.cuda.empty_cache() if args.do_predict: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq_predict( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] next_i = 0 model.eval() with open(os.path.join(args.data_dir, args.predict_input_file), "r", encoding="utf-8") as file: src_file = file.readlines() with open("train_tgt_pad.empty", "r", encoding="utf-8") as file: tgt_file = file.readlines() with open(os.path.join(args.data_dir, args.predict_output_file), "w", encoding="utf-8") as out: while next_i < len(src_file): print(next_i) batch_src = src_file[next_i:next_i + args.eval_batch_size] batch_tgt = tgt_file[next_i:next_i + args.eval_batch_size] next_i += args.eval_batch_size ex_list = [] for src, tgt in zip(batch_src, batch_tgt): src_tk = data_tokenizer.tokenize(src.strip()) tgt_tk = data_tokenizer.tokenize(tgt.strip()) ex_list.append((src_tk, tgt_tk)) batch = [] for idx in range(len(ex_list)): instance = ex_list[idx] for proc in bi_uni_pipeline: instance = proc(instance) batch.append(instance) batch_tensor = seq2seq_loader.batch_list_to_batch_tensors( batch) batch = [ t.to(device) if t is not None else None for t in batch_tensor ] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch predict_bleu = args.predict_bleu * torch.ones( [input_ids.shape[0]], device=input_ids.device) # B oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=predict_bleu, train_ks=True, train_vae=args.train_vae) logits = torch.nn.functional.softmax(logits, dim=1) labels = logits[:, 1].cpu().numpy() # print(labels) for i in range(len(labels)): line = batch_src[i].strip() line += "\t" line += str(labels[i]) out.write(line) out.write("\n")
def main(): args = process_args() if args.loss_type == 'mlm': assert args.neg_num == 0 and args.multiple_neg == 0 elif args.loss_type == 'nsp': assert int(args.bi_prob) == 1 and args.max_pred == 0 and args.neg_num > 0 if args.adaptive_weight == 1: assert args.neg_num > 1 if args.add_boundary == 1: assert args.inc_full_hist if args.world_size > 1: print('global_rank: {}, local rank: {}'.format(args.global_rank, args.local_rank)) # Input format: [CLS] img [SEP] hist [SEP_0] ques [SEP_1] ans [SEP] args.max_seq_length = args.len_vis_input + 2 + args.max_len_hist_ques + 2 + args.max_len_ans + 1 args.mask_image_regions = (args.vis_mask_prob > 0) # whether to mask out image regions args.dist_url = args.dist_url.replace('[PT_OUTPUT_DIR]', args.output_dir) # arguments inspection assert args.enable_butd, 'only support region attn! featmap attn deprecated' if args.enable_butd: if args.visdial_v == '1.0': assert (args.len_vis_input == 36) or (args.len_vis_input == 0) elif args.visdial_v == '0.9': if (args.len_vis_input == 100): args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join(args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' # output config os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join( args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) logging.basicConfig( filename=os.path.join(args.output_dir, args.log_file), filemode='w', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) ch = logging.StreamHandler(sys.stdout) ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')) ch.setLevel(logging.INFO) logger.addHandler(ch) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url, world_size=args.world_size, rank=args.global_rank) logger.info('Arguments: %s\n' % (' '.join(sys.argv[:]))) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int( args.train_batch_size / args.gradient_accumulation_steps) # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # plotting loss, optional if args.enable_visdom: import visdom vis = visdom.Visdom(port=args.visdom_port, env=args.output_dir) vis_window = {'iter': None, 'score': None} tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank)) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer() if args.tokenized_input else tokenizer assert args.do_train logger.info('Max seq length: %d, batch size: %d\n' % (args.max_seq_length, args.train_batch_size)) bi_uni_pipeline = [Preprocess4TrainVisdial(args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'len_vis_input': args.len_vis_input, 'max_len_hist_ques': args.max_len_hist_ques, 'max_len_ans': args.max_len_ans}, mask_image_regions=args.mask_image_regions, mode="s2s", vis_mask_prob=args.vis_mask_prob, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, image_features_hdfpath=args.image_features_hdfpath, visdial_v=args.visdial_v, pad_hist=args.pad_hist, finetune=args.finetune, only_mask_ans=args.only_mask_ans, add_boundary=args.add_boundary, only_qa=args.only_qa), Preprocess4TrainVisdial(args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'len_vis_input': args.len_vis_input, 'max_len_hist_ques': args.max_len_hist_ques, 'max_len_ans': args.max_len_ans}, mask_image_regions=args.mask_image_regions, mode="bi", vis_mask_prob=args.vis_mask_prob, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, image_features_hdfpath=args.image_features_hdfpath, visdial_v=args.visdial_v, pad_hist=args.pad_hist, finetune=args.finetune, only_mask_ans=args.only_mask_ans, add_boundary=args.add_boundary, only_qa=args.only_qa)] train_dataset = VisdialDataset( args.src_file, args.train_batch_size, data_tokenizer, use_num_imgs=args.use_num_imgs, bi_uni_pipeline=bi_uni_pipeline, s2s_prob=args.s2s_prob, bi_prob=args.bi_prob, is_train=args.do_train, neg_num=args.neg_num, inc_gt_rel=args.inc_gt_rel, inc_full_hist=args.inc_full_hist, just_for_pretrain=args.just_for_pretrain, sub_sample=args.sub_sample) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) t_total = int(len(train_dataloader) * args.num_train_epochs * 1. / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 relax_projection = 4 if args.relax_projection else 0 task_idx_proj = 3 if args.tasks == 'img2txt' else 0 mask_word_id, eos_word_ids, pad_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[PAD]"]) # index in BERT vocab: 103, 102, 0 if (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init assert args.scst == False, 'must init from maximum likelihood training' _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, visdial_v=args.visdial_v, loss_type=args.loss_type, neg_num=args.neg_num, adaptive_weight=args.adaptive_weight, add_attn_fuse=args.add_attn_fuse, no_h0=args.no_h0, no_vision=args.no_vision) global_step = 0 else: if args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path) global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, visdial_v=args.visdial_v, loss_type=args.loss_type, neg_num=args.neg_num, adaptive_weight=args.adaptive_weight, add_attn_fuse=args.add_attn_fuse, no_h0=args.no_h0, no_vision=args.no_vision) del model_recover torch.cuda.empty_cache() if args.fp16: model.half() # cnn.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State( optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, schedule=args.sche_mode, t_total=t_total) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) logger.info(" Loader length = %d", len(train_dataloader)) model.train() start_epoch = 1 logger.info("Begin training from epoch = %d", start_epoch) t0 = time.time() for i_epoch in trange(start_epoch, args.num_train_epochs + 1, desc="Epoch"): if args.multiple_neg and i_epoch > 1: train_dataset = VisdialDataset( args.src_file, args.train_batch_size, data_tokenizer, use_num_imgs=args.use_num_imgs, bi_uni_pipeline=bi_uni_pipeline, s2s_prob=args.s2s_prob, bi_prob=args.bi_prob, is_train=args.do_train, neg_num=args.neg_num, inc_gt_rel=args.inc_gt_rel, inc_full_hist=args.inc_full_hist, just_for_pretrain=args.just_for_pretrain, sub_sample=args.sub_sample) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) if args.local_rank >= 0: train_sampler.set_epoch(i_epoch - 1) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') nbatches = len(train_dataloader) losses = [] pretext_loss = [] mlm_losses = [] nsp_losses = [] for step, batch in enumerate(iter_bar): batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, is_next, \ task_idx, vis_masked_pos, img, vis_pe = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data loss_tuple = model(conv_feats, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, vis_masked_pos=vis_masked_pos, mask_image_regions=args.mask_image_regions, drop_worst_ratio=args.max_drop_worst_ratio if i_epoch > args.drop_after else 0) # disable pretext_loss_deprecated for now masked_lm_loss, pretext_loss_deprecated, nsp_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. For dist, this is done through gradient addition. masked_lm_loss = masked_lm_loss.mean() pretext_loss_deprecated = pretext_loss_deprecated.mean() nsp_loss = nsp_loss.mean() loss = masked_lm_loss + pretext_loss_deprecated + nsp_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) losses.append(loss.item()) mlm_losses.append(masked_lm_loss.item()) pretext_loss.append(pretext_loss_deprecated.item()) nsp_losses.append(nsp_loss.item()) if step % max(1, nbatches // 10) == 0: logger.info( "Epoch {}, Iter {}, Loss {:.4f}, MLM {:.4f}, NSP {:.4f}, Elapse time {:.2f}\n".format( i_epoch, step, np.mean(losses), np.mean(mlm_losses), np.mean(nsp_losses), time.time() - t0)) if args.enable_visdom: if vis_window['iter'] is None: vis_window['iter'] = vis.line( X=np.tile(np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack((np.asarray([np.mean(losses)]),)), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total']) ) else: vis.line( X=np.tile(np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack((np.asarray([np.mean(losses)]),)), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total']), win=vis_window['iter'], update='append' ) # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step / t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.%d.%.3f.bin" % (i_epoch, np.mean(losses))) if args.global_rank in (-1, 0): # save model if the first device or no dist torch.save(copy.deepcopy(model_to_save).cpu().state_dict(), output_model_file) logger.info("Save model to %s", output_model_file) logger.info("Finish training epoch %d, avg loss: %.2f and takes %.2f seconds" % ( i_epoch, np.mean(losses), time.time() - t0)) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.world_size > 1: torch.distributed.barrier()
def train(args): """Train with the given args. Args: args (namespace): The program arguments. """ set_deterministic_pytorch(args) if args.num_encs > 1: args = format_mulenc_args(args) # check cuda availability if not torch.cuda.is_available(): logging.warning("cuda is not available") # get input and output dimension info with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] utts = list(valid_json.keys()) idim_list = [ int(valid_json[utts[0]]["input"][i]["shape"][-1]) for i in range(args.num_encs) ] odim = int(valid_json[utts[0]]["output"][0]["shape"][-1]) for i in range(args.num_encs): logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i])) logging.info("#output dims: " + str(odim)) # specify attention, CTC, hybrid mode if args.mtlalpha == 1.0: mtl_mode = "ctc" logging.info("Pure CTC mode") elif args.mtlalpha == 0.0: mtl_mode = "att" logging.info("Pure attention mode") else: mtl_mode = "mtl" logging.info("Multitask learning mode") if (args.enc_init is not None or args.dec_init is not None) and args.num_encs == 1: model = load_trained_modules(idim_list[0], odim, args) else: model_class = dynamic_import(args.model_module) model = model_class( idim_list[0] if args.num_encs == 1 else idim_list, odim, args ) assert isinstance(model, ASRInterface) logging.info( " Total parameter of the model = " + str(sum(p.numel() for p in model.parameters())) ) if args.rnnlm is not None: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit) ) torch_load(args.rnnlm, rnnlm) model.rnnlm = rnnlm # write model config if not os.path.exists(args.outdir): os.makedirs(args.outdir) model_conf = args.outdir + "/model.json" with open(model_conf, "wb") as f: logging.info("writing a model config file to " + model_conf) f.write( json.dumps( (idim_list[0] if args.num_encs == 1 else idim_list, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True, ).encode("utf_8") ) for key in sorted(vars(args).keys()): logging.info("ARGS: " + key + ": " + str(vars(args)[key])) reporter = model.reporter # check the use of multi-gpu if args.ngpu > 1: if args.batch_size != 0: logging.warning( "batch size is automatically increased (%d -> %d)" % (args.batch_size, args.batch_size * args.ngpu) ) args.batch_size *= args.ngpu if args.num_encs > 1: # TODO(ruizhili): implement data parallel for multi-encoder setup. raise NotImplementedError( "Data parallel is not supported for multi-encoder setup." ) # set torch device device = torch.device("cuda" if args.ngpu > 0 else "cpu") if args.train_dtype in ("float16", "float32", "float64"): dtype = getattr(torch, args.train_dtype) else: dtype = torch.float32 model = model.to(device=device, dtype=dtype) # Setup an optimizer if args.opt == "adadelta": optimizer = torch.optim.Adadelta( model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay ) elif args.opt == "adam": optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay) elif args.opt == "noam": from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt optimizer = get_std_opt( model.parameters(), args.adim, args.transformer_warmup_steps, args.transformer_lr ) elif args.opt == "rmsprop": optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0008, alpha=0.95) else: raise NotImplementedError("unknown optimizer: " + args.opt) # setup apex.amp if args.train_dtype in ("O0", "O1", "O2", "O3"): try: from apex import amp except ImportError as e: logging.error( f"You need to install apex for --train-dtype {args.train_dtype}. " "See https://github.com/NVIDIA/apex#linux" ) raise e if args.opt == "noam": model, optimizer.optimizer = amp.initialize( model, optimizer.optimizer, opt_level=args.train_dtype ) else: model, optimizer = amp.initialize( model, optimizer, opt_level=args.train_dtype ) use_apex = True from espnet.nets.pytorch_backend.ctc import CTC amp.register_float_function(CTC, "loss_fn") amp.init() logging.warning("register ctc as float function") else: use_apex = False # FIXME: TOO DIRTY HACK setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) # Setup a converter if args.num_encs == 1: converter = CustomConverter(subsampling_factor=model.subsample[0], dtype=dtype) else: converter = CustomConverterMulEnc( [i[0] for i in model.subsample_list], dtype=dtype ) # read json data with open(args.train_json, "rb") as f: train_json = json.load(f)["utts"] with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 # make minibatch list (variable length) train = make_batchset( train_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, shortest_first=use_sortagrad, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, ) valid = make_batchset( valid_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, ) load_tr = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": True}, # Switch the mode of preprocessing ) load_cv = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": False}, # Switch the mode of preprocessing ) # hack to make batchsize argument as 1 # actual bathsize is included in a list # default collate function converts numpy array to pytorch tensor # we used an empty collate function instead which returns list train_iter = ChainerDataLoader( dataset=TransformDataset(train, lambda data: converter([load_tr(data)])), batch_size=1, num_workers=args.n_iter_processes, shuffle=not use_sortagrad, collate_fn=lambda x: x[0], ) valid_iter = ChainerDataLoader( dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])), batch_size=1, shuffle=False, collate_fn=lambda x: x[0], num_workers=args.n_iter_processes, ) # Set up a trainer updater = CustomUpdater( model, args.grad_clip, {"main": train_iter}, optimizer, device, args.ngpu, args.grad_noise, args.accum_grad, use_apex=use_apex, ) trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir) if use_sortagrad: trainer.extend( ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"), ) # Resume from a snapshot if args.resume: logging.info("resumed from %s" % args.resume) torch_resume(args.resume, trainer) # Evaluate the model with the test dataset for each epoch if args.save_interval_iters > 0: trainer.extend( CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu), trigger=(args.save_interval_iters, "iteration"), ) else: trainer.extend( CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu) ) # Save attention weight each epoch if args.num_save_attention > 0 and args.mtlalpha != 1.0 and "transformer" in args.model_module: data = sorted( list(valid_json.items())[: args.num_save_attention], key=lambda x: int(x[1]["input"][0]["shape"][1]), reverse=True, ) if hasattr(model, "module"): att_vis_fn = model.module.calculate_all_attentions plot_class = model.module.attention_plot_class else: att_vis_fn = model.calculate_all_attentions plot_class = model.attention_plot_class att_reporter = plot_class( att_vis_fn, data, args.outdir + "/att_ws", converter=converter, transform=load_cv, device=device, ) trainer.extend(att_reporter, trigger=(1, "epoch")) else: att_reporter = None # Make a plot for training and validation values if args.num_encs > 1: report_keys_loss_ctc = [ "main/loss_ctc{}".format(i + 1) for i in range(model.num_encs) ] + ["validation/main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)] report_keys_cer_ctc = [ "main/cer_ctc{}".format(i + 1) for i in range(model.num_encs) ] + ["validation/main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)] trainer.extend( extensions.PlotReport( [ "main/loss", "validation/main/loss", "main/loss_ctc", "validation/main/loss_ctc", "main/loss_att", "validation/main/loss_att", ] + ([] if args.num_encs == 1 else report_keys_loss_ctc), "epoch", file_name="loss.png", ) ) trainer.extend( extensions.PlotReport( ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png" ) ) trainer.extend( extensions.PlotReport( ["main/cer_ctc", "validation/main/cer_ctc"] + ([] if args.num_encs == 1 else report_keys_loss_ctc), "epoch", file_name="cer.png", ) ) # Save best models trainer.extend( snapshot_object(model, "model.loss.best"), trigger=training.triggers.MinValueTrigger("validation/main/loss"), ) if mtl_mode != "ctc": trainer.extend( snapshot_object(model, "model.acc.best"), trigger=training.triggers.MaxValueTrigger("validation/main/acc"), ) # save snapshot which contains model and optimizer states if args.save_interval_iters > 0: trainer.extend( torch_snapshot(filename="snapshot.iter.{.updater.iteration}"), trigger=(args.save_interval_iters, "iteration"), ) else: trainer.extend(torch_snapshot(), trigger=(1, "epoch")) # epsilon decay in the optimizer if args.opt == "adadelta": if args.criterion == "acc" and mtl_mode != "ctc": trainer.extend( restore_snapshot( model, args.outdir + "/model.acc.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) elif args.criterion == "loss": trainer.extend( restore_snapshot( model, args.outdir + "/model.loss.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) # lr decay in rmsprop if args.opt == "rmsprop": if args.criterion == "acc" and mtl_mode != "ctc": trainer.extend( restore_snapshot( model, args.outdir + "/model.acc.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) trainer.extend( rmsprop_lr_decay(args.lr_decay), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) elif args.criterion == "loss": trainer.extend( restore_snapshot( model, args.outdir + "/model.loss.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) trainer.extend( rmsprop_lr_decay(args.lr_decay), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) # Write a log of evaluation statistics for each epoch trainer.extend( extensions.LogReport(trigger=(args.report_interval_iters, "iteration")) ) report_keys = [ "epoch", "iteration", "main/loss", "main/loss_ctc", "main/loss_att", "validation/main/loss", "validation/main/loss_ctc", "validation/main/loss_att", "main/acc", "validation/main/acc", "main/cer_ctc", "validation/main/cer_ctc", "elapsed_time", ] + ([] if args.num_encs == 1 else report_keys_cer_ctc + report_keys_loss_ctc) if args.opt == "adadelta": trainer.extend( extensions.observe_value( "eps", lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][ "eps" ], ), trigger=(args.report_interval_iters, "iteration"), ) report_keys.append("eps") if args.opt == "rmsprop": trainer.extend( extensions.observe_value( "lr", lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][ "lr" ], ), trigger=(args.report_interval_iters, "iteration"), ) report_keys.append("lr") if args.report_cer: report_keys.append("validation/main/cer") if args.report_wer: report_keys.append("validation/main/wer") trainer.extend( extensions.PrintReport(report_keys), trigger=(args.report_interval_iters, "iteration"), ) trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters)) set_early_stop(trainer, args) if args.tensorboard_dir is not None and args.tensorboard_dir != "": trainer.extend( TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter), trigger=(args.report_interval_iters, "iteration"), ) # Run the training trainer.run() check_early_stop(trainer, args.epochs)
def __init__(self, optimizer: optim.Optimizer, **kwargs): from apex import amp self.optimizer = optimizer self.amp = amp.init()
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="./configs/seq.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.deprecated.init_process_group(backend="nccl", init_method="env://") cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("masktextspotterv3", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) checkpointer = DetectronCheckpointer(cfg, model) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) output_folders = [None] * len(cfg.DATASETS.TEST) if cfg.OUTPUT_DIR: dataset_names = cfg.DATASETS.TEST for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) model_name = cfg.MODEL.WEIGHT.split('/')[-1] for output_folder, data_loader_val in zip(output_folders, data_loaders_val): inference( model, data_loader_val, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, model_name=model_name, cfg=cfg, ) synchronize()
def setUp(self): self.handle = amp.init(enabled=True) self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) common_init(self)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) parser.add_argument( "--build-model", default="", metavar="FILE", help="path to NAS model build file", type=str, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) assert len(args.build_model) != 0, 'args.build_model should be provided' model_config = json.load(open(args.build_model, 'r')) if isinstance(model_config, list): assert len(model_config) == 1 model_config = model_config[0] print('Testing single model:', model_config) model = build_detection_model(cfg, model_config) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) # output_folders = [None] * len(cfg.DATASETS.TEST) # dataset_names = cfg.DATASETS.TEST dataset_names = cfg.DATASETS.NAS_VAL if not cfg.NAS.TRAIN_SINGLE_MODEL else cfg.DATASETS.TEST output_folders = [None] * len(dataset_names) if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, test_only=cfg.TEST_ONLY) if cfg.NAS.TRAIN_SINGLE_MODEL: if get_rank() == 0: print('==' * 20, 'Evaluating single model...', '==' * 20) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, c2d_json_path=cfg.MODEL.SEG_BRANCH.JSON_PATH, cfg=cfg, test_only=cfg.TEST_ONLY) synchronize() elif not cfg.NAS.SKIP_NAS_TEST: if get_rank() == 0: print('==' * 10, 'Start NAS testing', '==' * 10) timer = Timer() timer.tic() searcher = PathPrioritySearch(cfg, './nas_test') searcher.generate_fair_test( ) # load cache results and generate new model for test searcher.search(model, output_folders, dataset_names, distributed) searcher.save_topk() total_time = timer.toc() total_time_str = get_time_str(total_time) if get_rank() == 0: print('Finish NAS testing, total time:{}'.format(total_time_str)) return else: print('Skipping NAS testing...')
def setUp(self): self.handle = amp.init(enabled=True, patch_type=torch.bfloat16) common_init(self)
from ..logger import info, warn from ..metrics import eval_psnr_and_ssim from ..utils import print_network, tensor2im from .generators import ProSR from bisect import bisect_left from collections import OrderedDict import apex from apex import amp import os import torch import numpy as np amp_handle = amp.init(enabled=True) class SimultaneousMultiscaleTrainer(object): """multiscale training without curriculum scheduling""" def __init__(self, opt, training_dataset, save_dir="data/checkpoints", resume_from=None): super(SimultaneousMultiscaleTrainer, self).__init__() self.opt = opt self.save_dir = save_dir self.training_dataset = training_dataset self.start_epoch = 0 self.progress = self.start_epoch / opt.train.epochs self.blend = 1 # training variables
def main(): args = parse_args_train(sys.argv[1:]) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Load a conf file if args.resume: conf = load_config( os.path.join(os.path.dirname(args.resume), 'conf.yml')) for k, v in conf.items(): if k != 'resume': setattr(args, k, v) # for multi-GPUs if args.n_gpus > 1: batch_size = args.batch_size * args.n_gpus accum_grad_n_steps = max(1, args.accum_grad_n_steps // args.n_gpus) else: batch_size = args.batch_size accum_grad_n_steps = args.accum_grad_n_steps # Load dataset train_set = Dataset(corpus=args.corpus, tsv_path=args.train_set, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=batch_size, n_epochs=args.n_epochs, min_n_tokens=args.min_n_tokens, bptt=args.bptt, shuffle=args.shuffle, backward=args.backward, serialize=args.serialize) dev_set = Dataset(corpus=args.corpus, tsv_path=args.dev_set, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=batch_size, bptt=args.bptt, backward=args.backward, serialize=args.serialize) eval_sets = [ Dataset(corpus=args.corpus, tsv_path=s, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=1, bptt=args.bptt, backward=args.backward, serialize=args.serialize) for s in args.eval_sets ] args.vocab = train_set.vocab # Set save path if args.resume: args.save_path = os.path.dirname(args.resume) dir_name = os.path.basename(args.save_path) else: dir_name = set_lm_name(args) args.save_path = mkdir_join( args.model_save_dir, '_'.join(os.path.basename(args.train_set).split('.')[:-1]), dir_name) args.save_path = set_save_path(args.save_path) # avoid overwriting # Set logger set_logger(os.path.join(args.save_path, 'train.log'), stdout=args.stdout) # Model setting model = build_lm(args, args.save_path) if not args.resume: # Save nlsyms, dictionary, and wp_model if args.nlsyms: shutil.copy(args.nlsyms, os.path.join(args.save_path, 'nlsyms.txt')) shutil.copy(args.dict, os.path.join(args.save_path, 'dict.txt')) if args.unit == 'wp': shutil.copy(args.wp_model, os.path.join(args.save_path, 'wp.model')) for k, v in sorted(args.items(), key=lambda x: x[0]): logger.info('%s: %s' % (k, str(v))) # Count total parameters for n in sorted(list(model.num_params_dict.keys())): n_params = model.num_params_dict[n] logger.info("%s %d" % (n, n_params)) logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000)) logger.info('torch version: %s' % str(torch.__version__)) logger.info(model) # Set optimizer resume_epoch = int(args.resume.split('-')[-1]) if args.resume else 0 optimizer = set_optimizer( model, 'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer, args.lr, args.weight_decay) # Wrap optimizer by learning rate scheduler is_transformer = args.lm_type in ['transformer', 'transformer_xl'] scheduler = LRScheduler( optimizer, args.lr, decay_type=args.lr_decay_type, decay_start_epoch=args.lr_decay_start_epoch, decay_rate=args.lr_decay_rate, decay_patient_n_epochs=args.lr_decay_patient_n_epochs, early_stop_patient_n_epochs=args.early_stop_patient_n_epochs, warmup_start_lr=args.warmup_start_lr, warmup_n_steps=args.warmup_n_steps, model_size=args.get('transformer_d_model', 0), factor=args.lr_factor, noam=args.optimizer == 'noam', save_checkpoints_topk=10 if is_transformer else 1) if args.resume: # Restore the last saved model load_checkpoint(args.resume, model, scheduler) # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch if resume_epoch == args.convert_to_sgd_epoch: scheduler.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) # GPU setting args.use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"] amp, scaler = None, None if args.n_gpus >= 1: model.cudnn_setting( deterministic=not (is_transformer or args.cudnn_benchmark), benchmark=not is_transformer and args.cudnn_benchmark) # Mixed precision training setting if args.use_apex: if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): scaler = torch.cuda.amp.GradScaler() else: from apex import amp model, scheduler.optimizer = amp.initialize( model, scheduler.optimizer, opt_level=args.train_dtype) amp.init() if args.resume: load_checkpoint(args.resume, amp=amp) model.cuda() model = CustomDataParallel(model, device_ids=list(range(0, args.n_gpus))) else: model = CPUWrapperLM(model) # Set process name logger.info('PID: %s' % os.getpid()) logger.info('USERNAME: %s' % os.uname()[1]) logger.info('#GPU: %d' % torch.cuda.device_count()) setproctitle(args.job_name if args.job_name else dir_name) # Set reporter reporter = Reporter(args, model) if args.resume: n_steps = scheduler.n_steps * accum_grad_n_steps reporter.resume(n_steps, resume_epoch) # Save conf file as a yaml file if not args.resume: save_config(args, os.path.join(args.save_path, 'conf.yml')) # NOTE: save after reporter for wandb ID hidden = None start_time_train = time.time() for ep in range(resume_epoch, args.n_epochs): for ys_train, is_new_epoch in train_set: hidden = train(model, train_set, dev_set, scheduler, reporter, logger, args, accum_grad_n_steps, amp, scaler, hidden) # Save checkpoint and validate model per epoch if reporter.n_epochs + 1 < args.eval_start_epoch: scheduler.epoch() # lr decay reporter.epoch() # plot # Save model scheduler.save_checkpoint(model, args.save_path, remove_old=not is_transformer and args.remove_old_checkpoints, amp=amp) else: start_time_eval = time.time() # dev model.module.reset_length(args.bptt) ppl_dev, _ = eval_ppl([model.module], dev_set, batch_size=1, bptt=args.bptt) model.module.reset_length(args.bptt) scheduler.epoch(ppl_dev) # lr decay reporter.epoch(ppl_dev, name='perplexity') # plot reporter.add_scalar('dev/perplexity', ppl_dev) logger.info('PPL (%s, ep:%d): %.2f' % (dev_set.set, reporter.n_epochs, ppl_dev)) if scheduler.is_topk or is_transformer: # Save model scheduler.save_checkpoint(model, args.save_path, remove_old=not is_transformer and args.remove_old_checkpoints, amp=amp) # test ppl_test_avg = 0. for eval_set in eval_sets: model.module.reset_length(args.bptt) ppl_test, _ = eval_ppl([model.module], eval_set, batch_size=1, bptt=args.bptt) model.module.reset_length(args.bptt) logger.info('PPL (%s, ep:%d): %.2f' % (eval_set.set, reporter.n_epochs, ppl_test)) ppl_test_avg += ppl_test if len(eval_sets) > 0: logger.info( 'PPL (avg., ep:%d): %.2f' % (reporter.n_epochs, ppl_test_avg / len(eval_sets))) logger.info('Evaluation time: %.2f min' % ((time.time() - start_time_eval) / 60)) # Early stopping if scheduler.is_early_stop: break # Convert to fine-tuning stage if reporter.n_epochs == args.convert_to_sgd_epoch: scheduler.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) if reporter.n_epochs >= args.n_epochs: break logger.info('Total time: %.2f hour' % ((time.time() - start_time_train) / 3600)) reporter.close() return args.save_path
def main(): parser = argparse.ArgumentParser() parser.add_argument("--generation_dataset", default='openi', type=str, help=["mimic-cxr, openi"]) parser.add_argument("--vqa_rad", default="all", type=str, choices=["all", "chest", "head", "abd"]) parser.add_argument("--data_set", default="train", type=str, help="train | valid") parser.add_argument('--img_hidden_sz', type=int, default=2048, help="Whether to use amp for fp16") parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased." ) parser.add_argument( "--mlm_task", type=str, default=True, help="The model will train only mlm task!! | True | False") parser.add_argument("--train_batch_size", default=2, type=int, help="Total batch size for training.") parser.add_argument("--num_train_epochs", default=5, type=int, help="Total number of training epochs to perform.") parser.add_argument( '--from_scratch', action='store_true', default=False, help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument("--img_encoding", type=str, default='fully_use_cnn', choices=['random_sample', 'fully_use_cnn']) parser.add_argument( '--len_vis_input', type=int, default=256, help="The length of visual token input" ) #visual token의 fixed length를 100이라 하면, <Unknown> token 100개가 되고, 100개의 word 생성 가능. parser.add_argument('--max_len_b', type=int, default=253, help="Truncate_config: maximum length of segment B.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=10, help="Max tokens of prediction.") parser.add_argument( '--s2s_prob', default=1, type=float, help= "Percentage of examples that are bi-uni-directional LM (seq2seq). This must be turned off!!!!!!! because this is not for seq2seq model!!!" ) parser.add_argument( '--bi_prob', default=0, type=float, help="Percentage of examples that are bidirectional LM.") parser.add_argument('--hidden_size', type=int, default=768) parser.add_argument('--bar', default=False, type=str, help="True or False") parser.add_argument("--config_path", default='./pretrained_model/non_cross/config.json', type=str, help="Bert config file path.") parser.add_argument( "--model_recover_path", default='./pretrained_model/non_cross/pytorch_model.bin', type=str, help="The file of fine-tuned pretraining model.") # model load parser.add_argument( "--output_dir", default='./output_model/base_noncross_mimic_2', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_file", default="training.log", type=str, help="The output directory where the log will be written.") parser.add_argument('--img_postion', default=True, help="It will produce img_position.") parser.add_argument( "--do_train", action='store_true', default=True, help="Whether to run training. This should ALWAYS be set to True.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) ############################################################################################################ parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--global_rank", type=int, default=-1, help="global_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', default=False, help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', default=False, help= "Whether to use 32-bit float precision instead of 32-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', default=False, help="Whether to use amp for fp16") parser.add_argument('--new_segment_ids', default=False, action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument( '--trunc_seg', default='b', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument("--num_workers", default=20, type=int, help="Number of workers for the data loader.") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument( '--image_root', type=str, default='/home/mimic-cxr/dataset/image_preprocessing/re_512_3ch/Train') parser.add_argument('--split', type=str, nargs='+', default=['train', 'valid']) parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist_url', default='file://[PT_OUTPUT_DIR]/nonexistent_file', type=str, help='url used to set up distributed training') parser.add_argument('--sche_mode', default='warmup_linear', type=str, help="warmup_linear | warmup_constant | warmup_cosine") parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--use_num_imgs', default=-1, type=int) parser.add_argument('--max_drop_worst_ratio', default=0, type=float) parser.add_argument('--drop_after', default=6, type=int) parser.add_argument('--tasks', default='report_generation', help='report_generation | vqa') parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") args = parser.parse_args() print('global_rank: {}, local rank: {}'.format(args.global_rank, args.local_rank)) args.max_seq_length = args.max_len_b + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] args.dist_url = args.dist_url.replace('[PT_OUTPUT_DIR]', args.output_dir) if args.tasks == 'vqa': wandb.init(config=args, project="VQA") wandb.config["more"] = "custom" args.src_file = '/home/mimic-cxr/dataset/data_RAD' args.file_valid_jpgs = '/home/mimic-cxr/dataset/vqa_rad_original_set.json' else: if args.generation_dataset == 'mimic-cxr': wandb.init(config=args, project="report_generation") wandb.config["more"] = "custom" args.src_file = '/home/mimic-cxr/new_dset/Train_253.jsonl' args.file_valid_jpgs = '/home/mimic-cxr/new_dset/Train_253.jsonl' else: wandb.init(config=args, project="report_generation") wandb.config["more"] = "custom" args.src_file = '/home/mimic-cxr/dataset/open_i/Train_openi.jsonl' args.file_valid_jpgs = '/home/mimic-cxr/dataset/open_i/Valid_openi.jsonl' print(" # PID :", os.getpid()) os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) logging.basicConfig( filename=os.path.join(args.output_dir, args.log_file), filemode='w', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") print("device", device) n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) print("device", device) n_gpu = 1 torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url, world_size=args.world_size, rank=args.global_rank) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) if args.do_train: print("args.mask_prob", args.mask_prob) print("args.train_batch_size", args.train_batch_size) bi_uni_pipeline = [ data_loader.Preprocess4Seq2seq( args, args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, args.bar, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mode="s2s", len_vis_input=args.len_vis_input, local_rank=args.local_rank, load_vqa_set=(args.tasks == 'vqa')) ] bi_uni_pipeline.append( data_loader.Preprocess4Seq2seq( args, args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, args.bar, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mode="bi", len_vis_input=args.len_vis_input, local_rank=args.local_rank, load_vqa_set=(args.tasks == 'vqa'))) train_dataset = data_loader.Img2txtDataset( args, args.data_set, args.src_file, args.image_root, args.split, args.train_batch_size, tokenizer, args.max_seq_length, file_valid_jpgs=args.file_valid_jpgs, bi_uni_pipeline=bi_uni_pipeline, use_num_imgs=args.use_num_imgs, s2s_prob=args.s2s_prob, # this must be set to 1. bi_prob=args.bi_prob, tasks=args.tasks) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) t_total = int( len(train_dataloader) * args.num_train_epochs * 1. / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 relax_projection = 4 if args.relax_projection else 0 task_idx_proj = 3 if args.tasks == 'report_generation' else 0 mask_word_id, eos_word_ids, pad_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[PAD]"]) # index in BERT vocab: 103, 102, 0 # BERT model will be loaded! from scratch if (args.model_recover_path is None): _state_dict = {} if args.from_scratch else None _state_dict = {} model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, args=args, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, len_vis_input=args.len_vis_input, tasks=args.tasks) print("scratch model's statedict : ") for param_tensor in model.state_dict(): print(param_tensor, "\t", model.state_dict()[param_tensor].size()) global_step = 0 print("The model will train from scratch") else: print("Task :", args.tasks, args.s2s_prob) print("Recoverd model :", args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(model_recover_path) for key in list(model_recover.keys()): model_recover[key.replace('enc.', '').replace( 'mlm.', 'cls.')] = model_recover.pop(key) global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, args=args, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, len_vis_input=args.len_vis_input, tasks=args.tasks) model.load_state_dict(model_recover, strict=False) print("The pretrained model loaded and fine-tuning.") del model_recover torch.cuda.empty_cache() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = DataParallelImbalance(model) wandb.watch(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, schedule=args.sche_mode, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load( os.path.join(args.output_dir, "optim.{0}.bin".format(recover_step))) if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") model.train() print("Total Parameters:", sum([p.nelement() for p in model.parameters()])) if recover_step: start_epoch = recover_step + 1 print("start_epoch", start_epoch) else: start_epoch = 1 for i_epoch in trange(start_epoch, args.num_train_epochs + 1, desc="Epoch"): if args.local_rank >= 0: train_sampler.set_epoch(i_epoch - 1) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') nbatches = len(train_dataloader) train_loss = [] avg_loss = 0.0 batch_count = 0 for step, batch in enumerate(iter_bar): batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, task_idx, img, vis_pe, ans_labels, ans_type, organ = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() loss_tuple = model(img, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, ans_labels, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, drop_worst_ratio=args.max_drop_worst_ratio if i_epoch > args.drop_after else 0, ans_type=ans_type) masked_lm_loss, vqa_loss = loss_tuple batch_count += 1 if args.tasks == 'report_generation': masked_lm_loss = masked_lm_loss.mean() loss = masked_lm_loss else: vqa_loss = vqa_loss.mean() loss = vqa_loss iter_bar.set_description('Iter (loss=%5.3f)' % (loss.item())) train_loss.append(loss.item()) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 wandb.log({"train_loss": np.mean(train_loss)}) logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_config_file = os.path.join(args.output_dir, 'config.json') with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) output_model_file = os.path.join(args.output_dir, "model.{0}.bin".format(i_epoch)) output_optim_file = os.path.join(args.output_dir, "optim.{0}.bin".format(i_epoch)) if args.global_rank in ( -1, 0): # save model if the first device or no dist torch.save( copy.deepcopy(model_to_save).cpu().state_dict(), output_model_file) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.world_size > 1: torch.distributed.barrier()
def __init__( self, model: torch.nn.Module, criterion: torch.nn.Module, optimizer: torch.optim.Optimizer, device: torch.device, save_root: str, train_dataset: torch.utils.data.Dataset, valid_dataset: Optional[torch.utils.data.Dataset] = None, valid_metrics: Optional[Dict] = None, preview_batch: Optional[torch.Tensor] = None, preview_tile_shape: Optional[Tuple[int, ...]] = None, preview_overlap_shape: Optional[Tuple[int, ...]] = None, preview_interval: int = 5, exp_name: Optional[str] = None, example_input: Optional[torch.Tensor] = None, enable_save_trace: bool = False, batchsize: int = 1, num_workers: int = 0, schedulers: Optional[Dict[Any, Any]] = None, overlay_alpha: float = 0.2, enable_videos: bool = False, enable_tensorboard: bool = True, tensorboard_root_path: Optional[str] = None, apply_softmax_for_prediction: bool = True, ignore_errors: bool = False, ipython_shell: bool = False, num_classes: Optional[int] = None, sample_plotting_handler: Optional[Callable] = None, preview_plotting_handler: Optional[Callable] = None, mixed_precision: bool = False, ): if preview_batch is not None and\ (preview_tile_shape is None or preview_overlap_shape is None): raise ValueError( 'If preview_batch is set, you will also need to specify ' 'preview_tile_shape and preview_overlap_shape!') if num_workers > 1 and 'PatchCreator' in str(type(train_dataset)) \ and not getattr(train_dataset, 'in_memory'): logger.warning( 'Training with num_workers > 1 can cause instabilities if ' 'you are using PatchCreator without in_memory=True.\nBe advised that PatchCreator ' 'might randomly deliver broken batches in your training and ' 'can crash it at any point of time because HDF5 is not fork-safe.\n' 'Please set num_workers to 1 or 0 or consider enabling ' 'in_memory=True for PatchCreator.\n') self.ignore_errors = ignore_errors self.ipython_shell = ipython_shell self.device = device try: model.to(device) except RuntimeError as exc: if isinstance(model, torch.jit.ScriptModule): # "RuntimeError: to is not supported on TracedModules" # But .cuda() works for some reason. Using this messy # workaround in the hope that we can drop it soon. # TODO: Remove this when ScriptModule.to() is supported # See https://github.com/pytorch/pytorch/issues/7354 if 'cuda' in str(self.device): # (Ignoring device number!) model.cuda() else: raise exc self.model = model self.criterion = criterion.to(device) self.optimizer = optimizer self.train_dataset = train_dataset self.valid_dataset = valid_dataset self.valid_metrics = valid_metrics self.preview_batch = preview_batch self.preview_tile_shape = preview_tile_shape self.preview_overlap_shape = preview_overlap_shape self.preview_interval = preview_interval self.overlay_alpha = overlay_alpha self.save_root = os.path.expanduser(save_root) self.example_input = example_input self.enable_save_trace = enable_save_trace self.batchsize = batchsize self.num_workers = num_workers self.apply_softmax_for_prediction = apply_softmax_for_prediction self.sample_plotting_handler = sample_plotting_handler self.preview_plotting_handler = preview_plotting_handler self.mixed_precision = mixed_precision self._tracker = HistoryTracker() self._timer = Timer() self._first_plot = True self._shell_info = dedent(""" Entering IPython training shell. To continue, hit Ctrl-D twice. To terminate, set self.terminate = True and then hit Ctrl-D twice. """).strip() if self.mixed_precision: from apex import amp self.amp_handle = amp.init() if exp_name is None: # Auto-generate a name based on model name and ISO timestamp timestamp = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') exp_name = model.__class__.__name__ + '__' + timestamp self.exp_name = exp_name self.save_path = os.path.join(save_root, exp_name) if os.path.isdir(self.save_path): raise RuntimeError( f'{self.save_path} already exists.\nPlease choose a ' 'different combination of save_root and exp_name.') os.makedirs(self.save_path) _change_log_file_to(f'{self.save_path}/core.log') logger.info(f'Writing files to save_path {self.save_path}/\n') self.terminate = False self.step = 0 self.epoch = 0 if schedulers is None: schedulers = {'lr': StepLR(optimizer, 1000, 1)} # No-op scheduler self.schedulers = schedulers self.__lr_closetozero_alreadytriggered = False # Used in periodic scheduler handling self._lr_nhood = deque( maxlen=3 ) # Keeps track of the last, current and next learning rate self.num_classes = num_classes if enable_videos: try: import moviepy except: logger.warning( 'moviepy is not installed. Disabling video logs.') enable_videos = False self.enable_videos = enable_videos self.tb = None # Tensorboard handler if enable_tensorboard: if self.sample_plotting_handler is None: self.sample_plotting_handler = handlers._tb_log_sample_images if self.preview_plotting_handler is None: self.preview_plotting_handler = handlers._tb_log_preview if tensorboard_root_path is None: tb_path = self.save_path else: tensorboard_root_path = os.path.expanduser( tensorboard_root_path) tb_path = os.path.join(tensorboard_root_path, self.exp_name) os.makedirs(tb_path, exist_ok=True) self.tb = tensorboardX.SummaryWriter(logdir=tb_path, flush_secs=20) self.train_loader = DataLoader( self.train_dataset, batch_size=self.batchsize, shuffle=True, num_workers=self.num_workers, pin_memory=True, timeout=60 if self.num_workers > 0 else 0, worker_init_fn=_worker_init_fn) # num_workers is set to 0 for valid_loader because validation background processes sometimes # fail silently and stop responding, bringing down the whole training process. # This issue might be related to https://github.com/pytorch/pytorch/issues/1355. # The performance impact of disabling multiprocessing here is low in normal settings, # because the validation loader doesn't perform expensive augmentations, but just reads # data from hdf5s. if valid_dataset is not None: self.valid_loader = DataLoader(self.valid_dataset, self.batchsize, shuffle=True, num_workers=0, pin_memory=True, worker_init_fn=_worker_init_fn) self.best_val_loss = np.inf # Best recorded validation loss self.valid_metrics = {} if valid_metrics is None else valid_metrics
def setUp(self): self.handle = amp.init(enabled=True) common_init(self)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") # decoding parameters parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument("--input_file", type=str, help="Input file") parser.add_argument('--subset', type=int, default=0, help="Decode a subset of the input dataset.") parser.add_argument("--output_file", type=str, help="output file") parser.add_argument("--split", type=str, default="", help="Data split (train/val/test).") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument('--topk', type=int, default=10, help="Value of K.") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Ignore the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=None, type=int) parser.add_argument('--need_score_traces', action='store_true') parser.add_argument('--ngram_size', type=int, default=3) parser.add_argument('--mode', default="s2s", choices=["s2s", "l2r", "both"]) parser.add_argument('--max_tgt_length', type=int, default=128, help="maximum length of target sequence") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") parser.add_argument('--not_predict_token', type=str, default=None, help="Do not predict the tokens during decoding.") args = parser.parse_args() if args.need_score_traces and args.beam_size <= 1: raise ValueError( "Score trace is only available for beam search with beam size > 1." ) if args.max_tgt_length >= args.max_seq_length - 2: raise ValueError("Maximum tgt length exceeds max seq length - 2.") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # tokenizer = BertTokenizer.from_pretrained( # args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer( vocab_file= '/ps2/intern/clsi/BERT/bert_weights/cased_L-24_H-1024_A-16/vocab.txt', do_lower_case=args.do_lower_case) tokenizer.max_len = args.max_seq_length pair_num_relation = 0 bi_uni_pipeline = [] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seqDecoder( list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, max_tgt_length=args.max_tgt_length, new_segment_ids=args.new_segment_ids, mode="s2s", num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"]) def _get_token_id_set(s): r = None if s: w_list = [] for w in s.split('|'): if w.startswith('[') and w.endswith(']'): w_list.append(w.upper()) else: w_list.append(w) r = set(tokenizer.convert_tokens_to_ids(w_list)) return r forbid_ignore_set = _get_token_id_set(args.forbid_ignore_word) not_predict_set = _get_token_id_set(args.not_predict_token) print(args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=pair_num_relation, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, sos_id=sos_word_id, forbid_duplicate_ngrams=args.forbid_duplicate_ngrams, forbid_ignore_set=forbid_ignore_set, not_predict_set=not_predict_set, ngram_size=args.ngram_size, min_len=args.min_len, mode=args.mode, max_position_embeddings=args.max_seq_length, ffn_type=args.ffn_type, num_qkv=args.num_qkv, seg_emb=args.seg_emb, pos_shift=args.pos_shift, topk=args.topk, config_path=args.config_path) del model_recover if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) torch.cuda.empty_cache() model.eval() next_i = 0 max_src_length = args.max_seq_length - 2 - args.max_tgt_length ## for YFG style json # testset = loads_json(args.input_file, 'Load Test Set: '+args.input_file) # if args.subset > 0: # logger.info("Decoding subset: %d", args.subset) # testset = testset[:args.subset] with open(args.input_file, encoding="utf-8") as fin: data = json.load(fin) # input_lines = [x.strip() for x in fin.readlines()] # if args.subset > 0: # logger.info("Decoding subset: %d", args.subset) # input_lines = input_lines[:args.subset] # data_tokenizer = WhitespaceTokenizer() if args.tokenized_input else tokenizer # input_lines = [data_tokenizer.tokenize( # x)[:max_src_length] for x in input_lines] # input_lines = sorted(list(enumerate(input_lines)), # key=lambda x: -len(x[1])) # output_lines = [""] * len(input_lines) # score_trace_list = [None] * len(input_lines) # total_batch = math.ceil(len(input_lines) / args.batch_size) data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer PQA_dict = {} #will store the generated distractors dis_tot = 0 dis_n = 0 len_tot = 0 hypothesis = {} ##change to process one by one and store the distractors in PQA json form ##with tqdm(total=total_batch) as pbar: # for example in tqdm(testset): # question_id = str(example['id']['file_id']) + '_' + str(example['id']['question_id']) # if question_id in hypothesis: # continue # dis_n += 1 # if dis_n % 2000 == 0: # logger.info("Already processed: "+str(dis_n)) counter = 0 for race_id, example in tqdm(data.items()): counter += 1 if args.subset > 0 and counter >= args.subset: break eg_dict = {} # eg_dict["question_id"] = question_id # eg_dict["question"] = ' '.join(example['question']) # eg_dict["context"] = ' '.join(example['article']) eg_dict["question"] = example['question'] eg_dict["context"] = example['context'] label = int(example["label"]) options = example["options"] answer = options[label] #new_distractors = [] pred1 = None pred2 = None pred3 = None #while next_i < len(input_lines): #_chunk = input_lines[next_i:next_i + args.batch_size] #line = example["context"].strip() + ' ' + example["question"].strip() question = example['question'] question = question.replace('_', ' ') line = ' '.join( nltk.word_tokenize(example['context']) + nltk.word_tokenize(question)) line = [data_tokenizer.tokenize(line)[:max_src_length]] # buf_id = [x[0] for x in _chunk] # buf = [x[1] for x in _chunk] buf = line #next_i += args.batch_size max_a_len = max([len(x) for x in buf]) instances = [] for instance in [(x, max_a_len) for x in buf]: for proc in bi_uni_pipeline: instances.append(proc(instance)) with torch.no_grad(): batch = seq2seq_loader.batch_list_to_batch_tensors(instances) batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, token_type_ids, position_ids, input_mask, mask_qkv, task_idx = batch # for i in range(1): #try max 10 times # if len(new_distractors) >= 3: # break traces = model(input_ids, token_type_ids, position_ids, input_mask, task_idx=task_idx, mask_qkv=mask_qkv) if args.beam_size > 1: traces = {k: v.tolist() for k, v in traces.items()} output_ids = traces['pred_seq'] # print (np.array(output_ids).shape) # print (output_ids) else: output_ids = traces.tolist() # now only supports single batch decoding!!! # will keep the second and third sequence as backup for i in range(len(buf)): # print (len(buf), buf) for s in range(len(output_ids)): output_seq = output_ids[s] #w_ids = output_ids[i] #output_buf = tokenizer.convert_ids_to_tokens(w_ids) output_buf = tokenizer.convert_ids_to_tokens( output_seq) output_tokens = [] for t in output_buf: if t in ("[SEP]", "[PAD]"): break output_tokens.append(t) if s == 1: backup_1 = output_tokens if s == 2: backup_2 = output_tokens if pred1 is None: pred1 = output_tokens elif jaccard_similarity(pred1, output_tokens) < 0.5: if pred2 is None: pred2 = output_tokens elif pred3 is None: if jaccard_similarity(pred2, output_tokens) < 0.5: pred3 = output_tokens if pred1 is not None and pred2 is not None and pred3 is not None: break if pred2 is None: pred2 = backup_1 if pred3 is None: pred3 = backup_2 elif pred3 is None: pred3 = backup_1 # output_sequence = ' '.join(detokenize(output_tokens)) # print (output_sequence) # print (output_sequence) # if output_sequence.lower().strip() == answer.lower().strip(): # continue # repeated = False # for cand in new_distractors: # if output_sequence.lower().strip() == cand.lower().strip(): # repeated = True # break # if not repeated: # new_distractors.append(output_sequence.strip()) #hypothesis[question_id] = [pred1, pred2, pred3] new_distractors = [pred1, pred2, pred3] # print (new_distractors) # dis_tot += len(new_distractors) # # fill the missing ones with original distractors # for i in range(4): # if len(new_distractors) >= 3: # break # elif i == label: # continue # else: # new_distractors.append(options[i]) for dis in new_distractors: len_tot += len(dis) dis_n += 1 new_distractors = [ ' '.join(detokenize(dis)) for dis in new_distractors if dis is not None ] assert len(new_distractors) == 3, "Number of distractors WRONG" new_distractors.insert(label, answer) #eg_dict["generated_distractors"] = new_distractors eg_dict["options"] = new_distractors eg_dict["label"] = label #PQA_dict[question_id] = eg_dict PQA_dict[race_id] = eg_dict # reference = {} # for example in testset: # question_id = str(example['id']['file_id']) + '_' + str(example['id']['question_id']) # if question_id not in reference.keys(): # reference[question_id] = [example['distractor']] # else: # reference[question_id].append(example['distractor']) # _ = eval(hypothesis, reference) # assert len(PQA_dict) == len(data), "Number of examples WRONG" # logger.info("Average number of GENERATED distractor per question: "+str(dis_tot/dis_n)) logger.info("Average length of distractors: " + str(len_tot / dis_n)) with open(args.output_file, mode='w', encoding='utf-8') as f: json.dump(PQA_dict, f, indent=4)
def main(): parser = argparse.ArgumentParser() # General parser.add_argument( "--bert_model", default="bert-base-cased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default='tmp', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_file", default="training.log", type=str, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--do_train", action='store_true', help="Whether to run training. This should ALWAYS be set to True.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=30, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--global_rank", type=int, default=-1, help="global_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 32-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--len_vis_input', type=int, default=100, help="The length of visual token input") parser.add_argument('--max_len_b', type=int, default=20, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='b', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=3, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=4, type=int, help="Number of workers for the data loader.") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") # Others for VLP parser.add_argument( "--src_file", default=['/mnt/dat/COCO/annotations/dataset_coco.json'], type=str, nargs='+', help="The input data file name.") parser.add_argument('--enable_visdom', action='store_true') parser.add_argument('--visdom_port', type=int, default=8888) # parser.add_argument('--resnet_model', type=str, default='imagenet_weights/resnet101.pth') parser.add_argument('--image_root', type=str, default='/mnt/dat/COCO/images') parser.add_argument('--dataset', default='coco', type=str, help='coco | flickr30k | cc') parser.add_argument('--split', type=str, nargs='+', default=['train', 'restval']) parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist_url', default='file://[PT_OUTPUT_DIR]/nonexistent_file', type=str, help='url used to set up distributed training') parser.add_argument( '--file_valid_jpgs', default='/mnt/dat/COCO/annotations/coco_valid_jpgs.json', type=str) parser.add_argument('--sche_mode', default='warmup_linear', type=str, help="warmup_linear | warmup_constant | warmup_cosine") parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--use_num_imgs', default=-1, type=int) parser.add_argument('--vis_mask_prob', default=0, type=float) parser.add_argument('--max_drop_worst_ratio', default=0, type=float) parser.add_argument('--drop_after', default=6, type=int) parser.add_argument( '--s2s_prob', default=1, type=float, help="Percentage of examples that are bi-uni-directional LM (seq2seq)." ) parser.add_argument( '--bi_prob', default=0, type=float, help="Percentage of examples that are bidirectional LM.") parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument( '--region_bbox_file', default= 'coco_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str) parser.add_argument( '--region_det_file_prefix', default= 'feat_cls_1000/coco_detection_vg_100dets_gvd_checkpoint_trainval', type=str) parser.add_argument('--tasks', default='img2txt', help='img2txt | vqa2') parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--scst', action='store_true', help='Self-critical sequence training') args = parser.parse_args() print('global_rank: {}, local rank: {}'.format(args.global_rank, args.local_rank)) args.max_seq_length = args.max_len_b + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] args.mask_image_regions = (args.vis_mask_prob > 0 ) # whether to mask out image regions args.dist_url = args.dist_url.replace('[PT_OUTPUT_DIR]', args.output_dir) # arguments inspection assert (args.tasks in ('img2txt', 'vqa2')) assert args.enable_butd == True, 'only support region attn! featmap attn deprecated' assert ( not args.scst) or args.dataset == 'coco', 'scst support on coco only!' if args.scst: assert args.dataset == 'coco', 'scst support on coco only!' assert args.max_pred == 0 and args.mask_prob == 0, 'no mask for scst!' rl_crit = RewardCriterion() if args.enable_butd: assert (args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join( args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' # output config os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) logging.basicConfig( filename=os.path.join(args.output_dir, args.log_file), filemode='w', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group( backend='nccl', init_method='tcp://localhost:10001', #args.dist_url, world_size=args.world_size, rank=args.global_rank) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # plotting loss, optional if args.enable_visdom: import visdom vis = visdom.Visdom(port=args.visdom_port, env=args.output_dir) vis_window = {'iter': None, 'score': None} tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank)) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.do_train: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="s2s", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2')) ] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="bi", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2'))) train_dataset = seq2seq_loader.Img2txtDataset( args.src_file, args.image_root, args.split, args.train_batch_size, data_tokenizer, args.max_seq_length, file_valid_jpgs=args.file_valid_jpgs, bi_uni_pipeline=bi_uni_pipeline, use_num_imgs=args.use_num_imgs, s2s_prob=args.s2s_prob, bi_prob=args.bi_prob, enable_butd=args.enable_butd, tasks=args.tasks) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) t_total = int( len(train_dataloader) * args.num_train_epochs * 1. / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 relax_projection = 4 if args.relax_projection else 0 task_idx_proj = 3 if args.tasks == 'img2txt' else 0 mask_word_id, eos_word_ids, pad_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[PAD]"]) # index in BERT vocab: 103, 102, 0 if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init assert args.scst == False, 'must init from maximum likelihood training' _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load( os.path.join(args.output_dir, "model.{0}.bin".format(recover_step))) # recover_step == number of epochs global_step = math.floor(recover_step * t_total * 1. / args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path) global_step = 0 if not args.scst: model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) else: model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, max_position_embeddings=args.max_position_embeddings, config_path=args.config_path, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=task_idx_proj, mask_word_id=mask_word_id, search_beam_size=1, eos_id=eos_word_ids, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input) del model_recover torch.cuda.empty_cache() # deprecated # from vlp.resnet import resnet # cnn = resnet(args.resnet_model, _num_layers=101, _fixed_block=4, pretrained=True) # no finetuning if args.fp16: model.half() # cnn.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) # cnn.to(device) if args.local_rank != -1: try: # from apex.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # cnn = DDP(cnn) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # cnn = DataParallelImbalance(cnn) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, schedule=args.sche_mode, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load( os.path.join(args.output_dir, "optim.{0}.bin".format(recover_step))) if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) logger.info(" Loader length = %d", len(train_dataloader)) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, args.num_train_epochs + 1, desc="Epoch"): if args.local_rank >= 0: train_sampler.set_epoch(i_epoch - 1) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') nbatches = len(train_dataloader) train_loss = [] pretext_loss = [] vqa2_loss = [] scst_reward = [] for step, batch in enumerate(iter_bar): batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, img, vis_masked_pos, vis_pe, ans_labels = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data else: conv_feats, _ = cnn(img.data) # Bx2048x7x7 conv_feats = conv_feats.view(conv_feats.size(0), conv_feats.size(1), -1).permute(0, 2, 1).contiguous() if not args.scst: loss_tuple = model( conv_feats, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, ans_labels, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, vis_masked_pos=vis_masked_pos, mask_image_regions=args.mask_image_regions, drop_worst_ratio=args.max_drop_worst_ratio if i_epoch > args.drop_after else 0) mean_reward = loss_tuple[0].new(1).fill_(0) else: # scst training model.eval() position_ids = torch.arange( input_ids.size(1), dtype=input_ids.dtype, device=input_ids.device).unsqueeze(0).expand_as( input_ids) input_dummy = input_ids[:, :args.len_vis_input + 2] # +2 for [CLS] and [SEP] greedy_res = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) gen_result = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) with torch.no_grad(): greedy_res_raw, _ = model(conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='greedy') for b in range(greedy_res_raw.size(0)): for idx in range(greedy_res_raw.size(1)): if greedy_res_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: greedy_res[b][idx] = greedy_res_raw[b][idx] else: if greedy_res_raw[b][idx] == eos_word_ids: greedy_res[b][idx] = eos_word_ids break model.train() gen_result_raw, sample_logprobs = model( conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='sample') for b in range(gen_result_raw.size(0)): for idx in range(gen_result_raw.size(1)): if gen_result_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: gen_result[b][idx] = gen_result_raw[b][idx] else: if gen_result_raw[b][idx] == eos_word_ids: gen_result[b][idx] = eos_word_ids break gt_ids = input_ids[:, args.len_vis_input + 2:] reward = get_self_critical_reward(greedy_res, gt_ids, gen_result, gt_ids.size(0)) reward = torch.from_numpy(reward).float().to( gen_result.device) mean_reward = reward.mean() loss = rl_crit(sample_logprobs, gen_result.data, reward) loss_tuple = [ loss, loss.new(1).fill_(0.), loss.new(1).fill_(0.) ] # disable pretext_loss_deprecated for now masked_lm_loss, pretext_loss_deprecated, ans_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. For dist, this is done through gradient addition. masked_lm_loss = masked_lm_loss.mean() pretext_loss_deprecated = pretext_loss_deprecated.mean() ans_loss = ans_loss.mean() loss = masked_lm_loss + pretext_loss_deprecated + ans_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) train_loss.append(loss.item()) pretext_loss.append(pretext_loss_deprecated.item()) vqa2_loss.append(ans_loss.item()) scst_reward.append(mean_reward.item()) if step % 100 == 0: logger.info( "Epoch {}, Iter {}, Loss {:.2f}, Pretext {:.2f}, VQA2 {:.2f}, Mean R {:.3f}\n" .format(i_epoch, step, np.mean(train_loss), np.mean(pretext_loss), np.mean(vqa2_loss), np.mean(scst_reward))) if args.enable_visdom: if vis_window['iter'] is None: vis_window['iter'] = vis.line( X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total'])) else: vis.line(X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total']), win=vis_window['iter'], update='append') # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "model.{0}.bin".format(i_epoch)) output_optim_file = os.path.join(args.output_dir, "optim.{0}.bin".format(i_epoch)) if args.global_rank in ( -1, 0): # save model if the first device or no dist torch.save( copy.deepcopy(model_to_save).cpu().state_dict(), output_model_file) # torch.save(optimizer.state_dict(), output_optim_file) # disable for now, need to sanitize state and ship everthing back to cpu logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.world_size > 1: torch.distributed.barrier()
def train(args): if args.amp: amp_handle = amp.init(enabled=args.fp16) args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.N_gpu = torch.distributed.get_world_size() else: args.N_gpu = 1 dboxes = dboxes300_coco() encoder = Encoder(dboxes) cocoGt = get_coco_ground_truth(args) ssd300 = model(args) args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32) iteration = 0 loss_func = Loss(dboxes) loss_func.cuda() optimizer = torch.optim.SGD( tencent_trick(ssd300), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR( optimizer=optimizer, milestones=args.multistep, gamma=0.1) if args.fp16: if args.amp: optimizer = amp_handle.wrap_optimizer(optimizer) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.) val_dataloader, inv_map = get_val_dataloader(args) train_loader = get_train_loader(args, dboxes) acc = 0 logger = Logger(args.batch_size, args.local_rank) for epoch in range(0, args.epochs): logger.start_epoch() scheduler.step() iteration = train_loop( ssd300, loss_func, epoch, optimizer, train_loader, iteration, logger, args) logger.end_epoch() if epoch in args.evaluation: acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args) if args.local_rank == 0: print('Epoch {:2d}, Accuracy: {:4f} mAP'.format(epoch, acc)) if args.data_pipeline == 'dali': train_loader.reset() return acc, logger.average_speed()
def test_pytorch_model_default(self): from delira.models import AbstractPyTorchNetwork, UNet2dPyTorch, \ UNet3dPyTorch, ClassificationNetworkBasePyTorch, \ VGG3DClassificationNetworkPyTorch, \ GenerativeAdversarialNetworkBasePyTorch from delira.training.train_utils import create_optims_default_pytorch, \ create_optims_gan_default_pytorch from delira.utils.context_managers import DefaultOptimWrapperTorch import torch test_cases = [ # UNet 2D ( UNet2dPyTorch(5, in_channels=1), # model (1, 32, 32), # input shape (32, 32), # output shape { "loss_fn": torch.nn.CrossEntropyLoss() }, # loss function create_optims_default_pytorch, # optim_fn 4, # output range (num_classes -1) True # half precision ), # UNet 3D ( UNet3dPyTorch(5, in_channels=1), # model (1, 32, 32, 32), # input shape (32, 32, 32), # output shape { "loss_fn": torch.nn.CrossEntropyLoss() }, # loss function create_optims_default_pytorch, # optim_fn 4, # output range (num_classes) - 1 True # half precision ), # Base Classifier (Resnet 18) ( ClassificationNetworkBasePyTorch(1, 10), # model (1, 224, 224), # input shape 9, # output shape (num_classes - 1) { "loss_fn": torch.nn.CrossEntropyLoss() }, # loss function create_optims_default_pytorch, # optim_fn None, # no max_range needed True # half precision ), # 3D VGG ( VGG3DClassificationNetworkPyTorch(1, 10), # model (1, 32, 224, 224), # input shape 9, # output shape (num_classes - 1) { "loss_fn": torch.nn.CrossEntropyLoss() }, # loss function create_optims_default_pytorch, # optim fn None, # no max_range needed True # half precision ), # DCGAN ( GenerativeAdversarialNetworkBasePyTorch(1, 100), # model (1, 64, 64), # input shape (1, 1), # arbitrary shape (not needed) { "loss_fn": torch.nn.MSELoss() }, # loss create_optims_gan_default_pytorch, # optimizer function 1, # standard max range True # half precision ) ] for case in test_cases: with self.subTest(case=case): model, input_shape, output_shape, loss_fn, optim_fn, \ max_range, half_precision = case try: from apex import amp amp_handle = amp.init(half_precision) wrapper_fn = amp_handle.wrap_optimizer except ImportError: wrapper_fn = DefaultOptimWrapperTorch start_time = time.time() # test backward if optimizer fn is not None if optim_fn is not None: optim = { k: wrapper_fn(v, num_loss=len(loss_fn)) for k, v in optim_fn(model, torch.optim.Adam).items() } else: optim = {} closure = model.closure device = torch.device("cpu") model = model.to(device) prepare_batch = model.prepare_batch # classification label: target_shape specifies max label if isinstance(output_shape, int): label = np.asarray( [np.random.randint(output_shape) for i in range(10)]) else: label = np.random.rand(10, *output_shape) * max_range data_dict = { "data": np.random.rand(10, *input_shape), "label": label } try: data_dict = prepare_batch(data_dict, device, device) closure(model, data_dict, optim, loss_fn, {}) except Exception as e: assert False, "Test for %s not passed: Error: %s" \ % (model.__class__.__name__, e) end_time = time.time() print("Time needed for %s: %.3f" % (model.__class__.__name__, end_time - start_time)) del device del optim del closure del prepare_batch del model try: del amp_handle except NameError: pass gc.collect()
type=int, help='number of distributed processes') parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training') parser.add_argument('--dist-backend', default='gloo', type=str, help='distributed backend') best_prec1 = 0 args = parser.parse_args() # initialize Amp amp_handle = amp.init(enabled=args.fp16) class SyntheticDataset(torch.utils.data.dataset.Dataset): def __init__(self, input_size, length, num_classes=1000): self.tensor = Variable(torch.rand(*input_size)).type(torch.FloatTensor) self.target = torch.Tensor(1).random_(0, num_classes)[0].type( torch.LongTensor) self.length = length def __getitem__(self, index): return self.tensor, self.target def __len__(self): return self.length
def main(): args = parse_args_train(sys.argv[1:]) # Load a conf file if args.resume: conf = load_config( os.path.join(os.path.dirname(args.resume), 'conf.yml')) for k, v in conf.items(): if k != 'resume': setattr(args, k, v) # for multi-GPUs if args.n_gpus > 1: batch_size = args.batch_size * args.n_gpus accum_grad_n_steps = max(1, args.accum_grad_n_steps // args.n_gpus) else: batch_size = args.batch_size accum_grad_n_steps = args.accum_grad_n_steps # Load dataset train_set = Dataset(corpus=args.corpus, tsv_path=args.train_set, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=batch_size, n_epochs=args.n_epochs, min_n_tokens=args.min_n_tokens, bptt=args.bptt, shuffle=args.shuffle, backward=args.backward, serialize=args.serialize) dev_set = Dataset(corpus=args.corpus, tsv_path=args.dev_set, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=batch_size, bptt=args.bptt, backward=args.backward, serialize=args.serialize) eval_sets = [ Dataset(corpus=args.corpus, tsv_path=s, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=1, bptt=args.bptt, backward=args.backward, serialize=args.serialize) for s in args.eval_sets ] args.vocab = train_set.vocab # Set save path if args.resume: save_path = os.path.dirname(args.resume) dir_name = os.path.basename(save_path) else: dir_name = set_lm_name(args) save_path = mkdir_join( args.model_save_dir, '_'.join(os.path.basename(args.train_set).split('.')[:-1]), dir_name) save_path = set_save_path(save_path) # avoid overwriting # Set logger set_logger(os.path.join(save_path, 'train.log'), stdout=args.stdout) # Model setting model = build_lm(args, save_path) if not args.resume: # Save conf file as a yaml file save_config(args, os.path.join(save_path, 'conf.yml')) # Save nlsyms, dictionary, and wp_model if args.nlsyms: shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt')) shutil.copy(args.dict, os.path.join(save_path, 'dict.txt')) if args.unit == 'wp': shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model')) for k, v in sorted(args.items(), key=lambda x: x[0]): logger.info('%s: %s' % (k, str(v))) # Count total parameters for n in sorted(list(model.num_params_dict.keys())): n_params = model.num_params_dict[n] logger.info("%s %d" % (n, n_params)) logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000)) logger.info('torch version: %s' % str(torch.__version__)) logger.info(model) # Set optimizer if args.resume: resume_epoch = int(args.resume.split('-')[-1]) optimizer = set_optimizer( model, 'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer, args.lr, args.weight_decay) else: resume_epoch = 0 optimizer = set_optimizer(model, args.optimizer, args.lr, args.weight_decay) # Wrap optimizer by learning rate scheduler is_transformer = args.lm_type in ['transformer', 'transformer_xl'] scheduler = LRScheduler( optimizer, args.lr, decay_type=args.lr_decay_type, decay_start_epoch=args.lr_decay_start_epoch, decay_rate=args.lr_decay_rate, decay_patient_n_epochs=args.lr_decay_patient_n_epochs, early_stop_patient_n_epochs=args.early_stop_patient_n_epochs, warmup_start_lr=args.warmup_start_lr, warmup_n_steps=args.warmup_n_steps, model_size=args.get('transformer_d_model', 0), factor=args.lr_factor, noam=args.optimizer == 'noam', save_checkpoints_topk=10 if is_transformer else 1) if args.resume: # Restore the last saved model load_checkpoint(args.resume, model, scheduler) # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch if resume_epoch == args.convert_to_sgd_epoch: scheduler.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) # GPU setting use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"] amp = None if args.n_gpus >= 1: model.cudnn_setting( deterministic=not (is_transformer or args.cudnn_benchmark), benchmark=not is_transformer and args.cudnn_benchmark) model.cuda() # Mixed precision training setting if use_apex: if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): scaler = torch.cuda.amp.GradScaler() else: scaler = None from apex import amp model, scheduler.optimizer = amp.initialize( model, scheduler.optimizer, opt_level=args.train_dtype) amp.init() if args.resume: load_checkpoint(args.resume, amp=amp) model = CustomDataParallel(model, device_ids=list(range(0, args.n_gpus))) else: model = CPUWrapperLM(model) # Set process name logger.info('PID: %s' % os.getpid()) logger.info('USERNAME: %s' % os.uname()[1]) logger.info('#GPU: %d' % torch.cuda.device_count()) setproctitle(args.job_name if args.job_name else dir_name) # Set reporter reporter = Reporter(save_path) hidden = None start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() accum_n_steps = 0 n_steps = scheduler.n_steps * accum_grad_n_steps for ep in range(resume_epoch, args.n_epochs): pbar_epoch = tqdm(total=len(train_set)) for ys_train, is_new_epoch in train_set: # Compute loss in the training set accum_n_steps += 1 if accum_n_steps == 1: loss_train = 0 # moving average over gradient accumulation if use_apex and scaler is not None: with torch.cuda.amp.autocast(): loss, hidden, observation = model(ys_train, state=hidden) else: loss, hidden, observation = model(ys_train, state=hidden) loss = loss / accum_grad_n_steps reporter.add(observation) if use_apex: if scaler is not None: scaler.scale(loss).backward() else: with amp.scale_loss(loss, scheduler.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() loss.detach() # Truncate the graph if accum_n_steps >= accum_grad_n_steps or is_new_epoch: if args.clip_grad_norm > 0: total_norm = torch.nn.utils.clip_grad_norm_( model.module.parameters(), args.clip_grad_norm) reporter.add_tensorboard_scalar('total_norm', total_norm) if use_apex and scaler is not None: scaler.step(scheduler.optimizer) scaler.update() scheduler.step(skip_optimizer=True) # update lr only else: scheduler.step() scheduler.zero_grad() accum_n_steps = 0 # NOTE: parameters are forcibly updated at the end of every epoch loss_train += loss.item() del loss hidden = model.module.repackage_state(hidden) pbar_epoch.update(ys_train.shape[0] * (ys_train.shape[1] - 1)) reporter.add_tensorboard_scalar('learning_rate', scheduler.lr) # NOTE: loss/acc/ppl are already added in the model reporter.step() n_steps += 1 # NOTE: n_steps is different from the step counter in Noam Optimizer if n_steps % args.print_step == 0: # Compute loss in the dev set ys_dev = iter(dev_set).next(bptt=args.bptt)[0] loss, _, observation = model(ys_dev, state=None, is_eval=True) reporter.add(observation, is_eval=True) loss_dev = loss.item() del loss reporter.step(is_eval=True) duration_step = time.time() - start_time_step logger.info( "step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)" % (n_steps, scheduler.n_epochs + train_set.epoch_detail, loss_train, loss_dev, scheduler.lr, ys_train.shape[0], duration_step / 60)) start_time_step = time.time() # Save figures of loss and accuracy if n_steps % (args.print_step * 10) == 0: reporter.snapshot() model.module.plot_attention() if is_new_epoch: break # Save checkpoint and evaluate model per epoch duration_epoch = time.time() - start_time_epoch logger.info('========== EPOCH:%d (%.2f min) ==========' % (scheduler.n_epochs + 1, duration_epoch / 60)) if scheduler.n_epochs + 1 < args.eval_start_epoch: scheduler.epoch() # lr decay reporter.epoch() # plot # Save model scheduler.save_checkpoint(model, save_path, remove_old=not is_transformer and args.remove_old_checkpoints, amp=amp) else: start_time_eval = time.time() # dev model.module.reset_length(args.bptt) ppl_dev, _ = eval_ppl([model.module], dev_set, batch_size=1, bptt=args.bptt) model.module.reset_length(args.bptt) scheduler.epoch(ppl_dev) # lr decay reporter.epoch(ppl_dev, name='perplexity') # plot logger.info('PPL (%s, ep:%d): %.2f' % (dev_set.set, scheduler.n_epochs, ppl_dev)) if scheduler.is_topk or is_transformer: # Save model scheduler.save_checkpoint(model, save_path, remove_old=not is_transformer and args.remove_old_checkpoints, amp=amp) # test ppl_test_avg = 0. for eval_set in eval_sets: model.module.reset_length(args.bptt) ppl_test, _ = eval_ppl([model.module], eval_set, batch_size=1, bptt=args.bptt) model.module.reset_length(args.bptt) logger.info('PPL (%s, ep:%d): %.2f' % (eval_set.set, scheduler.n_epochs, ppl_test)) ppl_test_avg += ppl_test if len(eval_sets) > 0: logger.info( 'PPL (avg., ep:%d): %.2f' % (scheduler.n_epochs, ppl_test_avg / len(eval_sets))) duration_eval = time.time() - start_time_eval logger.info('Evaluation time: %.2f min' % (duration_eval / 60)) # Early stopping if scheduler.is_early_stop: break # Convert to fine-tuning stage if scheduler.n_epochs == args.convert_to_sgd_epoch: scheduler.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) if scheduler.n_epochs >= args.n_epochs: break start_time_step = time.time() start_time_epoch = time.time() duration_train = time.time() - start_time_train logger.info('Total time: %.2f hour' % (duration_train / 3600)) reporter.tf_writer.close() pbar_epoch.close() return save_path
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None) parser.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER) args = parser.parse_args() # GPU 数量, 根据 GPU 数量判断是否使用分布式计算 num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 # 分布式计算 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() # 用实际的 config_file 覆盖默认的 config # NeuralNetwork/config/default 是默认的 config, 训练前被实际的 config 覆盖, 使用修改后的 config 训练 # 实际的 config 在 configs 下 cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() # 冻结所有的配置项, 防止修改 # 保存在 output_dir 下的 log.txt 文件 save_dir = "" logger = setup_logger("NeuralNetwork", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) # 加载模型 model = build_detection_model(cfg) # 使用cuda或者cpu device = torch.device(cfg.MODEL.DEVICE) model.to(device) # 将模型存入GPU或者cpu # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) # 输出文件夹 output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) # 添加 bounding box 和 mask segmentation iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) # 根据数据集的数量定义输出文件夹 output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST # 创建输出文件夹 if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder # 加载测试数据集 data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) # 对数据集中的数据按批次调用 inference 函数 for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def train(self, args): # Reset amp if args.use_apex: from apex import amp amp.init(False) # Get dataloaders train_dataloader = ds_utils.get_dataloader(args, 'train') if not args.skip_test: test_dataloader = ds_utils.get_dataloader(args, 'test') model = runner = self.runner if args.use_half: runner.half() # Initialize optimizers, schedulers and apex opts = runner.get_optimizers(args) # Load pre-trained params for optimizers and schedulers (if needed) if args.which_epoch != 'none' and not args.init_experiment_dir: for net_name, opt in opts.items(): opt.load_state_dict(torch.load(self.checkpoints_dir / f'{args.which_epoch}_opt_{net_name}.pth', map_location='cpu')) if args.use_apex and args.num_gpus > 0 and args.num_gpus <= 8: # Enfornce apex mixed precision settings nets_list, opts_list = [], [] for net_name in sorted(opts.keys()): nets_list.append(runner.nets[net_name]) opts_list.append(opts[net_name]) loss_scale = float(args.amp_loss_scale) if args.amp_loss_scale != 'dynamic' else args.amp_loss_scale nets_list, opts_list = amp.initialize(nets_list, opts_list, opt_level=args.amp_opt_level, num_losses=1, loss_scale=loss_scale) # Unpack opts_list into optimizers for net_name, net, opt in zip(sorted(opts.keys()), nets_list, opts_list): runner.nets[net_name] = net opts[net_name] = opt if args.which_epoch != 'none' and not args.init_experiment_dir and os.path.exists(self.checkpoints_dir / f'{args.which_epoch}_amp.pth'): amp.load_state_dict(torch.load(self.checkpoints_dir / f'{args.which_epoch}_amp.pth', map_location='cpu')) # Initialize apex distributed data parallel wrapper if args.num_gpus > 1 and args.num_gpus <= 8: from apex import parallel model = parallel.DistributedDataParallel(runner, delay_allreduce=True) epoch_start = 1 if args.which_epoch == 'none' else int(args.which_epoch) + 1 # Initialize logging train_iter = epoch_start - 1 if args.visual_freq != -1: train_iter /= args.visual_freq logger = Logger(args, self.experiment_dir) logger.set_num_iter( train_iter=train_iter, test_iter=(epoch_start - 1) // args.test_freq) if args.debug and not args.use_apex: torch.autograd.set_detect_anomaly(True) total_iters = 1 for epoch in range(epoch_start, args.num_epochs + 1): if args.rank == 0: print('epoch %d' % epoch) # Train for one epoch model.train() time_start = time.time() # Shuffle the dataset before the epoch train_dataloader.dataset.shuffle() for i, data_dict in enumerate(train_dataloader, 1): # Prepare input data if args.num_gpus > 0 and args.num_gpus > 0: for key, value in data_dict.items(): data_dict[key] = value.cuda() # Convert inputs to FP16 if args.use_half: for key, value in data_dict.items(): data_dict[key] = value.half() output_logs = i == len(train_dataloader) if args.visual_freq != -1: output_logs = not (total_iters % args.visual_freq) output_visuals = output_logs and not args.no_disk_write_ops # Accumulate list of optimizers that will perform opt step for opt in opts.values(): opt.zero_grad() # Perform a forward pass if not args.use_closure: loss = model(data_dict) closure = None if args.use_apex and args.num_gpus > 0 and args.num_gpus <= 8: # Mixed precision requires a special wrapper for the loss with amp.scale_loss(loss, opts.values()) as scaled_loss: scaled_loss.backward() elif not args.use_closure: loss.backward() else: def closure(): loss = model(data_dict) loss.backward() return loss # Perform steps for all optimizers for opt in opts.values(): opt.step(closure) if output_logs: logger.output_logs('train', runner.output_visuals(), runner.output_losses(), time.time() - time_start) if args.debug: break if args.visual_freq != -1: total_iters += 1 total_iters %= args.visual_freq # Increment the epoch counter in the training dataset train_dataloader.dataset.epoch += 1 # If testing is not required -- continue if epoch % args.test_freq: continue # If skip test flag is set -- only check if a checkpoint if required if not args.skip_test: # Calculate "standing" stats for the batch normalization if args.calc_stats: runner.calculate_batchnorm_stats(train_dataloader, args.debug) # Test time_start = time.time() model.eval() for data_dict in test_dataloader: # Prepare input data if args.num_gpus > 0: for key, value in data_dict.items(): data_dict[key] = value.cuda() # Forward pass with torch.no_grad(): model(data_dict) if args.debug: break # Output logs logger.output_logs('test', runner.output_visuals(), runner.output_losses(), time.time() - time_start) # If creation of checkpoint is not required -- continue if epoch % args.checkpoint_freq and not args.debug: continue # Create or load a checkpoint if args.rank == 0 and not args.no_disk_write_ops: with torch.no_grad(): for net_name in runner.nets_names_to_train: # Save a network torch.save(runner.nets[net_name].state_dict(), self.checkpoints_dir / f'{epoch}_{net_name}.pth') # Save an optimizer torch.save(opts[net_name].state_dict(), self.checkpoints_dir / f'{epoch}_opt_{net_name}.pth') # Save amp if args.use_apex: torch.save(amp.state_dict(), self.checkpoints_dir / f'{epoch}_amp.pth') return runner
def test_bce_is_float_with_allow_banned(self): self.handle._deactivate() self.handle = amp.init(enabled=True, allow_banned=True) assertion = lambda fn, x: self.assertEqual(fn(x).type(), FLOAT) self.bce_common(assertion)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def _setup(self, network, optim_fn, optimizer_cls, optimizer_params, lr_scheduler_cls, lr_scheduler_params, gpu_ids, mixed_precision, mixed_precision_kwargs): """ Defines the Trainers Setup Parameters ---------- network : :class:`AbstractPyTorchNetwork` the network to train optim_fn : function creates a dictionary containing all necessary optimizers optimizer_cls : subclass of torch.optim.Optimizer optimizer class implementing the optimization algorithm of choice optimizer_params : dict lr_scheduler_cls : Any learning rate schedule class: must implement step() method lr_scheduler_params : dict keyword arguments passed to lr scheduler during construction gpu_ids : list list containing ids of GPUs to use; if empty: use cpu instead mixed_precision : bool whether to use mixed precision or not (False per default) mixed_precision_kwargs : dict additional keyword arguments for mixed precision """ try: from apex import amp self._amp_handle = amp.init(mixed_precision, *mixed_precision_kwargs) wrap_fn = self._amp_handle.wrap_optimizer except ImportError: if mixed_precision: logger.warning( "Apex was not found found, trying to continue \ in full precision instead") from ..utils.context_managers import DefaultOptimWrapperTorch wrap_fn = DefaultOptimWrapperTorch # wrap optimizers by half_precision_optimizer via apex if necessary self.optimizers = { k: wrap_fn(v, num_loss=len(self.criterions)) for k, v in optim_fn(network, optimizer_cls, ** optimizer_params).items() } # schedulers if lr_scheduler_cls is not None: for key, optim in self.optimizers.items(): if not issubclass(lr_scheduler_cls, AbstractCallback): logger.warning("lr_scheduler_cls is not a callback.") # access actual optimizer by calling wrapped optimizer from wrapper self.register_callback( lr_scheduler_cls(optim._optimizer, **lr_scheduler_params)) # asssign closure and prepare batch from network self.closure_fn = network.closure self._prepare_batch = network.prepare_batch if gpu_ids and torch.cuda.is_available(): self.use_gpu = True if (len(gpu_ids) > 1) and (torch.cuda.device_count() > 1): # use GPU 0 as default input GPU self.input_device = torch.device("cuda:%d" % gpu_ids[0]) # Train on multiple GPUs and use GPU 0 as output device self.module = torch.nn.DataParallel( network.to(self.input_device), device_ids=gpu_ids, output_device=gpu_ids[1]) # use GPU 1 as default output GPU for balanced GPU usage self.output_device = torch.device("cuda:%d" % gpu_ids[1]) else: # use the only available GPU as input device self.input_device = torch.device("cuda:%d" % gpu_ids[0]) self.module = network.to(self.input_device) # use GPU 0 as output device as output device self.output_device = torch.device("cuda:%d" % gpu_ids[0]) else: self.use_gpu = False self.input_device = torch.device("cpu") self.output_device = torch.device("cpu") self.module = network.to(self.input_device)
def main(): args = parse_args_train(sys.argv[1:]) args_init = copy.deepcopy(args) args_teacher = copy.deepcopy(args) # Load a conf file if args.resume: conf = load_config(os.path.join(os.path.dirname(args.resume), 'conf.yml')) for k, v in conf.items(): if k != 'resume': setattr(args, k, v) recog_params = vars(args) args = compute_susampling_factor(args) # Load dataset batch_size = args.batch_size * args.n_gpus if args.n_gpus >= 1 else args.batch_size train_set = Dataset(corpus=args.corpus, tsv_path=args.train_set, tsv_path_sub1=args.train_set_sub1, tsv_path_sub2=args.train_set_sub2, dict_path=args.dict, dict_path_sub1=args.dict_sub1, dict_path_sub2=args.dict_sub2, nlsyms=args.nlsyms, unit=args.unit, unit_sub1=args.unit_sub1, unit_sub2=args.unit_sub2, wp_model=args.wp_model, wp_model_sub1=args.wp_model_sub1, wp_model_sub2=args.wp_model_sub2, batch_size=batch_size, n_epochs=args.n_epochs, min_n_frames=args.min_n_frames, max_n_frames=args.max_n_frames, shuffle_bucket=args.shuffle_bucket, sort_by='input', short2long=args.sort_short2long, sort_stop_epoch=args.sort_stop_epoch, dynamic_batching=args.dynamic_batching, ctc=args.ctc_weight > 0, ctc_sub1=args.ctc_weight_sub1 > 0, ctc_sub2=args.ctc_weight_sub2 > 0, subsample_factor=args.subsample_factor, subsample_factor_sub1=args.subsample_factor_sub1, subsample_factor_sub2=args.subsample_factor_sub2, discourse_aware=args.discourse_aware) dev_set = Dataset(corpus=args.corpus, tsv_path=args.dev_set, tsv_path_sub1=args.dev_set_sub1, tsv_path_sub2=args.dev_set_sub2, dict_path=args.dict, dict_path_sub1=args.dict_sub1, dict_path_sub2=args.dict_sub2, nlsyms=args.nlsyms, unit=args.unit, unit_sub1=args.unit_sub1, unit_sub2=args.unit_sub2, wp_model=args.wp_model, wp_model_sub1=args.wp_model_sub1, wp_model_sub2=args.wp_model_sub2, batch_size=batch_size, min_n_frames=args.min_n_frames, max_n_frames=args.max_n_frames, ctc=args.ctc_weight > 0, ctc_sub1=args.ctc_weight_sub1 > 0, ctc_sub2=args.ctc_weight_sub2 > 0, subsample_factor=args.subsample_factor, subsample_factor_sub1=args.subsample_factor_sub1, subsample_factor_sub2=args.subsample_factor_sub2) eval_sets = [Dataset(corpus=args.corpus, tsv_path=s, dict_path=args.dict, nlsyms=args.nlsyms, unit=args.unit, wp_model=args.wp_model, batch_size=1, is_test=True) for s in args.eval_sets] args.vocab = train_set.vocab args.vocab_sub1 = train_set.vocab_sub1 args.vocab_sub2 = train_set.vocab_sub2 args.input_dim = train_set.input_dim # Set save path if args.resume: save_path = os.path.dirname(args.resume) dir_name = os.path.basename(save_path) else: dir_name = set_asr_model_name(args) if args.mbr_training: assert args.asr_init save_path = mkdir_join(os.path.dirname(args.asr_init), dir_name) else: save_path = mkdir_join(args.model_save_dir, '_'.join( os.path.basename(args.train_set).split('.')[:-1]), dir_name) save_path = set_save_path(save_path) # avoid overwriting # Set logger set_logger(os.path.join(save_path, 'train.log'), stdout=args.stdout) # Load a LM conf file for LM fusion & LM initialization if not args.resume and args.external_lm: lm_conf = load_config(os.path.join(os.path.dirname(args.external_lm), 'conf.yml')) args.lm_conf = argparse.Namespace() for k, v in lm_conf.items(): setattr(args.lm_conf, k, v) assert args.unit == args.lm_conf.unit assert args.vocab == args.lm_conf.vocab # Model setting model = Speech2Text(args, save_path, train_set.idx2token[0]) if not args.resume: # Save the conf file as a yaml file save_config(vars(args), os.path.join(save_path, 'conf.yml')) if args.external_lm: save_config(args.lm_conf, os.path.join(save_path, 'conf_lm.yml')) # Save the nlsyms, dictionary, and wp_model if args.nlsyms: shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt')) for sub in ['', '_sub1', '_sub2']: if getattr(args, 'dict' + sub): shutil.copy(getattr(args, 'dict' + sub), os.path.join(save_path, 'dict' + sub + '.txt')) if getattr(args, 'unit' + sub) == 'wp': shutil.copy(getattr(args, 'wp_model' + sub), os.path.join(save_path, 'wp' + sub + '.model')) for k, v in sorted(vars(args).items(), key=lambda x: x[0]): logger.info('%s: %s' % (k, str(v))) # Count total parameters for n in sorted(list(model.num_params_dict.keys())): n_params = model.num_params_dict[n] logger.info("%s %d" % (n, n_params)) logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000)) logger.info(model) # Initialize with pre-trained model's parameters if args.asr_init: # Load the ASR model (full model) assert os.path.isfile(args.asr_init), 'There is no checkpoint.' conf_init = load_config(os.path.join(os.path.dirname(args.asr_init), 'conf.yml')) for k, v in conf_init.items(): setattr(args_init, k, v) model_init = Speech2Text(args_init) load_checkpoint(args.asr_init, model_init) # Overwrite parameters param_dict = dict(model_init.named_parameters()) for n, p in model.named_parameters(): if n in param_dict.keys() and p.size() == param_dict[n].size(): if args.asr_init_enc_only and 'enc' not in n: continue p.data = param_dict[n].data logger.info('Overwrite %s' % n) # Set optimizer resume_epoch = 0 if args.resume: resume_epoch = int(args.resume.split('-')[-1]) optimizer = set_optimizer(model, 'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer, args.lr, args.weight_decay) else: optimizer = set_optimizer(model, args.optimizer, args.lr, args.weight_decay) # Wrap optimizer by learning rate scheduler is_transformer = 'former' in args.enc_type or args.dec_type == 'former' optimizer = LRScheduler(optimizer, args.lr, decay_type=args.lr_decay_type, decay_start_epoch=args.lr_decay_start_epoch, decay_rate=args.lr_decay_rate, decay_patient_n_epochs=args.lr_decay_patient_n_epochs, early_stop_patient_n_epochs=args.early_stop_patient_n_epochs, lower_better=args.metric not in ['accuracy', 'bleu'], warmup_start_lr=args.warmup_start_lr, warmup_n_steps=args.warmup_n_steps, model_size=getattr(args, 'transformer_d_model', 0), factor=args.lr_factor, noam=args.optimizer == 'noam', save_checkpoints_topk=10 if is_transformer else 1) if args.resume: # Restore the last saved model load_checkpoint(args.resume, model, optimizer) # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch if resume_epoch == args.convert_to_sgd_epoch: optimizer.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) # Load the teacher ASR model teacher = None if args.teacher: assert os.path.isfile(args.teacher), 'There is no checkpoint.' conf_teacher = load_config(os.path.join(os.path.dirname(args.teacher), 'conf.yml')) for k, v in conf_teacher.items(): setattr(args_teacher, k, v) # Setting for knowledge distillation args_teacher.ss_prob = 0 args.lsm_prob = 0 teacher = Speech2Text(args_teacher) load_checkpoint(args.teacher, teacher) # Load the teacher LM teacher_lm = None if args.teacher_lm: assert os.path.isfile(args.teacher_lm), 'There is no checkpoint.' conf_lm = load_config(os.path.join(os.path.dirname(args.teacher_lm), 'conf.yml')) args_lm = argparse.Namespace() for k, v in conf_lm.items(): setattr(args_lm, k, v) teacher_lm = build_lm(args_lm) load_checkpoint(args.teacher_lm, teacher_lm) # GPU setting use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"] amp = None if args.n_gpus >= 1: model.cudnn_setting(deterministic=not (is_transformer or args.cudnn_benchmark), benchmark=args.cudnn_benchmark) model.cuda() # Mix precision training setting if use_apex: from apex import amp model, optimizer.optimizer = amp.initialize(model, optimizer.optimizer, opt_level=args.train_dtype) from neural_sp.models.seq2seq.decoders.ctc import CTC amp.register_float_function(CTC, "loss_fn") # NOTE: see https://github.com/espnet/espnet/pull/1779 amp.init() if args.resume: load_checkpoint(args.resume, amp=amp) model = CustomDataParallel(model, device_ids=list(range(0, args.n_gpus))) if teacher is not None: teacher.cuda() if teacher_lm is not None: teacher_lm.cuda() else: model = CPUWrapperASR(model) # Set process name logger.info('PID: %s' % os.getpid()) logger.info('USERNAME: %s' % os.uname()[1]) logger.info('#GPU: %d' % torch.cuda.device_count()) setproctitle(args.job_name if args.job_name else dir_name) # Set reporter reporter = Reporter(save_path) if args.mtl_per_batch: # NOTE: from easier to harder tasks tasks = [] if 1 - args.bwd_weight - args.ctc_weight - args.sub1_weight - args.sub2_weight > 0: tasks += ['ys'] if args.bwd_weight > 0: tasks = ['ys.bwd'] + tasks if args.ctc_weight > 0: tasks = ['ys.ctc'] + tasks if args.mbr_ce_weight > 0: tasks = ['ys.mbr'] + tasks for sub in ['sub1', 'sub2']: if getattr(args, 'train_set_' + sub): if getattr(args, sub + '_weight') - getattr(args, 'ctc_weight_' + sub) > 0: tasks = ['ys_' + sub] + tasks if getattr(args, 'ctc_weight_' + sub) > 0: tasks = ['ys_' + sub + '.ctc'] + tasks else: tasks = ['all'] start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() pbar_epoch = tqdm(total=len(train_set)) accum_n_steps = 0 n_steps = optimizer.n_steps * args.accum_grad_n_steps epoch_detail_prev = 0 session_prev = None while True: # Compute loss in the training set batch_train, is_new_epoch = train_set.next() if args.discourse_aware and batch_train['sessions'][0] != session_prev: model.module.reset_session() session_prev = batch_train['sessions'][0] accum_n_steps += 1 # Change mini-batch depending on task if accum_n_steps == 1: loss_train = 0 # moving average over gradient accumulation for task in tasks: loss, observation = model(batch_train, task, teacher=teacher, teacher_lm=teacher_lm) reporter.add(observation) if use_apex: with amp.scale_loss(loss, optimizer.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() loss.detach() # Trancate the graph loss_train = (loss_train * (accum_n_steps - 1) + loss.item()) / accum_n_steps if accum_n_steps >= args.accum_grad_n_steps or is_new_epoch: if args.clip_grad_norm > 0: total_norm = torch.nn.utils.clip_grad_norm_( model.module.parameters(), args.clip_grad_norm) reporter.add_tensorboard_scalar('total_norm', total_norm) optimizer.step() optimizer.zero_grad() accum_n_steps = 0 # NOTE: parameters are forcibly updated at the end of every epoch del loss pbar_epoch.update(len(batch_train['utt_ids'])) reporter.add_tensorboard_scalar('learning_rate', optimizer.lr) # NOTE: loss/acc/ppl are already added in the model reporter.step() n_steps += 1 # NOTE: n_steps is different from the step counter in Noam Optimizer if n_steps % args.print_step == 0: # Compute loss in the dev set batch_dev = dev_set.next(batch_size=1 if 'transducer' in args.dec_type else None)[0] # Change mini-batch depending on task for task in tasks: loss, observation = model(batch_dev, task, is_eval=True) reporter.add(observation, is_eval=True) loss_dev = loss.item() del loss reporter.step(is_eval=True) duration_step = time.time() - start_time_step if args.input_type == 'speech': xlen = max(len(x) for x in batch_train['xs']) ylen = max(len(y) for y in batch_train['ys']) elif args.input_type == 'text': xlen = max(len(x) for x in batch_train['ys']) ylen = max(len(y) for y in batch_train['ys_sub1']) logger.info("step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.7f/bs:%d/xlen:%d/ylen:%d (%.2f min)" % (n_steps, optimizer.n_epochs + train_set.epoch_detail, loss_train, loss_dev, optimizer.lr, len(batch_train['utt_ids']), xlen, ylen, duration_step / 60)) start_time_step = time.time() # Save fugures of loss and accuracy if n_steps % (args.print_step * 10) == 0: reporter.snapshot() model.module.plot_attention() model.module.plot_ctc() # Ealuate model every 0.1 epoch during MBR training if args.mbr_training: if int(train_set.epoch_detail * 10) != int(epoch_detail_prev * 10): # dev evaluate([model.module], dev_set, recog_params, args, int(train_set.epoch_detail * 10) / 10, logger) # Save the model optimizer.save_checkpoint( model, save_path, remove_old=False, amp=amp, epoch_detail=train_set.epoch_detail) epoch_detail_prev = train_set.epoch_detail # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch logger.info('========== EPOCH:%d (%.2f min) ==========' % (optimizer.n_epochs + 1, duration_epoch / 60)) if optimizer.n_epochs + 1 < args.eval_start_epoch: optimizer.epoch() # lr decay reporter.epoch() # plot # Save the model optimizer.save_checkpoint( model, save_path, remove_old=not is_transformer, amp=amp) else: start_time_eval = time.time() # dev metric_dev = evaluate([model.module], dev_set, recog_params, args, optimizer.n_epochs + 1, logger) optimizer.epoch(metric_dev) # lr decay reporter.epoch(metric_dev, name=args.metric) # plot if optimizer.is_topk or is_transformer: # Save the model optimizer.save_checkpoint( model, save_path, remove_old=not is_transformer, amp=amp) # test if optimizer.is_topk: for eval_set in eval_sets: evaluate([model.module], eval_set, recog_params, args, optimizer.n_epochs, logger) duration_eval = time.time() - start_time_eval logger.info('Evaluation time: %.2f min' % (duration_eval / 60)) # Early stopping if optimizer.is_early_stop: break # Convert to fine-tuning stage if optimizer.n_epochs == args.convert_to_sgd_epoch: optimizer.convert_to_sgd(model, args.lr, args.weight_decay, decay_type='always', decay_rate=0.5) pbar_epoch = tqdm(total=len(train_set)) session_prev = None if optimizer.n_epochs >= args.n_epochs: break # if args.ss_prob > 0: # model.module.scheduled_sampling_trigger() start_time_step = time.time() start_time_epoch = time.time() duration_train = time.time() - start_time_train logger.info('Total time: %.2f hour' % (duration_train / 3600)) reporter.tf_writer.close() pbar_epoch.close() return save_path
def main(): parser = argparse.ArgumentParser() # General parser.add_argument("--bert_model", default="bert-base-cased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") # For decoding parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Forbid the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=None, type=int) parser.add_argument('--ngram_size', type=int, default=3) parser.add_argument('--max_tgt_length', type=int, default=20, help="maximum length of target sequence") # Others for VLP parser.add_argument("--src_file", default='/mnt/dat/COCO/annotations/dataset_coco.json', type=str, help="The input data file name.") parser.add_argument("--ref_file", default='pythia/data/v2_mscoco_val2014_annotations.json', type=str, help="The annotation reference file name.") parser.add_argument('--dataset', default='coco', type=str, help='coco | flickr30k | cc') parser.add_argument('--len_vis_input', type=int, default=100) # parser.add_argument('--resnet_model', type=str, default='imagenet_weights/resnet101.pth') parser.add_argument('--image_root', type=str, default='/mnt/dat/COCO/images') parser.add_argument('--split', type=str, default='val') parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument('--region_bbox_file', default='coco_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str) parser.add_argument('--region_det_file_prefix', default='feat_cls_1000/coco_detection_vg_100dets_gvd_checkpoint_trainval', type=str) parser.add_argument("--output_dir", default='tmp', type=str, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument('--file_valid_jpgs', default='', type=str) args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) if args.enable_butd: assert(args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join(args.image_root, args.region_det_file_prefix) if args.dataset in ('cc', 'coco') and args.region_det_file_prefix != '' else '' device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) args.max_seq_length = args.max_tgt_length + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] tokenizer.max_len = args.max_seq_length bi_uni_pipeline = [] bi_uni_pipeline = [seq2seq_loader.Preprocess4Seq2seq(0, 0, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'max_len_a': args.len_vis_input, 'max_len_b': args.max_tgt_length, 'trunc_seg': 'b', 'always_truncate_tail': True}, mode="bi", len_vis_input=args.len_vis_input, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, load_vqa_ann=True)] amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 logger.info('Attempting to recover models from: {}'.format(args.model_recover_path)) if 0 == len(glob.glob(args.model_recover_path.strip())): logger.error('There are no models to recover. The program will exit.') sys.exit(1) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=0, max_position_embeddings=512, cache_dir=args.output_dir+'/.pretrained_model_{}'.format(-1), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks='vqa2') del model_recover # deprecated # from vlp.resnet import resnet # cnn = resnet(args.resnet_model, _num_layers=101, _fixed_block=4, pretrained=True) # no finetuning if args.fp16: model.half() # cnn.half() model.to(device) # cnn.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # cnn = torch.nn.DataParallel(cnn) torch.cuda.empty_cache() model.eval() # cnn.eval() eval_lst = [] img_dat = np.load(args.src_file, allow_pickle=True) img_idx = 0 for i in range(1, img_dat.shape[0]): if args.enable_butd: src_tk = os.path.join(args.image_root, img_dat[i]['image_name'].split('_')[1], img_dat[i]['feature_path']) else: raise NotImplementedError tgt_tk = tokenizer.tokenize(img_dat[i]['question_str']) eval_lst.append((img_idx, src_tk, tgt_tk, img_dat[i]['question_id'])) img_idx += 1 input_lines = eval_lst next_i = 0 output_lines = [""] * len(input_lines) score_trace_list = [None] * len(input_lines) total_batch = math.ceil(len(input_lines) / args.batch_size) predictions = [] print('start the VQA evaluation...') with tqdm(total=total_batch) as pbar: while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf = [(x[1], x[2]) for x in _chunk] buf_id = [(x[0], x[3]) for x in _chunk] next_i += args.batch_size instances = [] for instance in buf: for proc in bi_uni_pipeline: instances.append(proc(instance[:2]+({'answers': ['dummy']},))) with torch.no_grad(): batch = batch_list_to_batch_tensors( instances) batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, img, vis_masked_pos, vis_pe, _ = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data else: conv_feats, _ = cnn(img.data) # Bx2048x7x7 conv_feats = conv_feats.view(conv_feats.size(0), conv_feats.size(1), -1).permute(0,2,1).contiguous() ans_idx = model(conv_feats, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, None, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, vis_masked_pos=vis_masked_pos, drop_worst_ratio=0, vqa_inference=True) for ind, (eval_idx, ques_id) in enumerate(buf_id): predictions.append({'question_id': ques_id, 'answer': bi_uni_pipeline[0].ans_proc.idx2word(ans_idx[ind])}) pbar.update(1) results_file = os.path.join(args.output_dir, 'vqa2-results-'+args.model_recover_path.split('/')[-2]+'-'+args.split+'-'+args.model_recover_path.split('/')[-1].split('.')[-2]+'.json') json.dump(predictions, open(results_file, 'w')) if args.split == 'test2015': print('*'*80) print('[WARNING] Evaluation unavailable for the test set!\ \n Please submit your saved JSON file named\ \n `{}`\ \n to the VQA 2.0 server:\ \n https://evalai.cloudcv.org/web/challenges/challenge-page/163/submission'.format(results_file)) print('*'*80) else: import subprocess print('Evaluating result file {}'.format(results_file)) subprocess.Popen(['python', 'pythia/pythia/legacy/eval_model/eval_demo.py', args.ref_file, results_file])