def __init__(self, **kwargs):
    super(WordPieceVectorizer1D, self).__init__(kwargs.get('transform_fn'))
    global BERT_TOKENIZER
    self.max_seen = 128
    handle = kwargs.get('embed_file')
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(handle)
    self.tokenizer = BERT_TOKENIZER
    self.mxlen = kwargs.get('mxlen', -1)
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             max_pieces: int = 512) -> None:
    # Pass do_lowercase by keyword: passed positionally it would bind to cache_dir.
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=do_lowercase)
    super().__init__(vocab=bert_tokenizer.vocab,
                     wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces)
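# A minimal sketch (not part of either class above) of what the wrapped BertTokenizer
# does: WordPiece tokenization followed by vocabulary-id lookup. The model name and
# sample text are illustrative assumptions.
from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pieces = tokenizer.tokenize('unaffable')            # sub-word pieces, e.g. ['un', '##aff', '##able']
ids = tokenizer.convert_tokens_to_ids(pieces)       # vocabulary indices fed to BERT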
def __init__(self, name, **kwargs):
    super(BERTBaseEmbeddings, self).__init__(name=name, **kwargs)
    global BERT_TOKENIZER
    self.dsz = kwargs.get('dsz')
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(kwargs.get('embed_file'))
    self.model = BertModel.from_pretrained(kwargs.get('embed_file'))
    self.vocab = BERT_TOKENIZER.vocab
    self.vsz = len(BERT_TOKENIZER.vocab)  # 30522, same as self.model.embeddings.word_embeddings.num_embeddings
    self.layer_indices = kwargs.get('layers', [-1, -2, -3, -4])
    self.operator = kwargs.get('operator', 'concat')
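# Hedged sketch of how the 'layers' / 'operator' options above might be applied:
# pytorch_pretrained_bert's BertModel returns all encoder layers, which can then be
# concatenated or summed. Function and tensor names here are illustrative, not from the source.
import torch

def pool_layers(encoded_layers, layer_indices=(-1, -2, -3, -4), operator='concat'):
    selected = [encoded_layers[i] for i in layer_indices]   # each: [batch, seq_len, hidden]
    if operator == 'concat':
        return torch.cat(selected, dim=-1)                  # [batch, seq_len, hidden * len(layer_indices)]
    return torch.stack(selected, dim=0).sum(dim=0)          # element-wise sum keeps the hidden size

# encoded_layers, _ = model(input_ids, token_type_ids, attention_mask,
#                           output_all_encoded_layers=True)
# features = pool_layers(encoded_layers)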
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             never_lowercase: List[str] = None,
             max_pieces: int = 512) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=do_lowercase)
    super().__init__(vocab=bert_tokenizer.vocab,
                     wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces,
                     do_lowercase=do_lowercase,
                     never_lowercase=never_lowercase,
                     start_tokens=["[CLS]"],
                     end_tokens=["[SEP]"])
def main(): args = process_args() os.makedirs(args.output_dir, exist_ok=True) if args.enable_butd: if args.visdial_v == '1.0': assert (args.len_vis_input == 36) elif args.visdial_v == '0.9': assert (args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join(args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) # for generative, there is no [SEP] at the end args.max_seq_length = args.len_vis_input + 2 + args.max_len_hist_ques + 2 + args.max_len_ans tokenizer.max_len = args.max_seq_length bi_uni_pipeline = [Preprocess4TestVisdialGen(list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'len_vis_input': args.len_vis_input, 'max_len_hist_ques': args.max_len_hist_ques, 'max_len_ans': args.max_len_ans}, mode="s2s", region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, image_features_hdfpath=args.image_features_hdfpath, visdial_v=args.visdial_v, pad_hist=args.pad_hist, inc_full_hist=args.inc_full_hist)] amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 mask_word_id, eos_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]"]) logger.info('Attempting to recover models from: {}'.format(args.model_recover_path)) if 0 == len(glob.glob(args.model_recover_path.strip())): logger.error('There are no models to recover. 
The program will exit.') sys.exit(1) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForVisDialGen.from_pretrained(args.bert_model, max_position_embeddings=512, config_path=args.config_path, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, ngram_size=args.ngram_size, min_len=args.min_len, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tokenizer=tokenizer, decode_verbose=args.decode_verbose, visdial_v=args.visdial_v) del model_recover if args.fp16: model.half() # cnn.half() model.to(device) # cnn.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # cnn = torch.nn.DataParallel(cnn) torch.cuda.empty_cache() model.eval() def read_data(src_file): eval_lst = [] with open(src_file, "r", encoding='utf-8') as f_src: data = json.load(f_src)['data'] dialogs = data['dialogs'] questions = data['questions'] answers = data['answers'] img_idx = 0 for dialog in tqdm(dialogs): if img_idx < args.use_num_imgs or args.use_num_imgs == -1: img_id = dialog['image_id'] cap_tokens = tokenizer.tokenize(dialog['caption']) ques_id = [item['question'] for item in dialog['dialog']] ques_tokens = [tokenizer.tokenize(questions[id] + '?') for id in ques_id] ans_id = [item['answer'] for item in dialog['dialog']] ans_tokens = [tokenizer.tokenize(answers[id]) for id in ans_id] gt_id = [item['gt_index'] for item in dialog['dialog']] ans_opts = [item['answer_options'] for item in dialog['dialog']] ans_opts_tokens = [[tokenizer.tokenize(answers[id]) for id in ans] for ans in ans_opts] assert len(ques_tokens) == len(ans_tokens) == len(ans_opts_tokens) == 10, \ "ques num: %d, ans num: %d, ans opt num: %d" % ( len(ques_tokens), len(ans_tokens), len(ans_opts_tokens)) assert all( [len(ans_opt) == 100 for ans_opt in ans_opts_tokens]), "all the answer have 100 options" eval_lst.append( (img_id, cap_tokens, ques_tokens, ans_tokens, ans_opts_tokens, gt_id)) img_idx += 1 return eval_lst def get_gt_rel_dict(fname): gt_rel_dict = {} gt_rel_data = json.load(open(fname)) for item in gt_rel_data: image_id = item['image_id'] round_id = item['round_id'] gt_relevance = item['gt_relevance'] # each image only at most has one turn having dense annotation if image_id not in gt_rel_dict: gt_rel_dict[image_id] = (round_id, gt_relevance) return gt_rel_dict if args.gt_rel_file != '': gt_rel_dict = get_gt_rel_dict(args.gt_rel_file) input_lines = read_data(args.src_file) next_i = 0 total_batch = math.ceil(len(input_lines) / args.batch_size) print('start the visdial decode evaluation...') t0 = time.time() ranks_json = [] sparse_metrics = SparseGTMetrics() ndcg = NDCG() with tqdm(total=total_batch) as pbar: while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf_id = [x[0] for x in _chunk] buf = [x[:-1] for x in _chunk] buf_gt_id = [x[-1] for x in _chunk] next_i += args.batch_size instances = [] for instance in buf: instances.append(bi_uni_pipeline[0](instance)) with torch.no_grad(): buf_gt_id = torch.tensor(buf_gt_id).long().to(device) batch_data = list(zip(*instances)) img, vis_pe = (torch.stack(x).to(device) for x in batch_data[-2:]) task_idx = torch.tensor(batch_data[-3], dtype=torch.long).to(device) conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data output_scores_turn = [] # 
(input_ids_turn, segment_ids_turn, position_ids_turn, ans_ids_turn, ans_opts_ids_turn, # input_mask_turn, self.task_idx, img, vis_pe) input_ids_turns = [[x[turn_i] for x in batch_data[0]] for turn_i in range(10)] segment_ids_turns = [[x[turn_i] for x in batch_data[1]] for turn_i in range(10)] position_ids_turns = [[x[turn_i] for x in batch_data[2]] for turn_i in range(10)] ans_ids_turns = [[x[turn_i] for x in batch_data[3]] for turn_i in range(10)] ans_opts_ids_turns = [[x[turn_i] for x in batch_data[4]] for turn_i in range(10)] input_mask_turns = [[x[turn_i] for x in batch_data[5]] for turn_i in range(10)] for turn_i in range(10): input_ids = torch.tensor(input_ids_turns[turn_i], dtype=torch.long).to(device) segment_ids = torch.tensor(segment_ids_turns[turn_i], dtype=torch.long).to(device) position_ids = torch.tensor(position_ids_turns[turn_i], dtype=torch.long).to(device) ans_ids = torch.tensor(ans_ids_turns[turn_i], dtype=torch.long).to(device) ans_opts_ids = torch.tensor(ans_opts_ids_turns[turn_i], dtype=torch.long).to(device) input_mask = torch.stack(input_mask_turns[turn_i]).to(device) output_scores = model(conv_feats, vis_pe, input_ids, segment_ids, position_ids, input_mask, ans_ids, ans_opts_ids, task_idx=task_idx) output_scores = output_scores[-1] # [batch_size, num_options] output_scores_turn.append(output_scores) output_scores_turn = torch.stack(output_scores_turn, 1) # [batch_size, num_rounds, num_options] ranks = scores_to_ranks(output_scores_turn) # output_scores_turn_cheat = output_scores_turn.scatter_(2, buf_gt_id.unsqueeze(2), 100.0) sparse_metrics.observe(output_scores_turn, buf_gt_id) for i in range(len(buf_id)): # Cast into types explicitly to ensure no errors in schema. # Round ids are 1-10, not 0-9 if args.split == "val": for j in range(10): ranks_json.append( { "image_id": buf_id[i], "round_id": int(j + 1), "ranks": [rank.item() for rank in ranks[i][j]], } ) if args.gt_rel_file: scores = [] gt_rels = [] for i in range(len(buf_id)): if buf_id[i] in gt_rel_dict: turn_idx, gt_rel = gt_rel_dict[buf_id[i]] scores.append(output_scores_turn[i, turn_idx - 1, :]) gt_rels.append(torch.tensor(gt_rel, dtype=torch.float32).to(device)) scores = torch.stack(scores) gt_rels = torch.stack(gt_rels) ndcg.observe(scores, gt_rels) pbar.update(1) json.dump(ranks_json, open(args.save_ranks_path, "w")) logger.info("Finish writing rankings into %s" % (args.save_ranks_path)) if args.split == "val": fw = open(args.save_ranks_path.replace('.json', '_results.txt'), "w") all_metrics = {} all_metrics.update(sparse_metrics.retrieve(reset=True)) if args.gt_rel_file: all_metrics.update(ndcg.retrieve(reset=True)) for metric_name, metric_value in all_metrics.items(): print(f"{metric_name}: {metric_value}") fw.write("%s: %.6f\n" % (metric_name, metric_value))
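# Hedged sketch of the ranking step used above: `scores_to_ranks` is assumed to turn
# per-option scores into 1-based ranks (rank 1 = highest score), the format the VisDial
# evaluation expects. Illustrative implementation, not copied from the source.
import torch

def scores_to_ranks_sketch(scores):
    # scores: [batch_size, num_rounds, num_options]
    ranked_idx = scores.argsort(dim=-1, descending=True)  # option indices, best first
    return ranked_idx.argsort(dim=-1) + 1                  # 1-based rank of every option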
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--model_recover_path", default=None, type=str,
                        help="The file of fine-tuned pretraining model.")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--ffn_type', default=0, type=int,
                        help="0: default mlp; 1: W((Wx+b) elem_prod x);")
    parser.add_argument('--num_qkv', default=0, type=int,
                        help="Number of different <Q,K,V>.")
    parser.add_argument('--seg_emb', action='store_true',
                        help="Using segment embedding for self-attention.")

    # decoding parameters
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--amp', action='store_true',
                        help="Whether to use amp for fp16")
    parser.add_argument("--input_file", type=str, help="Input file")
    parser.add_argument('--subset', type=int, default=0,
                        help="Decode a subset of the input dataset.")
    parser.add_argument("--output_file", type=str, help="output file")
    parser.add_argument("--split", type=str, default="",
                        help="Data split (train/val/test).")
    parser.add_argument('--tokenized_input', action='store_true',
                        help="Whether the input is tokenized.")
    parser.add_argument('--seed', type=int, default=123,
                        help="random seed for initialization")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--new_segment_ids', action='store_true',
                        help="Use new segment ids for bi-uni-directional LM.")
    parser.add_argument('--new_pos_ids', action='store_true',
                        help="Use new position ids for LMs.")
    parser.add_argument('--batch_size', type=int, default=4,
                        help="Batch size for decoding.")
    parser.add_argument('--beam_size', type=int, default=1,
                        help="Beam size for searching")
    parser.add_argument('--length_penalty', type=float, default=0,
                        help="Length penalty for beam search")
    parser.add_argument('--forbid_duplicate_ngrams', action='store_true')
    parser.add_argument('--forbid_ignore_word', type=str, default=None,
                        help="Forbid the word during forbid_duplicate_ngrams")
    parser.add_argument("--min_len", default=None, type=int)
    parser.add_argument('--need_score_traces', action='store_true')
    parser.add_argument('--ngram_size', type=int, default=3)
    parser.add_argument('--mode', default="s2s",
                        choices=["s2s", "l2r", "both", "2in1", "a2q", "q2a", "c2a", "c2q"])
    parser.add_argument('--max_tgt_length', type=int, default=128,
                        help="maximum length of target sequence")
    parser.add_argument('--s2s_special_token', action='store_true',
                        help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.")
    parser.add_argument('--s2s_add_segment', action='store_true',
                        help="Additional segmental for the encoder of S2S.")
    parser.add_argument('--s2s_share_segment', action='store_true',
                        help="Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment).")
    parser.add_argument('--pos_shift', action='store_true',
                        help="Using position shift for fine-tuning.")

    args = parser.parse_args()

    if args.need_score_traces and args.beam_size <= 1:
        raise ValueError(
            "Score trace is only available for beam search with beam size > 1.")
    if args.max_tgt_length >= args.max_seq_length - 2:
        raise ValueError("Maximum tgt length exceeds max seq length - 2.")

    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer.max_len = args.max_seq_length

    pair_num_relation = 0
    bi_uni_pipeline = []
    bi_uni_pipeline.append(seq2seq_loader.Preprocess4Seq2seqDecoder(
        list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids,
        args.max_seq_length, max_tgt_length=args.max_tgt_length,
        new_segment_ids=args.new_segment_ids, mode=args.mode,
        num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token,
        s2s_add_segment=args.s2s_add_segment,
        s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift))

    amp_handle = None
    if args.fp16 and args.amp:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)
        logger.info("enable fp16 with amp")

    # Prepare model
    cls_num_labels = 2
    type_vocab_size = 6 + \
        (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2
    if args.mode == "a2q":
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[SEP_1]"])
    elif args.mode == "q2a":
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[SEP_2]"])
    elif args.mode == "c2q":
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[SEP_6]"])
    elif args.mode == "c2a":
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[SEP_7]"])
    elif args.mode == "2in1":
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[SEP_0]"])
    else:
        mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids(
            ["[MASK]", "[SEP]", "[S2S_SOS]"])

    forbid_ignore_set = None
    if args.forbid_ignore_word:
        w_list = []
        for w in args.forbid_ignore_word.split('|'):
            if w.startswith('[') and w.endswith(']'):
                w_list.append(w.upper())
            else:
                w_list.append(w)
        forbid_ignore_set = set(tokenizer.convert_tokens_to_ids(w_list))
    print(args.model_recover_path)

    for model_recover_path in glob.glob(args.model_recover_path.strip()):
        logger.info("***** Recover model: %s *****", model_recover_path)
        model_recover = torch.load(model_recover_path)
        model = BertForSeq2SeqDecoder.from_pretrained(
            args.bert_model, state_dict=model_recover, num_labels=cls_num_labels,
            num_rel=pair_num_relation, type_vocab_size=type_vocab_size,
            task_idx=3, mask_word_id=mask_word_id,
            search_beam_size=args.beam_size, length_penalty=args.length_penalty,
            eos_id=eos_word_ids, sos_id=sos_word_id,
            forbid_duplicate_ngrams=args.forbid_duplicate_ngrams,
            forbid_ignore_set=forbid_ignore_set, ngram_size=args.ngram_size,
            min_len=args.min_len, mode=args.mode,
            max_position_embeddings=args.max_seq_length, ffn_type=args.ffn_type,
            num_qkv=args.num_qkv, seg_emb=args.seg_emb, pos_shift=args.pos_shift)
        del model_recover

        if args.fp16:
            model.half()
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        torch.cuda.empty_cache()
        model.eval()
        next_i = 0
        max_src_length = args.max_seq_length - 2 - args.max_tgt_length

        with open(args.input_file, encoding="utf-8") as fin:
            input_lines = [x.strip() for x in fin.readlines()]
            if args.subset > 0:
                logger.info("Decoding subset: %d", args.subset)
                input_lines = input_lines[:args.subset]
        data_tokenizer = WhitespaceTokenizer() if args.tokenized_input else tokenizer
        input_lines = [data_tokenizer.tokenize(
            x)[-max_src_length:] for x in input_lines]
        input_lines = sorted(list(enumerate(input_lines)),
                             key=lambda x: -len(x[1]))
        output_lines = [""] * len(input_lines)
        score_trace_list = [None] * len(input_lines)
        total_batch = math.ceil(len(input_lines) / args.batch_size)

        with tqdm(total=total_batch) as pbar:
            while next_i < len(input_lines):
                _chunk = input_lines[next_i:next_i + args.batch_size]
                buf_id = [x[0] for x in _chunk]
                buf = [x[1] for x in _chunk]
                next_i += args.batch_size
                max_a_len = max([len(x) for x in buf])
                instances = []
                for instance in [(x, max_a_len) for x in buf]:
                    for proc in bi_uni_pipeline:
                        instances.append(proc(instance))
                with torch.no_grad():
                    batch = seq2seq_loader.batch_list_to_batch_tensors(
                        instances)
                    batch = [
                        t.to(device) if t is not None else None for t in batch]
                    input_ids, token_type_ids, position_ids, input_mask, mask_qkv, task_idx = batch
                    traces = model(input_ids, token_type_ids, position_ids,
                                   input_mask, task_idx=task_idx, mask_qkv=mask_qkv)
                    if args.beam_size > 1:
                        traces = {k: v.tolist() for k, v in traces.items()}
                        output_ids = traces['pred_seq']
                    else:
                        output_ids = traces.tolist()
                    for i in range(len(buf)):
                        w_ids = output_ids[i]
                        output_buf = tokenizer.convert_ids_to_tokens(w_ids)
                        output_tokens = []
                        for t in output_buf:
                            # stop at the first end-of-sequence or padding token
                            if t in ("[SEP]", "[PAD]", "[S2S_SEP]"):
                                break
                            output_tokens.append(t)
                        output_sequence = ' '.join(detokenize(output_tokens))
                        print(output_sequence)
                        output_lines[buf_id[i]] = output_sequence
                        if args.need_score_traces:
                            score_trace_list[buf_id[i]] = {
                                'scores': traces['scores'][i],
                                'wids': traces['wids'][i],
                                'ptrs': traces['ptrs'][i]}
                pbar.update(1)

        if args.output_file:
            fn_out = args.output_file
        else:
            fn_out = model_recover_path + '.' + args.split
        with open(fn_out, "w", encoding="utf-8") as fout:
            for l in output_lines:
                fout.write(l)
                fout.write("\n")

        if args.need_score_traces:
            with open(fn_out + ".trace.pickle", "wb") as fout_trace:
                pickle.dump(
                    {"version": 0.0, "num_samples": len(input_lines)}, fout_trace)
                for x in score_trace_list:
                    pickle.dump(x, fout_trace)
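# Hedged sketch of the `detokenize` helper called above: it is assumed to merge
# WordPiece continuation pieces (the "##" prefix) back into whole words before the
# tokens are joined with spaces. Illustrative implementation, not copied from the source.
def detokenize_sketch(tokens):
    words = []
    for tok in tokens:
        if tok.startswith('##') and words:
            words[-1] += tok[2:]      # glue continuation piece onto the previous word
        else:
            words.append(tok)
    return words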
def __init__(self, output_dir, model="best_model.bin", topk=50, bert_model="bert-large-cased",
             do_lower_case=False, train_batch_size=32, seed=42, eval_batch_size=64,
             max_seq_length=128, num_labels=2, entail_label=1, grad_acc_steps=1,
             num_of_epochs=5, learning_rate=1e-5, warmup_proportion=0.1, action="train"):
    print("Loading BERT NLI Model")
    self.name = "BertNLI"
    self.topk = topk
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    self.device = device
    self.tokenizer = tokenizer
    self.max_seq_length = max_seq_length
    self.train_batch_size = int(train_batch_size / grad_acc_steps)
    self.eval_batch_size = eval_batch_size
    self.num_labels = num_labels
    self.entail_label = entail_label
    self.grad_acc_steps = grad_acc_steps
    self.gradient_accumulation_steps = self.grad_acc_steps
    self.num_of_epochs = num_of_epochs
    self.output_dir = output_dir
    self.learning_rate = learning_rate
    self.warmup_proportion = warmup_proportion
    self.n_gpu = n_gpu
    print("Action:", action)
    if action == "train":
        print("Params:")
        print("Device:", device)
        print("Max-Seq-Length:", max_seq_length)
        print("Train-Batch-Size:", train_batch_size)
        print("Num Labels:", num_labels)
        print("Gradient Accumulation Steps:", grad_acc_steps)
        print("Num of Train Epochs:", num_of_epochs)
        print("Output Dir:", output_dir)
        print("Learning Rate:", learning_rate)
        print("Warmup Proportion:", warmup_proportion)
        print("Number of Gpus:", n_gpu)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(seed)
        os.makedirs(self.output_dir, exist_ok=True)
        model = BertForSequenceClassification.from_pretrained(
            bert_model,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1),
            num_labels=num_labels)
        model.to(device)
        model = torch.nn.DataParallel(model)
        self.model = model
        self.best_metric = None
    if action == "predict":
        output_model_file = os.path.join(output_dir, model)
        # Load a trained model that you have fine-tuned
        print("Loading Model", output_model_file)
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassification.from_pretrained(
            bert_model, state_dict=model_state_dict, num_labels=num_labels)
        model.to(device)
        model = torch.nn.DataParallel(model)
        self.model = model
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = 
list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(output_model_file)) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, 
"nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", type=str, required=True, help= "The input data dir. Should contain train.tsv and dev.tsv files for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", type=str, required=True, help="The name of the task to train (codah or swag).") parser.add_argument( "--output_dir", default="../bert-output/", type=str, required=False, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( '--model_labels_save_filename', type=str, default='model_labels.json', help= "JSON file to save model outputs/labels to (relative to output_dir)") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--answer_only", default=False, action='store_true', help="Whether to run with answers only (blank out question).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) parser.add_argument( '--save_final_only', default=False, action='store_true', help= "Whether to only save the final model from training (vs saving at every epoch)" ) args = parser.parse_args() processors = { "codah": CodahProcessor, "swag": SwagProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) if args.answer_only: for exset in train_examples: for ex in exset: ex.text_a = '' num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=1) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare loss criterion criterion = CrossEntropyLoss() # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, 
warmup=args.warmup_proportion, t_total=t_total) global_step = 0 # Enable logging for eval-only use case tr_loss = 0 nb_tr_steps = 1 if args.do_train: # train_features will now be list of lists(of length 4) train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) # all ids are lists of lists(of 4) all_input_ids = torch.tensor([[ff.input_ids for ff in f] for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([[ff.input_mask for ff in f] for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([[ff.segment_ids for ff in f] for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([[ff.label_id for ff in f] for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) # each minibatch that dataloader generates is a set of 4 answers for 1 question, each processed individually train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): # reshape and reduce the second dimension # pull label from training data, set aside for softmax n_batches = batch[0].shape[0] batch = [ids.view(ids.shape[0] * 4, -1) for ids in batch] batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model( input_ids, segment_ids, input_mask ) #, label_ids) label removed to skip softmax in model logits = logits.view(-1, 4) # reshape to (:, 4) # run 4-label softmax on each minibatch loss = criterion(logits, label_ids.view(n_batches, 4)[:, 0]) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if (epoch_num + 1 == int( args.num_train_epochs)) or not args.save_final_only: logger.info("***** Saving model to disk *****") torch.save( model.state_dict(), args.output_dir + "pytorch_model_epoch{}.bin".format(epoch_num)) logger.info("***** Finished saving model to disk *****") # Probably redundant, since BertAdam configurations remain unchanged throughout training, and params are # read directly from model # torch.save(optimizer.state_dict(), args.output_dir + "pytorch_optimizer_epoch{}.bin".format(_)) # logger.info("***** Finished saving optimizer to disk *****") if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): output_eval_result_file = open(os.path.join(args.output_dir, 'full-result.txt'), 'w+', encoding='utf8') eval_examples = processor.get_dev_examples(args.data_dir) if args.answer_only: for exset in eval_examples: for ex in exset: ex.text_a = '' eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([[ff.input_ids for ff in f] for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([[ff.input_mask for ff in f] for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([[ff.segment_ids for ff in f] for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([[ff.label_id for ff in f] for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) all_model_outputs = [] data_index = 0 model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.view(input_ids.shape[0] * 4, -1).to(device) input_mask = input_mask.view(input_mask.shape[0] * 4, -1).to(device) segment_ids = segment_ids.view(segment_ids.shape[0] * 4, -1).to(device) label_ids = label_ids[:, 0].to(device) with torch.no_grad(): # output label dimension == num_labels == 1 # reshape into (bsz, 4) logits = model(input_ids, segment_ids, input_mask).view(-1, 4) tmp_eval_loss = criterion(logits, label_ids) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy_logging(logits, label_ids, output_eval_result_file) for i in range(logits.shape[0]): output_obj = dict() output_obj['logits'] = [float(x) for x in logits[i]] output_obj['true_label'] = int(label_ids[i]) output_obj['model_label'] = int(np.argmax(logits[i])) output_obj['swag_data'] = { 'context_sentence': eval_examples[data_index][0].text_a, 'answer_candidates': [ex.text_b for ex in eval_examples[data_index]] } all_model_outputs.append(output_obj) data_index += 1 eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += label_ids.shape[0] nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_result_file.close() output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) with open( os.path.join(args.output_dir, args.model_labels_save_filename), 'w') as f: json.dump(all_model_outputs, f)
import tqdm
import pickle
import re
import collections
import argparse
from sys import path
from data_utils.vocab import Vocabulary
from pytorch_pretrained_bert.tokenization import BertTokenizer
from data_utils.log_wrapper import create_logger
from data_utils.label_map import GLOBAL_MAP
from data_utils.glue_utils import *

DEBUG_MODE = False
MAX_SEQ_LEN = 512

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
logger = create_logger(__name__, to_disk=True, log_file='bert_data_proc_512.log')


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length.

    Copied from https://github.com/huggingface/pytorch-pretrained-BERT
    """
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
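# Example of how _truncate_seq_pair is typically used when packing a sentence pair for
# BERT: three special tokens ([CLS], [SEP], [SEP]) are reserved, so the pair is truncated
# to MAX_SEQ_LEN - 3 before ids are built. The sample sentences are illustrative.
premise = bert_tokenizer.tokenize("The cat sat on the mat because it was warm.")
hypothesis = bert_tokenizer.tokenize("The mat was warm.")
_truncate_seq_pair(premise, hypothesis, MAX_SEQ_LEN - 3)
tokens = ['[CLS]'] + premise + ['[SEP]'] + hypothesis + ['[SEP]']
input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0] * (len(premise) + 2) + [1] * (len(hypothesis) + 1)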
norm_gcn_vocab_adj_list = []
for i in range(len(gcn_vocab_adj_list)):
    adj = gcn_vocab_adj_list[i]  # .tocsr()  # (lr is 1/10 of the value used without normalization)
    print(' Zero ratio(?>66%%) for vocab adj %dth: %.8f'
          % (i, 100 * (1 - adj.count_nonzero() / (adj.shape[0] * adj.shape[1]))))
    adj = normalize_adj(adj)
    norm_gcn_vocab_adj_list.append(sparse_scipy2torch(adj.tocoo()).to(device))
gcn_adj_list = norm_gcn_vocab_adj_list

del gcn_vocab_adj_tf, gcn_vocab_adj, gcn_vocab_adj_list
gc.collect()

train_classes_num, train_classes_weight = get_class_count_and_weight(train_y, len(label2idx))
loss_weight = torch.tensor(train_classes_weight).to(device)

tokenizer = BertTokenizer.from_pretrained(bert_model_scale, do_lower_case=do_lower_case)

#%%

def get_pytorch_dataloader(examples, tokenizer, batch_size, shuffle_choice,
                           classes_weight=None, total_resample_size=-1):
    ds = CorpusDataset(examples, tokenizer, gcn_vocab_map, MAX_SEQ_LENGTH, gcn_embedding_dim)
    if shuffle_choice == 0:  # shuffle == False
        return DataLoader(dataset=ds, batch_size=batch_size, shuffle=False,
                          num_workers=4, collate_fn=ds.pad)
    elif shuffle_choice == 1:  # shuffle == True
        return DataLoader(dataset=ds, batch_size=batch_size, shuffle=True,
                          num_workers=4, collate_fn=ds.pad)  # remaining kwargs assumed to mirror the shuffle==False branch
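# Hedged sketch of the `normalize_adj` call above: it is assumed to apply the standard
# GCN symmetric normalization D^{-1/2} A D^{-1/2} to the sparse vocabulary graph.
# Illustrative implementation, not copied from the source.
import numpy as np
import scipy.sparse as sp

def normalize_adj_sketch(adj):
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1)).flatten()      # degree of each vocabulary node
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0       # isolated nodes get zero weight
    d_mat = sp.diags(d_inv_sqrt)
    return d_mat.dot(adj).dot(d_mat).tocoo()     # D^{-1/2} A D^{-1/2}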
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, 
static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = 
read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
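The SWAG fine-tuning loop above relies on two small helpers, accuracy and select_field, that are not shown in this excerpt. A minimal sketch of what they could look like, assuming each feature object keeps its per-choice dicts in a choices_features list (an assumption, not the script's verbatim helpers):

import numpy as np

def accuracy(out, labels):
    # Number of examples whose argmax over the per-choice logits matches the gold label.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def select_field(features, field):
    # Gather one field (e.g. 'input_ids') for every choice of every example.
    return [[choice[field] for choice in feature.choices_features]
            for feature in features]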
os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_path", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") args = parser.parse_args() tokenizer = BertTokenizer.from_pretrained(args.bert_model) model = BertModel.from_pretrained(args.bert_model) model.to(DEVICE) # read dataframe recipe_df = pd.read_csv(args.data_path) recipe_df['pred_steps'] = recipe_df['pred_steps'].agg(eval) bert_features_for_all_recipe = [] for index, recipe in tqdm(recipe_df.iterrows(), total=len(recipe_df)): if len(recipe['pred_steps']) > 15: steps = recipe['pred_steps'][:15] else: steps = recipe['pred_steps'] recipe_bert_like_input = bert_input_prepare(steps) recipe_bert_features = get_bert_hidden(recipe_bert_like_input, max_sent_length=256, tokenizer=tokenizer, model=model, device=DEVICE, max_recipe_steps=15) bert_features_for_all_recipe.append(recipe_bert_features) recipe_df['bert_features_steps'] = bert_features_for_all_recipe recipe_df[['i', 'bert_features_steps']].to_msgpack( os.path.join(args.output_dir, 'absolute_order_model_baseline_e9_pred.msgpack'))
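bert_input_prepare and get_bert_hidden are defined elsewhere in that repository; the excerpt above only calls them. A minimal sketch of a per-step feature extractor in the same spirit, assuming the pytorch_pretrained_bert BertModel interface (the function name and the choice of the [CLS] vector are illustrative assumptions):

import torch

def extract_step_features(steps, tokenizer, model, device, max_sent_length=256):
    # Tokenize each recipe step, run BertModel, and keep the [CLS] hidden state per step.
    model.eval()
    feats = []
    with torch.no_grad():
        for step in steps:
            tokens = ["[CLS]"] + tokenizer.tokenize(step)[:max_sent_length - 2] + ["[SEP]"]
            ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
            encoded_layers, _ = model(ids)          # list of per-layer outputs plus pooled output
            feats.append(encoded_layers[-1][0, 0])  # [CLS] vector from the last layer
    return torch.stack(feats)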
def main(args): # hyper param do_lower_case = args.do_lower_case root = args.root_dir assert os.path.exists(root) is_uncased = False if 'uncased' in args.model: is_uncased = True mt_dnn_suffix = 'bert' encoder_model = EncoderModelType.BERT if 'xlnet' in args.model: encoder_model = EncoderModelType.XLNET mt_dnn_suffix = 'xlnet' if 'roberta' in args.model: encoder_model = EncoderModelType.ROBERTA mt_dnn_suffix = 'roberta' if encoder_model == EncoderModelType.ROBERTA: if args.roberta_path is None or (not os.path.exists( args.roberta_path)): print('Please specify roberta model path') encoder = get_encoder('{}/encoder.json'.format(args.roberta_path), '{}/vocab.bpe'.format(args.roberta_path)) vocab = load_dict('{}/ict.txt'.format(args.roberta_path)) tokenizer = RoBERTaTokenizer(vocab, encoder) elif encoder_model == EncoderModelType.XLNET: tokenizer = spm.SentencePieceProcessor() if 'large' in args.model: tokenizer.load('mt_dnn_models/xlnet_large_cased_spiece.model') else: tokenizer.load('mt_dnn_models/xlnet_base_cased_spiece.model') else: tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=do_lower_case) if is_uncased: mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix) else: mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix) if do_lower_case: mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix) mt_dnn_root = os.path.join(root, mt_dnn_suffix) if not os.path.isdir(mt_dnn_root): os.mkdir(mt_dnn_root) task_defs = TaskDefs(args.task_def) task_def_dic = yaml.safe_load(open(args.task_def)) for task, task_def in task_def_dic.items(): logger.info("Task %s" % task) data_format = DataFormat[task_def["data_format"]] task_type = TaskType[task_def["task_type"]] label_mapper = task_defs.global_map.get(task, None) split_names = task_def.get("split_names", ["train", "dev", "test"]) for split_name in split_names: rows = load_data( os.path.join(root, "%s_%s.tsv" % (task, split_name)), data_format, task_type, label_mapper) dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name)) logger.info(dump_path) build_data(rows, dump_path, tokenizer, data_format, encoderModelType=encoder_model, task_type=task_type, lab_dict=label_mapper)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on file.__next__ # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
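The fp16 branches above rescale the learning rate by hand because FusedAdam/FP16_Optimizer do not apply BERT's warmup schedule on their own. A sketch of the linear-warmup multiplier this depends on, mirroring the helper shipped with older pytorch_pretrained_bert releases (later releases decay slightly differently, so treat this as an approximation):

def warmup_linear(x, warmup=0.002):
    # x is training progress in [0, 1]; the returned multiplier ramps the learning
    # rate up linearly during the warmup fraction and decays it linearly afterwards.
    if x < warmup:
        return x / warmup
    return 1.0 - x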
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train: assert (args.train_file is not None) and (args.dev_file is not None) if args.eval_test: assert args.test_file is not None else: assert args.dev_file is not None if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) # read query templates query_templates = read_query_templates(normal_file=args.normal_file, des_file=args.des_file) if args.do_train or (not args.eval_test): eval_examples = read_ace_examples(input_file=args.dev_file, is_training=False) gold_examples = read_ace_examples(input_file=args.gold_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=False) logger.info("***** Dev *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if args.do_train: train_examples = read_ace_examples(input_file=args.train_file, is_training=True) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=True) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( 
[f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_start_positions, all_end_positions) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Train *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) eval_step = max(1, len(train_batches) // args.eval_per_epoch) best_result = None lrs = [args.learning_rate] if args.learning_rate else \ [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5] for lr in lrs: if not args.add_if_trigger_embedding: model = BertForQuestionAnswering.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) else: model = BertForQuestionAnswering_withIfTriggerEmbedding.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) param_optimizer = [ n for n in param_optimizer if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 global_step = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) for step, batch in enumerate(train_batches): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, if_trigger_ids, start_positions, end_positions = batch if not args.add_if_trigger_embedding: loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) else: loss = model(input_ids, segment_ids, if_trigger_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: save_model = False if args.do_eval: # result, _, _ = evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) result, preds = evaluate(args, model, device, eval_dataloader, eval_examples, gold_examples, eval_features) # import ipdb; ipdb.set_trace() model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' 
.format(epoch, step + 1, len(train_batches), time.time() - start_time, tr_loss / nb_tr_steps)) # logger.info("!!! Best dev %s (lr=%s, epoch=%d): p_c: %.2f, r_c: %.2f, f1_c: %.2f" % # (args.eval_metric, str(lr), epoch, result["prec_c"], result["recall_c"], result["f1_c"])) logger.info( "!!! Best dev %s (lr=%s, epoch=%d): p_c: %.2f, r_c: %.2f, f1_c: %.2f, p_i: %.2f, r_i: %.2f, f1_i: %.2f" % (args.eval_metric, str(lr), epoch, result["prec_c"], result["recall_c"], result["f1_c"], result["prec_i"], result["recall_i"], result["f1_i"])) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: with open( os.path.join(args.output_dir, "eval_results.txt"), "w") as writer: for key in sorted(best_result.keys()): writer.write( "%s = %s\n" % (key, str(best_result[key]))) if args.do_eval: if args.eval_test: eval_examples = read_ace_examples(input_file=args.test_file, is_training=False) gold_examples = read_ace_examples(input_file=args.gold_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=False) logger.info("***** Test *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if not args.add_if_trigger_embedding: model = BertForQuestionAnswering.from_pretrained(args.output_dir) else: model = BertForQuestionAnswering_withIfTriggerEmbedding.from_pretrained( args.output_dir) if args.fp16: model.half() model.to(device) result, preds = evaluate(args, model, device, eval_dataloader, eval_examples, gold_examples, eval_features) with open(os.path.join(args.output_dir, "test_results.txt"), "w") as writer: for key in result: writer.write("%s = %s\n" % (key, str(result[key]))) with open(os.path.join(args.output_dir, "arg_predictions.json"), "w") as writer: for key in preds: writer.write(json.dumps(preds[key], default=int) + "\n")
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--train_file", default="data/conceptual_caption/training", type=str, # required=True, help="The input train corpus.", ) parser.add_argument( "--validation_file", default="data/conceptual_caption/validation", type=str, # required=True, help="The input train corpus.", ) parser.add_argument( "--from_pretrained", default="", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, # required=True, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, # required=True, help="The config file which specified the model details.", ) ## Other parameters parser.add_argument( "--max_seq_length", default=36, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument("--predict_feature", action="store_true", help="visual target.") parser.add_argument( "--train_batch_size", default=512, type=int, help="Total batch size for training.", ) parser.add_argument( "--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--start_epoch", default=0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--img_weight", default=1, type=float, help="weight for image loss") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action="store_true", help="Whether to load train samples into memory or use disk", ) parser.add_argument( "--do_lower_case", type=bool, default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=3, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument("--baseline", action="store_true", help="Wheter to use the baseline model (single bert).") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.") parser.add_argument("--use_chuncks", default=0, type=float, help="whether use chunck for parallel training.") parser.add_argument("--distributed", action="store_true", help="whether use chunck for parallel training.") parser.add_argument("--without_coattention", action="store_true", help="whether pair loss.") args = parser.parse_args() if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig from vilbert.basebert import BertForMultiModalPreTraining else: from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig print(args) if args.save_name is not '': timeStamp = args.save_name else: timeStamp = strftime("%d-%b-%y-%X-%a", gmtime()) timeStamp += "_{:0>6d}".format(random.randint(0, 10e6)) savePath = os.path.join(args.output_dir, timeStamp) if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if args.freeze > config.t_biattention_id[0]: config.fixed_t_layer = config.t_biattention_id[0] if args.without_coattention: config.with_coattention = False # save all the hidden parameters. with open(os.path.join(savePath, 'command.txt'), 'w') as f: print(args, file=f) # Python 3.x print('\n', file=f) print(config, file=f) bert_weight_name = json.load( open("config/" + args.from_pretrained + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) num_train_optimization_steps = None viz = TBlogger("logs", timeStamp) train_dataset = ConceptCapLoaderTrain( args.train_file, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=args.num_workers, distributed=args.distributed, ) validation_dataset = ConceptCapLoaderVal( args.validation_file, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=2, distributed=args.distributed, ) num_train_optimization_steps = ( int(train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps) * (args.num_train_epochs - args.start_epoch)) # if args.local_rank != -1: # num_train_optimization_steps = ( # num_train_optimization_steps // torch.distributed.get_world_size() # ) default_gpu = False if dist.is_available() and args.distributed: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True # pdb.set_trace() if args.predict_feature: config.v_target_size = 2048 config.predict_feature = True else: config.v_target_size = 1601 config.predict_feature = False if args.from_pretrained: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = BertForMultiModalPreTraining(config) model.cuda() if args.fp16: model.half() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if 'embeddings' in name: bert_weight_name_filtered.append(name) elif 'encoder' in name: layer_num = name.split('.')[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) if not args.from_pretrained: param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] else: optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if key[12:] in bert_weight_name: lr = args.learning_rate * 0.1 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) # set different parameters for vision branch and lanugage branch. if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0, ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: if args.from_pretrained: optimizer = BertAdam( optimizer_grouped_parameters, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) else: optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataset.num_dataset) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) startIterID = 0 global_step = 0 masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 start_t = timer() # t1 = timer() for epochId in range(int(args.start_epoch), int(args.num_train_epochs)): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # iter_dataloader = iter(train_dataloader) for step, batch in enumerate(train_dataset): iterId = startIterID + step + (epochId * len(train_dataset)) # batch = iter_dataloader.next() batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) if args.without_coattention: next_sentence_loss = next_sentence_loss * 0 masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if math.isnan(loss.item()): pdb.set_trace() tr_loss += loss.item() rank = 0 if dist.is_available() and args.distributed: rank = dist.get_rank() else: rank = 0 viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train") viz.linePlot(iterId, masked_loss_t.item(), "masked_loss_t_" + str(rank), "train") viz.linePlot(iterId, masked_loss_v.item(), "masked_loss_v_" + str(rank), "train") viz.linePlot(iterId, next_sentence_loss.item(), "next_sentence_loss_" + str(rank), "train") # viz.linePlot(iterId, optimizer.get_lr()[0], 'learning_rate', 'train') loss_tmp += loss.item() masked_loss_v_tmp += masked_loss_v.item() masked_loss_t_tmp += masked_loss_t.item() next_sentence_loss_tmp += next_sentence_loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if step % 20 == 0 and step != 0: masked_loss_t_tmp = masked_loss_t_tmp / 20.0 masked_loss_v_tmp = masked_loss_v_tmp / 20.0 next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0 loss_tmp = loss_tmp / 20.0 end_t = timer() timeStamp = strftime("%a %d %b %y %X", gmtime()) Ep = epochId + nb_tr_steps / float(len(train_dataset)) printFormat = "[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]" printInfo = [ timeStamp, Ep, nb_tr_steps, end_t - start_t, loss_tmp, masked_loss_v_tmp, masked_loss_t_tmp, next_sentence_loss_tmp, optimizer.get_lr()[0], ] start_t = end_t print(printFormat % tuple(printInfo)) masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 # Do the evaluation torch.set_grad_enabled(False) start_t = timer() numBatches = len(validation_dataset) eval_masked_loss_t = 0 eval_masked_loss_v = 0 eval_next_sentence_loss = 0 eval_total_loss = 0 model.eval() for step, batch in enumerate(validation_dataset): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() eval_masked_loss_t += masked_loss_t.item() eval_masked_loss_v += masked_loss_v.item() eval_next_sentence_loss += next_sentence_loss.item() eval_total_loss += loss.item() end_t = timer() delta_t = " Time: %5.2fs" % (end_t - start_t) start_t = end_t progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t sys.stdout.write(progressString % ('val', step + 1, numBatches)) sys.stdout.flush() eval_masked_loss_t = eval_masked_loss_t / float(numBatches) eval_masked_loss_v = eval_masked_loss_v / float(numBatches) eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches) eval_total_loss = eval_total_loss / float(numBatches) printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]" printInfo = [ eval_total_loss, eval_masked_loss_v, eval_masked_loss_t, eval_next_sentence_loss ] print(printFormat % tuple(printInfo)) torch.set_grad_enabled(True) viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank), "val") viz.linePlot(epochId, eval_next_sentence_loss, "next_sentence_loss_" + str(rank), "val") if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin") torch.save(model_to_save.state_dict(), output_model_file)
input_ids = torch.cat(input_ids, dim=0) attention_masks = torch.cat(attention_masks, dim=0) labels = torch.tensor(labels.to_numpy()) dataset = TensorDataset(input_ids, attention_masks, labels) with open( f'{get_current_project_directory()}/../poem_clf_dataset/dataset.pkl', 'wb') as f: pickle.dump(dataset, f) return dataset if __name__ == "__main__": print('Loading BERT tokenizer...') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) dataAll = load_data() dataAll.Poem = preprocess_poem(dataAll.Poem) filtered_category = preprocess_tag(dataAll) # Get top tags top_category = list(filtered_category.keys()) top_category = [ 'Activities', 'Arts & Sciences', 'Living', 'Love', 'Nature', 'Relationships', 'Religion', 'Social Commentaries' ] print(top_category) dataAll.drop(columns=["Unnamed: 0", "Title", 'Poet'], inplace=True) dataAll.dropna(inplace=True)
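The encoding step that fills input_ids and attention_masks lies before the start of this fragment. A minimal sketch of how it is commonly done, assuming the transformers-style encode_plus API and a maximum length of 256 (both assumptions; the fragment does not show the actual values):

import torch

def encode_poems(poems, tokenizer, max_length=256):
    # Encode each poem to fixed-length ids plus an attention mask, as 1xL tensors
    # ready for the torch.cat calls in the fragment above.
    input_ids, attention_masks = [], []
    for poem in poems:
        enc = tokenizer.encode_plus(poem,
                                    add_special_tokens=True,
                                    max_length=max_length,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_tensors='pt')
        input_ids.append(enc['input_ids'])
        attention_masks.append(enc['attention_mask'])
    return input_ids, attention_masks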
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--config_dir', type=str, default='', help="Config file for Bert") add_quantize_arguments(parser) args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels, config_dir=args.config_dir) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: 
preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
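# A minimal, self-contained sketch of the gradient-accumulation pattern used in
# the training loop above: the loss is divided by gradient_accumulation_steps
# and the optimizer only steps every N micro-batches. The tiny linear model and
# random batches are placeholders, not the real task.
import torch
from torch import nn

accumulation_steps = 4  # plays the role of args.gradient_accumulation_steps
model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fct = nn.CrossEntropyLoss()

for step in range(16):
    inputs = torch.randn(8, 10)
    labels = torch.randint(0, 2, (8,))
    loss = loss_fct(model(inputs), labels)
    loss = loss / accumulation_steps  # so the accumulated grads match one big batch
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()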
def main(): parser = argparse.ArgumentParser() parser.add_argument("--paragraph", default=None, type=str) parser.add_argument("--question", default=None, type=str) parser.add_argument("--model", default=None, type=str) parser.add_argument("--max_seq_length", default=384, type=int) parser.add_argument("--doc_stride", default=128, type=int) parser.add_argument("--max_query_length", default=64, type=int) parser.add_argument("--config_file", default=None, type=str) parser.add_argument("--max_answer_length", default=30, type=int) args = parser.parse_args() para_file = args.paragraph question_file = args.question model_path = args.model device = torch.device("cuda") ### Raeding paragraph # f = open(para_file, 'r') # para = f.read() # f.close() ## Reading question # f = open(ques_file, 'r') # ques = f.read() # f.close() # para_list = para.split('\n\n') f = open(para_file, "rb") para = f.read() para = para.decode('windows-1252') para = para.strip("\n").replace("\r", " ").replace("\n", "") #print(para) # print(para) f.close() f_ = open(question_file, "r") question = f_.read() question = question.split("\n") while "" in question: question.remove("") for q in question: q = q.strip("\n") f_.close() input_data = [] pfinder = ParaFinder(para) i = 0 for q in question: closest_para = pfinder.closestParagraph(q) paragraphs = {} paragraphs["id"] = i paragraphs["text"] = closest_para paragraphs["ques"] = [q] i += 1 input_data.append(paragraphs) # print(input_data) ## input_data is a list of dictionary which has a paragraph and questions examples = read_squad_examples(input_data) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) eval_features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) ### Loading Pretrained model for QnA config = BertConfig(args.config_file) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(model_path, map_location=device)) model.to(device) pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data pred_sampler = SequentialSampler(pred_data) pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9) predictions = [] for input_ids, input_mask, segment_ids, example_indices in pred_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) features = [] example = [] all_results = [] for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() feature = eval_features[example_index.item()] unique_id = int(feature.unique_id) features.append(feature) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output = predict(examples, features, all_results, args.max_answer_length) predictions.append(output) ### For printing the results #### index = None for example in examples: if index != 
example.example_id: # print(example.para_text) index = example.example_id # print('\n') # print(colored('***********Question and Answers *************', 'red')) ques_text = example.question_text print(ques_text) prediction, prob = predictions[math.floor(example.unique_id / 12)][example] if prob > 0.35: print(prediction) #print(type(prediction)) else: print("No result found")
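# A minimal sketch of how a best answer span can be chosen from start/end
# logits, in the spirit of the predict() call above. The scoring rule here
# (sum of logits, start <= end, length-capped) is a common convention and an
# assumption; the actual helper in the script may differ.
import numpy as np

def best_span(start_logits, end_logits, max_answer_length=30):
    best, best_score = (0, 0), float("-inf")
    for s, s_logit in enumerate(start_logits):
        for e in range(s, min(s + max_answer_length, len(end_logits))):
            score = s_logit + end_logits[e]
            if score > best_score:
                best_score, best = score, (s, e)
    return best, best_score

start = np.array([0.1, 2.5, 0.3, 0.0])
end = np.array([0.0, 0.2, 3.1, 0.4])
print(best_span(start, end))  # picks tokens 1..2 with score 5.6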
def main(): parser = argparse.ArgumentParser() # General parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--bert_model", default="bert-base-cased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased" ) parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument('--max_position_embeddings', type=int, default=512, help="max position embeddings") # For decoding parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Forbid the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=None, type=int) parser.add_argument('--ngram_size', type=int, default=3) parser.add_argument('--max_tgt_length', type=int, default=20, help="maximum length of target sequence") # Others for VLP parser.add_argument("--src_file", default='/mnt/dat/COCO/annotations/dataset_coco.json', type=str, help="The input data file name.") parser.add_argument('--dataset', default='coco', type=str, help='coco | flickr30k | cc') parser.add_argument('--len_vis_input', type=int, default=100) # parser.add_argument('--resnet_model', type=str, default='imagenet_weights/resnet101.pth') parser.add_argument('--image_root', type=str, default='/mnt/dat/COCO/images') parser.add_argument('--split', type=str, default='val') parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument( '--region_bbox_file', default= 'coco_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str) parser.add_argument( '--region_det_file_prefix', default= 'feat_cls_1000/coco_detection_vg_100dets_gvd_checkpoint_trainval', type=str) parser.add_argument('--file_valid_jpgs', default='', type=str) args = parser.parse_args() if args.enable_butd: assert (args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join( args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) args.max_seq_length = args.max_tgt_length + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] 
tokenizer.max_len = args.max_seq_length bi_uni_pipeline = [] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seqDecoder( list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, max_tgt_length=args.max_tgt_length, new_segment_ids=args.new_segment_ids, mode='s2s', len_vis_input=args.len_vis_input, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix)) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 mask_word_id, eos_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]"]) forbid_ignore_set = None if args.forbid_ignore_word: w_list = [] for w in args.forbid_ignore_word.split('|'): if w.startswith('[') and w.endswith(']'): w_list.append(w.upper()) else: w_list.append(w) forbid_ignore_set = set(tokenizer.convert_tokens_to_ids(w_list)) print(args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, max_position_embeddings=args.max_position_embeddings, config_path=args.config_path, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, forbid_duplicate_ngrams=args.forbid_duplicate_ngrams, forbid_ignore_set=forbid_ignore_set, ngram_size=args.ngram_size, min_len=args.min_len, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input) del model_recover # from vlp.resnet import resnet # cnn = resnet(args.resnet_model, _num_layers=101, _fixed_block=4, pretrained=True) # no finetuning if args.fp16: model.half() # cnn.half() model.to(device) # cnn.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # cnn = torch.nn.DataParallel(cnn) torch.cuda.empty_cache() model.eval() # cnn.eval() eval_lst = [] #pdb.set_trace() gpv_image_ids = json.load( open( '/home/amitak/data/learning_phase_data/coco_captions/gpv_split/image_ids_val.json', 'r'))[:100] gpv_image_ids = {imid: 1 for imid in gpv_image_ids} img_to_cap = {} with open(args.src_file, "r", encoding='utf-8') as f_src: img_dat = json.load(f_src)['images'] img_idx = 0 valid_jpgs = None #if (args.file_valid_jpgs == '' or args.dataset in \ # ('coco', 'flickr30k')) else json.load(open(args.file_valid_jpgs)) # You're just reading the image, not each caption for src in img_dat: #if src['split'] == args.split and (valid_jpgs is None or src['filename'] in valid_jpgs): image_id = int(src['filename'].split('_')[2][:-4]) if image_id in gpv_image_ids: img_to_cap[image_id] = src['sentids'] if args.enable_butd: src_tk = os.path.join(args.image_root, src.get('filepath', 'trainval'), src['filename'][:-4] + '.npy') else: src_tk = os.path.join(args.image_root, src.get('filepath', 'trainval'), src['filename']) if args.dataset == 'coco': imgid = int(src['filename'].split('_')[2][:-4]) elif args.dataset == 'cc': imgid = int(src['imgid']) elif args.dataset == 'flickr30k': imgid = int(src['filename'].split('.')[0]) eval_lst.append( (img_idx, imgid, src_tk)) # id and path for COCO img_idx += 1 input_lines = eval_lst next_i = 0 output_lines = [""] * len(input_lines) total_batch = math.ceil(len(input_lines) / 
args.batch_size) print('start the caption evaluation...') with tqdm(total=total_batch) as pbar: while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf_id = [x[0] for x in _chunk] buf = [x[2] for x in _chunk] next_i += args.batch_size instances = [] for instance in [(x, args.len_vis_input) for x in buf]: for proc in bi_uni_pipeline: instances.append(proc(instance)) with torch.no_grad(): batch = batch_list_to_batch_tensors(instances) batch = [t.to(device) for t in batch] input_ids, token_type_ids, position_ids, input_mask, task_idx, img, vis_pe = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data else: conv_feats, _ = cnn(img.data) # Bx2048x7x7 conv_feats = conv_feats.view(conv_feats.size(0), conv_feats.size(1), -1).permute( 0, 2, 1).contiguous() traces = model(conv_feats, vis_pe, input_ids, token_type_ids, position_ids, input_mask, task_idx=task_idx) if args.beam_size > 1: traces = {k: v.tolist() for k, v in traces.items()} output_ids = traces['pred_seq'] else: output_ids = traces[0].tolist() for i in range(len(buf)): w_ids = output_ids[i] output_buf = tokenizer.convert_ids_to_tokens(w_ids) output_tokens = [] for t in output_buf: if t in ("[SEP]", "[PAD]"): break output_tokens.append(t) output_sequence = ' '.join(detokenize(output_tokens)) output_lines[buf_id[i]] = output_sequence pbar.update(1) # This is all I need: pdb.set_trace() predictions = [{ 'image_id': tup[1], 'caption': output_lines[img_idx] } for img_idx, tup in enumerate(input_lines)] gpv_caption_ids = json.load( open( '/home/amitak/data/learning_phase_data/coco_captions/gpv_split/cap_ids_val.json', 'r')) final_predictions = { c: { 'answer': p['caption'] } for p in predictions for c in img_to_cap[p['image_id']] if c in gpv_caption_ids } json.dump(final_predictions, open('vlp_preds.json', 'w')) """ new_predictions = [] for p in predictions: num_caps = len([c for c in img_to_cap[p['image_id']] if c in gpv_caption_ids]) new_predictions.extend([p]*num_caps) """ lang_stats = language_eval( args.dataset, predictions, args.model_recover_path.split('/')[-2] + '-' + args.split + '-' + args.model_recover_path.split('/')[-1].split('.')[-2], args.split)
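# A minimal sketch of the post-processing applied to the decoder output above:
# stop at [SEP]/[PAD] and merge WordPiece continuations back into words. The
# detokenize() helper used by the script may behave differently; this only
# illustrates the usual "strip ## prefixes" rule.
def simple_detokenize(tokens):
    words = []
    for t in tokens:
        if t in ("[SEP]", "[PAD]"):
            break
        if t.startswith("##") and words:
            words[-1] += t[2:]  # glue the continuation piece onto the previous word
        else:
            words.append(t)
    return " ".join(words)

print(simple_detokenize(["a", "man", "rid", "##ing", "a", "horse", "[SEP]", "[PAD]"]))
# -> "a man riding a horse"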
device = torch.device('cuda:0') sep = ' [SEP] ' samples = [] with open('../samples/QLP/SampleData.tsv', 'r', encoding='utf8') as f: for line in f: line = line.strip() index, title1, title2, description, _, _, url, doc = line.split( '\t') samples.append( [index, title1 + sep + title2 + sep + description, url, doc]) # json_dump(samples, '../samples/QLP/ValidationData_after.json') # samples = json_load('../samples/QLP/ValidationData_after.json') epoch_output_dir = '/home/work/waka/projects/pytorch-pretrained-BERT/samples/QLP/result/base/fullAd/Random/2parts/epoch2' model = BertForSequenceClassification.from_pretrained( epoch_output_dir, num_labels=2).to(device) tokenizer = BertTokenizer.from_pretrained(epoch_output_dir, do_lower_case=True) def pre_in_batch_eval(raw): index = [] title = [] url = [] doc = [] input_ids = [] input_mask = [] segment_ids = [] for sample in raw: index.append(sample[0]) title.append(sample[1]) url.append(sample[2]) doc.append(sample[3]) one_input_ids, one_input_mask, one_segment_ids = preprocess(
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "temporal": TemporalProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", 
len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 total_prints = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = 
label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy, prints = accuracy(logits, label_ids) total_prints += prints eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") output_print_file = os.path.join(args.output_dir, "eval_outputs.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) with open(output_print_file, "w") as writer: for l in total_prints: writer.write(l + "\n")
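# A minimal sketch of what an accuracy(logits, label_ids) helper like the one
# called above typically computes: argmax over classes, then count matches.
# The real helper also returns per-example strings for eval_outputs.txt; that
# part is omitted here.
import numpy as np

def simple_accuracy(logits, label_ids):
    preds = np.argmax(logits, axis=1)
    return int((preds == label_ids).sum())

logits = np.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]])
labels = np.array([1, 0, 0])
print(simple_accuracy(logits, labels))  # 2 of 3 predictions are correct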
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step try: optimizer.step() optimizer.zero_grad() global_step += 1 except: pass # Save a trained model logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = args.output_dir / "pytorch_model.bin" torch.save(model_to_save.state_dict(), str(output_model_file))
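# A hand-rolled sketch of the learning-rate schedule applied manually in the
# fp16 branch above: linear warmup to the peak LR, then linear decay to zero.
# The library's warmup_linear helper may differ in its exact decay shape; this
# only shows why param_group['lr'] is rewritten on every accumulated step when
# BertAdam is not handling the schedule internally.
def linear_warmup_then_decay(progress, warmup=0.1):
    # progress = global_step / num_train_optimization_steps, in [0, 1]
    if progress < warmup:
        return progress / warmup
    return max(0.0, (1.0 - progress) / (1.0 - warmup))

base_lr = 3e-5
for step in (10, 100, 900):
    lr = base_lr * linear_warmup_then_decay(step / 1000, warmup=0.1)
    print(step, lr)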
def main(args): # set up logging and device args.save_dir = utils.get_save_dir(args.save_dir, args.name, training=True) logger = utils.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Generating the dictionaries dep_dict, pos_dict, ent_dict, total_features = generate_dictionary( args.train_ling_features_file, args.eval_ling_features_file, args.test_ling_features_file) # from IPython import embed; embed() # Generating total_dictionary total_dict = convert_string_features_to_array(total_features, dep_dict, pos_dict, ent_dict) # from IPython import embed; embed() train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative, total_dictionary=total_dict) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForQuestionAnsweringLing.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizer import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 # load training features cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None print(cached_train_features_file) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) # load eval features eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative, total_dictionary=total_dict) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) test_examples = read_squad_examples( input_file=args.test_file, is_training=False, version_2_with_negative=args.version_2_with_negative, total_dictionary=total_dict) test_features = convert_examples_to_features( examples=test_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) if args.do_train: logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in 
train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) # from IPython import embed; embed() all_ling_features = torch.tensor( [f.ling_features for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ling_features, all_start_positions, all_end_positions) steps_till_eval = args.eval_steps if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() best_F1 = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, ling_features, start_positions, end_positions = batch # from IPython import embed; embed() loss = model(input_ids, segment_ids, input_mask, ling_features, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # add to tensorboard loss_val = loss.item() tbx.add_scalar('train/NLL', loss_val, global_step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], global_step) steps_till_eval -= args.train_batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint logger.info('Evaluating at step {}...'.format(step)) # ema.assign(model) results, _ = evaluate(model, eval_examples, eval_features, device, args, logger, args.version_2_with_negative, args.dev_eval_file) # saver.save(step, model, results[args.metric_name], device) # ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) logger.info('Dev {}'.format(results_str)) # Log to TensorBoard logger.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, global_step) """ util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) """ if results['F1'] > best_F1: best_F1 = results['F1'] model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model_best.bin") torch.save(model_to_save.state_dict(), output_model_file) #model.to(device) # Save a trained model """ model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForQuestionAnsweringLing.from_pretrained(args.bert_model, 
state_dict=model_state_dict) else: model = BertForQuestionAnsweringLing.from_pretrained(args.bert_model) model.to(device) """ # load the best trained model and eval on the eval set and test set best_model_file = os.path.join(args.output_dir, "pytorch_model_best.bin") model_state_dict = torch.load(best_model_file) model = BertForQuestionAnsweringLing.from_pretrained( args.bert_model, state_dict=model_state_dict) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info('Evaluating at the best model') results, all_results = evaluate(model, eval_examples, eval_features, device, args, logger, args.version_2_with_negative, args.dev_eval_file) logger.info('Write the best eval results') output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, 'dev') logger.info('Test set at the best model') results, all_results = evaluate(model, test_examples, test_features, device, args, logger, args.version_2_with_negative, args.test_eval_file) logger.info('Write the best test set results') output_prediction_file = os.path.join(args.output_dir, "predictions_test.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_test.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_test.json") write_predictions(test_examples, test_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold, 'test') """
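# A minimal sketch of the feature-caching pattern used above: try to load a
# pickled cache whose name encodes the preprocessing parameters, and recompute
# plus save on a miss. The cache path and the build function are placeholders.
import os
import pickle

def load_or_build_features(cache_path, build_fn):
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as reader:
            return pickle.load(reader)
    features = build_fn()
    with open(cache_path, "wb") as writer:
        pickle.dump(features, writer)
    return features

features = load_or_build_features(
    "/tmp/example_train_features_128_64.pkl",  # placeholder cache file
    lambda: [{"input_ids": [101, 102]}])       # placeholder for convert_examples_to_features
print(len(features))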
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--eval_file", default=None, type=str, required=True, help="The input eval corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_file", default=None, type=str, required=True, help="The output file where the scores will be written.") ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', default=True, help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--test_run", action='store_true', default=False, help="If true, shortcut the input data.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) # Load eval_data eval_dataset_answerable = BERTDataset(args.eval_file, tokenizer, seq_len=args.max_seq_length, on_memory=args.on_memory, answerable=True) eval_dataset_unanswerable = BERTDataset(args.eval_file, tokenizer, seq_len=args.max_seq_length, on_memory=args.on_memory, answerable=False) # Prepare model if n_gpu > 0: model_state_dict = torch.load(args.bert_model) else: model_state_dict = torch.load(args.bert_model, map_location='cpu') context_model = BertModel.from_pretrained( "bert-base-uncased") #, state_dict=model_state_dict) question_model = BertModel.from_pretrained( "bert-base-uncased") #, state_dict=model_state_dict) context_model.to(device) question_model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) context_model = DDP(context_model) question_model = DDP(question_model) elif n_gpu > 1: context_model = torch.nn.DataParallel(context_model) question_model = torch.nn.DataParallel(question_model) # Prepare optimizer print("Checking the vocab size:", len(tokenizer.vocab)) # 768 is bert hidden size, 256 is GRU hidden size, 1 is the layers in the GRU model = RNNModel("GRU", len(tokenizer.vocab), 768, 768, 1, context_model, question_model, ngpu=n_gpu) model.load_state_dict(model_state_dict) model.to(device) # eval loader eval_sampler_ans = SequentialSampler(eval_dataset_answerable) eval_dataloader_ans = DataLoader(eval_dataset_answerable, sampler=eval_sampler_ans, batch_size=args.train_batch_size) eval_sampler_unans = SequentialSampler(eval_dataset_unanswerable) eval_dataloader_unans = DataLoader(eval_dataset_unanswerable, sampler=eval_sampler_unans, batch_size=args.train_batch_size) criterion = nn.CrossEntropyLoss() model.init_hidden(args.train_batch_size) with torch.no_grad(): model.eval() with open(args.output_file, "w") as handle: loss_writer = csv.writer(handle, delimiter=',') eval_loss_ans = 0 for batch_i, eval_batch in enumerate(eval_dataloader_ans): if batch_i % 1000 == 0: print("#### DANITER completed answerable", batch_i) eids = eval_batch[-1] eval_batch = tuple(t.to(device) for t in eval_batch[:-1]) question_ids, question_mask, context_ids, context_mask, targets = eval_batch output, _ = model(context_ids, context_mask, question_ids, question_mask) loss = criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1)) loss_writer.writerow([eids[0], loss.item(), "ANS"]) eval_loss_ans += loss.item() print("##### DANITER EVAL LOSS IS (ANSWERABLE) : ", eval_loss_ans) eval_loss_unans = 0 for batch_i, eval_batch in enumerate(eval_dataloader_unans): if batch_i % 1000 == 0: print("#### DANITER completed unanswerable", batch_i) eids = eval_batch[-1] eval_batch = tuple(t.to(device) for t in eval_batch[:-1]) question_ids, question_mask, context_ids, context_mask, targets = eval_batch output, _ = model(context_ids, context_mask, question_ids, question_mask) loss = 
criterion(output.view(-1, len(tokenizer.vocab)), question_ids.view(-1)) loss_writer.writerow([eids[0], loss.item(), "UNANS"]) eval_loss_unans += loss.item() print("##### DANITER EVAL LOSS IS (UNANSWERABLE) : ", eval_loss_unans)
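# A minimal sketch of the per-example scoring performed above: cross-entropy of
# the model's vocabulary logits against the question token ids, written out as
# an answerability signal. The shapes and random logits below are placeholders
# for the GRU-over-BERT model's real output.
import torch
from torch import nn

vocab_size = 10
criterion = nn.CrossEntropyLoss()

logits = torch.randn(1, 6, vocab_size)               # (batch, seq_len, vocab)
question_ids = torch.randint(0, vocab_size, (1, 6))  # target question tokens

loss = criterion(logits.view(-1, vocab_size), question_ids.view(-1))
print(loss.item())  # lower loss suggests the question is more predictable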
if len(args.train_from) > 0:
    print("loading model from", args.train_from)
    saved_stuff = torch.load(args.train_from)
    saved_args = saved_stuff["opt"]
    model = TagWordModel(args)
    model.load_state_dict(saved_stuff["state_dict"])
else:
    model = TagWordModel(args)
model = model.to(device)

nosplits = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
if args.nosplit_parenth:
    nosplits = nosplits + ("-LPR-", "-RPR-")
tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=args.lower,
                                          never_split=nosplits,
                                          cache_dir=CACHEDIR)
print("make sure you agree with never_splits!")

if args.load_saved_db and args.db_fi is not None:
    print("loading db from", args.db_fi)
    sentdb = data.SentDB(None, None, None, path=args.db_fi,
                         align_strat=args.align_strat,
                         subsample_all=args.subsample_all)
else:
    sentdb = data.SentDB(args.sent_fi, args.tag_fi, tokenizer,
                         args.val_sent_fi, args.val_tag_fi, lower=args.lower,
                         align_strat=args.align_strat,
                         subsample_all=args.subsample_all)

nebert = model.bert
if args.zero_shot and "newne" not in args.just_eval:
    nebert = BertModel.from_pretrained(args.bert_model, cache_dir=CACHEDIR)
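# A minimal sketch of the checkpoint convention the block above assumes for
# --train_from: a dict holding both the saved options ("opt") and the model
# weights ("state_dict"). The tiny nn.Linear and the /tmp path stand in for
# TagWordModel and a real checkpoint file.
import torch
from torch import nn

model = nn.Linear(4, 2)
torch.save({"opt": {"lr": 1e-3, "bert_model": "bert-base-cased"},
            "state_dict": model.state_dict()},
           "/tmp/tagger_checkpoint.pt")

saved_stuff = torch.load("/tmp/tagger_checkpoint.pt")
saved_args = saved_stuff["opt"]
model.load_state_dict(saved_stuff["state_dict"])
print(saved_args)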
def main(): parser = get_argument_parser() deepspeed.init_distributed(dist_backend='nccl') # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForQuestionAnswering.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) # Support for word embedding padding checkpoints # Prepare model bert_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 1024, "num_hidden_layers": 24, "num_attention_heads": 16, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": args.dropout, "attention_probs_dropout_prob": args.dropout, "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.ckpt_type == "DS": if args.preln: bert_config = BertConfigPreLN(**bert_model_config) else: bert_config = BertConfig(**bert_model_config) else: # Models from Tensorflow and Huggingface are post-LN. if args.preln: raise ValueError("Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm.") # Use the original bert config if want to load from non-DeepSpeed checkpoint. 
if args.origin_bert_config_file is None: raise ValueError("--origin_bert_config_file is required for loading non-DeepSpeed checkpoint.") bert_config = BertConfig.from_json_file(args.origin_bert_config_file) if bert_config.vocab_size != len(tokenizer.vocab): raise ValueError("vocab size from original checkpoint mismatch.") bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 vocab_diff = 0 if bert_config.vocab_size % 8 != 0: vocab_diff = 8 - (bert_config.vocab_size % 8) bert_config.vocab_size += vocab_diff if args.preln: model = BertForQuestionAnsweringPreLN(bert_config, args) else: model = BertForQuestionAnswering(bert_config, args) print("VOCAB SIZE:", bert_config.vocab_size) if args.model_file != "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") if args.ckpt_type == "DS": checkpoint_state_dict = torch.load(args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") else: from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file, vocab_diff, args.deepspeed_transformer_kernel) logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # and thus produces a None grad that breaks apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 },{ 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=optimizer_grouped_parameters, dist_init_required=True) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs #torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: os.makedirs(args.output_dir, exist_ok=True) # Prepare Summary writer if torch.distributed.get_rank() == 0 and args.job_name is not None: args.summary_writer = get_summary_writer(name=args.job_name, base=args.output_dir) else: args.summary_writer = None logger.info("propagate deepspeed-config settings to client settings") args.train_batch_size = model.train_micro_batch_size_per_gpu() args.gradient_accumulation_steps = model.gradient_accumulation_steps() args.fp16 = model.fp16_enabled() args.print_steps = model.steps_per_print() args.learning_rate = model.get_lr()[0] args.wall_clock_breakdown = model.wall_clock_breakdown() t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() ema_loss = 0. sample_count = 0 num_epoch = 0 all_step_time = 0.0 ave_rounds = 20 for _ in trange(int(args.num_train_epochs), desc="Epoch"): num_epoch += 1 epoch_step = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", smoothing=0)): start_time = time.time() bs_size = batch[0].size()[0] if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps ema_loss = args.loss_plot_alpha * ema_loss + ( 1 - args.loss_plot_alpha) * loss.item() model.backward(loss) loss_item = loss.item() * args.gradient_accumulation_steps loss = None sample_count += (args.train_batch_size * torch.distributed.get_world_size()) if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step model.step() global_step += 1 epoch_step += 1 if torch.distributed.get_rank( ) == 0 and args.summary_writer: summary_events = [ (f'Train/Steps/lr', lr_this_step, global_step), (f'Train/Samples/train_loss', loss_item, sample_count), (f'Train/Samples/lr', lr_this_step, sample_count), (f'Train/Samples/train_ema_loss', ema_loss, sample_count) ] if args.fp16 and hasattr(optimizer, 'cur_scale'): summary_events.append( (f'Train/Samples/scale', optimizer.cur_scale, sample_count)) write_summary_events(args.summary_writer, summary_events) args.summary_writer.flush() if torch.distributed.get_rank() == 0 and ( step + 1) % args.print_steps == 0: logger.info( f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}" ) else: model.step() if is_time_to_exit(args=args, epoch_steps=epoch_step, global_steps=global_step): logger.info( f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}' ) break one_step_time = time.time() -start_time all_step_time += one_step_time if (step + 1)%(ave_rounds) == 0 and torch.distributed.get_rank() == 0: print('At Step {}, Averaged Throughput for {} rounds is: {} Samples/s'.format(step, ave_rounds, bs_size * ave_rounds * torch.distributed.get_world_size() / all_step_time ), flush=True ) all_step_time = 0.0 # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned #model_state_dict = torch.load(output_model_file) #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) # model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = 
SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
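# Sketch (assumed definitions, not imported from the script): the training loop above sets
# the learning rate by hand from warmup_linear before each optimizer step, and the config
# code pads the vocabulary so Tensor Core kernels see dimensions divisible by 8. The exact
# decay used by the library's warmup_linear may differ from this common form.
def warmup_linear(progress, warmup=0.002):
    """progress = global_step / t_total; linear ramp to 1, then linear decay to 0."""
    if progress < warmup:
        return progress / warmup
    return max((1.0 - progress) / (1.0 - warmup), 0.0)

def pad_vocab_to_multiple_of_8(vocab_size):
    return vocab_size + (-vocab_size) % 8  # e.g. 30522 -> 30528

# usage inside the step, mirroring the loop above:
# lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
# for param_group in optimizer.param_groups:
#     param_group['lr'] = lr_this_step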
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument("--saved_model_path", default=None, type=str, help="Saved model path") parser.add_argument("--model_architecture", default="basic", type=str, help="What model architecture to use on top of BERT") ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", default=False, action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--eval_every", default=1000, type=int, help="evaluate at every eval_every number of steps ") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--weight_loss', default=False, action='store_true', help="Whether to weight class losses") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) parser.add_argument("--metric", default=None, type=str, help="Which metric to use.") parser.add_argument( "--model_type", default='ch', type=str, help="en, ch or en-ch depending on which language(s) to be used") args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "wsdm": WSDMFakeNewsProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.do_train and os.path.exists(args.output_dir) and os.listdir( args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() bert_model_ch = 'bert-base-chinese' bert_model_en = 'bert-base-uncased' en_tokenizer = BertTokenizer.from_pretrained(bert_model_en) ch_tokenizer = BertTokenizer.from_pretrained(bert_model_ch) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.model_type == 'en': bert_model = bert_model_en elif args.model_type == 'ch': bert_model = bert_model_ch if args.model_type == 'ch-en': en_model = BertForSequenceClassification.from_pretrained( bert_model_en, len(label_list)) ch_model = BertForSequenceClassification.from_pretrained( bert_model_ch, len(label_list)) model = BertDualForWSDMFakeNews(ch_model, en_model, len(label_list)) else: if args.model_architecture == "basic": model = BertForSequenceClassification.from_pretrained( args.bert_model, len(label_list)) elif args.model_architecture == "wordpooling": model = BertForSequenceClassificationWithPooledWordEmbeddings.from_pretrained( args.bert_model, len(label_list)) else: raise NotImplementedError if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = 
list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) metadata = {} global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) train_data = create_dataset_from_examples(train_examples, label_list, en_tokenizer, ch_tokenizer, args) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) weight_loss = args.weight_loss model.train() global_nb_tr_steps = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) guids, \ en_input_ids, en_input_mask, en_segment_ids, en_label_ids, \ ch_input_ids, ch_input_mask, ch_segment_ids, ch_label_ids = batch if args.model_type == 'ch': loss, _ = model(ch_input_ids, ch_segment_ids, ch_input_mask, ch_label_ids, weight_loss) elif args.model_type == 'en': loss, _ = model(en_input_ids, en_segment_ids, en_input_mask, en_label_ids, weight_loss) if args.model_type == 'ch-en': loss, _ = model(ch_input_ids, ch_segment_ids, ch_input_mask, ch_label_ids, en_input_ids, en_segment_ids, en_input_mask, en_label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += ch_input_ids.size(0) nb_tr_steps += 1 global_nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: NaN in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 metadata['global_step'] = global_step metadata['tr_loss'] = tr_loss metadata['global_nb_tr_steps'] = global_nb_tr_steps metadata['nb_tr_steps'] = nb_tr_steps if nb_tr_steps % args.eval_every == 0: evaluate(args, model, processor, en_tokenizer, ch_tokenizer, metadata, device) evaluate(args, model, processor, en_tokenizer, ch_tokenizer, metadata, device, mode="test") if not args.do_train and args.do_eval: model = torch.load(args.saved_model_path) evaluate(args, model, processor, en_tokenizer, ch_tokenizer, metadata, device) if not args.do_train and args.do_test: model = torch.load(args.saved_model_path) evaluate(args, model, processor, en_tokenizer, ch_tokenizer, metadata, device, mode="test")
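# Sketch of the manual static loss-scaling pattern used in the fp16 branch above
# (plain PyTorch, no apex; toy model and data, requires a CUDA device): scale the loss
# before backward(), unscale into fp32 master weights, and halve the scale on overflow.
import torch
import torch.nn as nn

model = nn.Linear(16, 4).cuda().half()                     # fp16 model weights
master = [p.detach().clone().float().requires_grad_() for p in model.parameters()]
optimizer = torch.optim.SGD(master, lr=1e-3)               # optimizer holds fp32 copies
loss_scale = 128.0

x = torch.randn(8, 16).cuda().half()
y = torch.randint(0, 4, (8,)).cuda()
loss = nn.functional.cross_entropy(model(x).float(), y) * loss_scale
loss.backward()

overflow = False
for p, m in zip(model.parameters(), master):
    grad = p.grad.float() / loss_scale                     # undo the scaling
    if not torch.isfinite(grad).all():
        overflow = True
    m.grad = grad

if overflow:
    loss_scale /= 2                                        # NaN/inf: shrink scale, skip step
else:
    optimizer.step()
    with torch.no_grad():
        for p, m in zip(model.parameters(), master):
            p.copy_(m)                                     # write fp32 masters back to fp16
model.zero_grad()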
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "arg": ArgProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, "arg": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) model_state_dict = torch.load('./models/pytorch_model.bin') cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) #model.load_state_dict(torch.load('./models/pytorch_model1.bin')) # cache_dir=cache_dir, if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) pred, prob = [], [] gold = [] if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) for a, b in zip(logits, label_ids): pred.append(np.argmax(a)) gold.append(b) #prob.append(a) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss 
} print(classification_report(gold, pred)) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) with open('predictions.txt', 'w') as f: for line1 in pred: f.write(str(line1) + '\n')
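# Sketch (an assumption about the unshown helper): accuracy(logits, label_ids) above is
# used as a per-batch count of correct predictions that is later divided by
# nb_eval_examples, so a typical definition is:
import numpy as np

def accuracy(logits, labels):
    preds = np.argmax(logits, axis=1)      # predicted class per example
    return int((preds == labels).sum())    # number correct in this batch, not the rate

# accuracy(np.array([[0.1, 2.0], [1.5, 0.2]]), np.array([1, 0])) -> 2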
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "aus" : OOCLAUSProcessor, "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "aus": 33, "cola": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) model = BertForDocMultiClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = 
list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = np.array([f.input_ids for f in train_features]) all_input_mask = np.array([f.input_mask for f in train_features]) all_segment_ids = np.array([f.segment_ids for f in train_features]) all_label_ids = np.array([f.label_id for f in train_features]) initial_labeled_samples = 10000 max_queried = 10000 trainset_size = 10000 queried = initial_labeled_samples samplecount = [initial_labeled_samples] selection_function = RandomSelection() train_data, val_data = split(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, trainset_size) (input_ids, input_mask, segment_ids, label_ids) = train_data # initial process by applying base learner to labeled training data set to obtain Classifier permutation, input_ids_train, input_mask_train, segment_ids_train, label_ids_train = \ get_k_random_samples(input_ids, input_mask, segment_ids, label_ids, initial_labeled_samples, trainset_size) # assgin the val set the rest of the "unlabelled" traning data input_ids_val = np.copy(input_ids) input_mask_val = np.copy(input_mask) segment_ids_val = np.copy(segment_ids) label_ids_val = np.copy(label_ids) input_ids_val = np.delete(input_ids_val, permutation, axis=0) input_mask_val = np.delete(input_mask_val, permutation, axis=0) segment_ids_val = np.delete(segment_ids_val, permutation, axis=0) label_ids_val = np.delete(label_ids_val, permutation, axis=0) print('val set:',input_ids_val.shape, label_ids_val.shape, permutation.shape) train_dataloader = load_train_data(args, input_ids_train, input_mask_train, segment_ids_train, label_ids_train) model = train(model, args, n_gpu, optimizer, num_train_optimization_steps, num_labels, train_dataloader, device) model.to(device) active_iteration = 1 while queried < max_queried: print() print("active_iteration:", active_iteration) eval_dataloader = load_eval_data(args, input_ids_val, input_mask_val, segment_ids_val, label_ids_val) probas_val = predict(model, args, eval_dataloader, device) print("val predicted:", probas_val.shape) uncertain_samples = selection_function.select(probas_val, initial_labeled_samples) print("trainset before:", input_ids_train.shape) input_ids_train = np.concatenate((input_ids_train, input_ids_val[uncertain_samples])) 
input_mask_train = np.concatenate((input_mask_train, input_mask_val[uncertain_samples])) segment_ids_train = np.concatenate((segment_ids_train, segment_ids_val[uncertain_samples])) label_ids_train = np.concatenate((label_ids_train, label_ids_val[uncertain_samples])) samplecount.append(input_ids_train.shape[0]) print("trainset after:", input_ids_train.shape) input_ids_val = np.delete(input_ids_val, uncertain_samples, axis=0) input_mask_val = np.delete(input_mask_val, uncertain_samples, axis=0) segment_ids_val= np.delete(segment_ids_val, uncertain_samples, axis=0) label_ids_val = np.delete(label_ids_val, uncertain_samples, axis=0) print("val set:", input_ids_val.shape) queried += initial_labeled_samples train_dataloader = load_train_data(args, input_ids_train, input_mask_train, segment_ids_train, label_ids_train) model = train(model, args, n_gpu, optimizer, num_train_optimization_steps, num_labels, train_dataloader, device) model.to(device) # logger.info("***** Running evaluation *****") report_path = os.path.join(args.output_dir, "result" + str(queried) + ".csv") # eval(model, args, processor, device, global_step, task_name, label_list, tokenizer, report_path) logger.info("results in"+report_path) print("results in", report_path) active_iteration += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("***** Running evaluation *****") report_path = os.path.join(args.output_dir, "final_report.csv") eval(model, args, processor, device, global_step, task_name, label_list, tokenizer, report_path)
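# Sketch (illustrative, not from the script): the active-learning loop above delegates
# sample picking to a selection_function with a select(probas_val, k) method, using
# RandomSelection as the baseline. A least-confidence variant with the same interface:
import numpy as np

class LeastConfidenceSelection:
    def select(self, probas_val, k):
        top_prob = probas_val.max(axis=1)   # confidence of the argmax class per example
        return np.argsort(top_prob)[:k]     # indices of the k least confident examples

# uncertain_samples = LeastConfidenceSelection().select(probas_val, initial_labeled_samples)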
def __init__(self): self.bert_model = "bert-base-uncased" self.pretrained_model = "./pretrained" self.output_dir = "./tmp" self.max_seq_length = 384 self.doc_stride = 128 self.max_query_length = 64 self.do_predict = True self.predict_batch_size = 8 self.n_best_size = 20 self.max_query_length = 30 self.verbose_logging = False self.no_cuda = False self.seed = 42 self.fp16 = False self.local_rank = -1 self.max_answer_length = 30 self.do_lower_case = True if self.local_rank == -1 or self.no_cuda: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: self.device = torch.device("cuda", self.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if self.fp16: logger.info( "16-bits training currently not supported in distributed training" ) self.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(self.device, n_gpu, bool(self.local_rank != -1), self.fp16)) random.seed(self.seed) np.random.seed(self.seed) torch.manual_seed(self.seed) if n_gpu > 0: torch.cuda.manual_seed_all(self.seed) if os.path.exists(self.output_dir) and os.listdir(self.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(self.output_dir, exist_ok=True) self.tokenizer = BertTokenizer.from_pretrained(self.bert_model) # Prepare model self.model = BertForQuestionAnswering.from_pretrained( self.pretrained_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(self.local_rank)) if self.fp16: self.model.half() self.model.to(self.device) if self.local_rank != -1: self.model = torch.nn.parallel.DistributedDataParallel( self.model, device_ids=[self.local_rank], output_device=self.local_rank) elif n_gpu > 1: self.model = torch.nn.DataParallel(self.model) self.model.eval()
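# Sketch (not part of the class above): answering a question with this predictor still
# requires packing (question, context) into BERT inputs. Assumes a pytorch_pretrained_bert
# style tokenizer; doc_stride windowing for long contexts is omitted for brevity.
import torch

def build_qa_inputs(tokenizer, question, context, max_seq_length=384, device="cpu"):
    q_tokens = tokenizer.tokenize(question)
    c_tokens = tokenizer.tokenize(context)
    c_tokens = c_tokens[:max_seq_length - len(q_tokens) - 3]   # room for [CLS] and two [SEP]
    tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + c_tokens + ["[SEP]"]
    segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    pad = max_seq_length - len(input_ids)
    input_ids += [0] * pad
    input_mask += [0] * pad
    segment_ids += [0] * pad
    as_tensor = lambda xs: torch.tensor([xs], dtype=torch.long, device=device)
    return as_tensor(input_ids), as_tensor(segment_ids), as_tensor(input_mask)

# input_ids, segment_ids, input_mask = build_qa_inputs(self.tokenizer, q, passage, device=self.device)
# with torch.no_grad():
#     start_logits, end_logits = self.model(input_ids, segment_ids, input_mask)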
def main(yaml_file): with open(yaml_file) as f: config = yaml.load(f.read()) if not config['train']['do'] and not config['predict']['do']: raise ValueError("At least do training or do predicting in a run.") if config['use_cuda'] and torch.cuda.is_available(): device = torch.device("cuda", torch.cuda.current_device()) use_gpu = True else: device = torch.device("cpu") use_gpu = False logger.info("device: {}".format(device)) task_name = config['task']['task_name'].lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() label_list = processor.get_labels() os.makedirs(config['task']['output_dir'], exist_ok=True) ckpts = [ filename for filename in os.listdir(config['task']['output_dir']) if re.fullmatch('checkpoint-\d+', filename) ] if config['task']['checkpoint'] or ckpts: if config['task']['checkpoint']: model_file = config['task']['checkpoint'] else: model_file = os.path.join( config['task']['output_dir'], sorted(ckpts, key=lambda x: int(x[len('checkpoint-'):]))[-1]) logging.info('Load %s' % model_file) checkpoint = torch.load(model_file, map_location='cpu') start_epoch = checkpoint['epoch'] + 1 max_seq_length = checkpoint['max_seq_length'] lower_case = checkpoint['lower_case'] model = BertForNER.from_pretrained( config['task']['bert_model_dir'], state_dict=checkpoint['model_state'], num_labels=len(label_list)) else: start_epoch = 0 max_seq_length = config['task']['max_seq_length'] lower_case = config['task']['lower_case'] model = BertForNER.from_pretrained(config['task']['bert_model_dir'], num_labels=len(label_list)) tokenizer = BertTokenizer.from_pretrained(config['task']['bert_model_dir'], do_lower_case=lower_case) model.to(device) if config['train']['do']: if config['train']['gradient_accumulation_steps'] < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(config['train']['gradient_accumulation_steps'])) config['train']['batch_size'] = int( config['train']['batch_size'] / config['train']['gradient_accumulation_steps']) random.seed(config['train']['seed']) np.random.seed(config['train']['seed']) torch.manual_seed(config['train']['seed']) if use_gpu: torch.cuda.manual_seed_all(config['train']['seed']) train_examples = processor.get_train_examples( config['task']['data_dir']) num_train_steps = int( len(train_examples) / config['train']['batch_size'] / config['train']['gradient_accumulation_steps'] * config['train']['epochs']) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=config['train']['learning_rate'], warmup=config['train']['warmup_proportion'], t_total=num_train_steps) train_features = convert_examples_to_features( train_examples, max_seq_length, tokenizer, config['task']['standard'], label_list) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", config['train']['batch_size']) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_input_mask = 
torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_predict_mask = torch.ByteTensor( [f.predict_mask for f in train_features]) all_one_hot_labels = torch.tensor( [f.one_hot_labels for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_predict_mask, all_one_hot_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=config['train']['batch_size']) model.train() global_step = int( len(train_examples) / config['train']['batch_size'] / config['train']['gradient_accumulation_steps'] * start_epoch) for epoch in trange(start_epoch, config['train']['epochs'], desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, segment_ids, input_mask, predict_mask, one_hot_labels = batch loss = model(input_ids, segment_ids, input_mask, predict_mask, one_hot_labels) if config['train']['gradient_accumulation_steps'] > 1: loss = loss / config['train']['gradient_accumulation_steps'] loss.backward() if (step + 1 ) % config['train']['gradient_accumulation_steps'] == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a checkpoint model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save( { 'epoch': epoch, 'model_state': model_to_save.state_dict(), 'max_seq_length': max_seq_length, 'lower_case': lower_case, 'label_list': label_list }, os.path.join(config['task']['output_dir'], 'checkpoint-%d' % epoch)) if config['predict']['do']: if config['predict']['dataset'] == 'train': predict_examples = processor.get_train_examples( config['task']['data_dir']) elif config['predict']['dataset'] == 'dev': predict_examples = processor.get_dev_examples( config['task']['data_dir']) elif config['predict']['dataset'] == 'test': predict_examples = processor.get_test_examples( config['task']['data_dir']) else: raise ValueError("The dataset %s cannot be predicted." 
% config['predict']['dataset']) predict_features = convert_examples_to_features( predict_examples, max_seq_length, tokenizer, config['task']['standard'], label_list) logger.info("***** Running prediction *****") logger.info(" Num examples = %d", len(predict_examples)) logger.info(" Batch size = %d", config['predict']['batch_size']) all_input_ids = torch.tensor([f.input_ids for f in predict_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in predict_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in predict_features], dtype=torch.long) predict_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask) # Run prediction for full data predict_sampler = SequentialSampler(predict_data) predict_dataloader = DataLoader( predict_data, sampler=predict_sampler, batch_size=config['predict']['batch_size']) model.eval() predictions = [] for batch in predict_dataloader: batch = tuple(t.to(device) for t in batch) input_ids, segment_ids, input_mask = batch logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() predictions.extend(np.argmax(logits, -1).tolist()) writer = codecs.open(os.path.join( config['task']['output_dir'], "%s.predict" % config['predict']['dataset']), 'w', encoding='utf-8') for predict_line, feature in zip(predictions, predict_features): predict_labels = [] for index, label_id in enumerate( predict_line[:sum(feature.input_mask)]): if feature.predict_mask[index] == 1: predict_labels.append(label_list[label_id]) writer.write(' '.join(predict_labels) + '\n') writer.close()
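# Sketch (an assumption about the unshown feature builder): convert_examples_to_features
# above produces a predict_mask and one_hot_labels per wordpiece. One plausible
# construction for a single example, scoring only the first piece of each word;
# [CLS]/[SEP] handling is omitted here.
import numpy as np

def build_label_fields(word_tags, pieces_per_word, label_list, max_seq_length):
    label_index = {label: i for i, label in enumerate(label_list)}
    predict_mask, one_hot_labels = [], []
    for tag, n_pieces in zip(word_tags, pieces_per_word):
        for i in range(n_pieces):
            predict_mask.append(1 if i == 0 else 0)            # first piece carries the tag
            one_hot = np.zeros(len(label_list), dtype=np.float32)
            one_hot[label_index[tag]] = 1.0
            one_hot_labels.append(one_hot)
    pad = max_seq_length - len(predict_mask)                   # pad to a fixed length
    predict_mask += [0] * pad
    one_hot_labels += [np.zeros(len(label_list), dtype=np.float32)] * pad
    return predict_mask, one_hot_labels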