def main(): """Main training program.""" print('Generate Samples') # Disable CuDNN. torch.backends.cudnn.enabled = False # Timer. timers = Timers() # Arguments. args = get_args() # Pytorch distributed. initialize_distributed(args) # Random seeds for reproducability. set_random_seed(args.seed) #get the tokenizer tokenizer = GPT2Tokenizer( os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model')) # Model model = setup_model(args) #setting default batch size to 1 args.batch_size = 1 #generate samples generate_samples(model, tokenizer, args, torch.cuda.current_device())
def __init__(self, cache_dir=None):
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                   cache_dir=cache_dir)
    self.tokenizer.max_len = int(1e12)
    self.eod_token = self.tokenizer.encoder['<|endoftext|>']
    assert self.eod_token < 65535, 'vocab size will not fit in uint16'
    print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
        len(self.tokenizer.encoder), self.eod_token))
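# Illustrative sketch (not part of the original file): the assert above exists
# because token ids are typically stored on disk as 2-byte unsigned integers,
# so every id, including the eod token, must fit in uint16. The helper below
# is hypothetical and only demonstrates that packing scheme.
import numpy as np

def pack_documents_uint16(token_id_docs, eod_token):
    """Concatenate documents, separated by the eod token, into a uint16 array."""
    flat = []
    for doc in token_id_docs:
        flat.extend(doc)
        flat.append(eod_token)
    return np.array(flat, dtype=np.uint16)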
def make_gpt2_dataloaders(args):

    # Input parameters.
    input_data_sizes_file = args.input_data_sizes_file
    seq_length = args.seq_length
    initial_seed = args.seed

    # Data parallel arguments.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    global_batch_size = args.batch_size * world_size
    num_workers = args.num_workers

    def make_data_loader_(data_path):
        # Build the dataset.
        dataset = GPT2Dataset(data_path, input_data_sizes_file,
                              seq_length, initial_seed)
        # Use a simple sampler with distributed batch sampler.
        sampler = torch.utils.data.SequentialSampler(dataset)
        batch_sampler = DistributedBatchSampler(sampler=sampler,
                                                batch_size=global_batch_size,
                                                drop_last=True,
                                                rank=rank,
                                                world_size=world_size)
        # Torch dataloader.
        return torch.utils.data.DataLoader(dataset,
                                           batch_sampler=batch_sampler,
                                           num_workers=num_workers,
                                           pin_memory=True)

    train = make_data_loader_(args.train_data_path)
    valid = make_data_loader_(args.val_data_path)
    test = make_data_loader_(args.test_data_path)

    args.do_train = train is not None
    args.do_valid = valid is not None
    args.do_test = test is not None

    # Tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                              cache_dir=args.cache_dir)
    eod_token = tokenizer.encoder['<|endoftext|>']
    num_tokens = eod_token + 1

    return (train, valid, test), num_tokens, eod_token
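# Minimal sketch of the idea behind the DistributedBatchSampler used above
# (assumed behavior, not the original implementation): each data-parallel rank
# takes its contiguous slice of every global batch of indices.
def shard_global_batch(global_batch_indices, rank, world_size):
    per_rank = len(global_batch_indices) // world_size
    start = rank * per_rank
    return global_batch_indices[start:start + per_rank]

# e.g. with a global batch of 8 indices and world_size 4,
# rank 1 receives indices [2, 3].
shard_global_batch(list(range(8)), rank=1, world_size=4)  # -> [2, 3]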
def main(): """Main training program.""" # Disable CuDNN. torch.backends.cudnn.enabled = False # Arguments. args = get_args() # Pytorch distributed. initialize_distributed(args) # Random seeds for reproducability. set_random_seed(args.seed) #get the tokenizer tokenizer = GPT2Tokenizer(os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model')) # load data assert args.eval_data_path is not None device = torch.cuda.current_device() args.eod_token = tokenizer.encoder['<eod>'] # Model args.parallel_output = True model = setup_model(args) if args.task == "ocnli": dev_dataloader = load_ocnli_data(args.eval_data_path, 'dev', tokenizer) evaluate_ocnli(model, dev_dataloader, device, args) elif args.task == "iflytek": dev_dataloader, all_labels = load_iflytek_data(args.eval_data_path, 'dev', tokenizer) evaluate(model, dev_dataloader, all_labels, device, args) elif args.task == "tnews": dev_dataloader, all_labels = load_tnews_data(args.eval_data_path, 'dev', tokenizer) evaluate(model, dev_dataloader, all_labels, device, args) else: print("Unknown task!")
def main(): """Main training program.""" # Disable CuDNN. torch.backends.cudnn.enabled = False # Timer. timers = Timers() # Arguments. args = get_args() # Pytorch distributed. initialize_distributed(args) # Random seeds for reproducability. set_random_seed(args.seed) # get the tokenizer tokenizer = GPT2Tokenizer( os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model')) # load data test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1) # Set an arbitrary positive integer since the optimizer and the scheduler will not be used when do eval. args.train_iters = 1 # Model model, _, _ = setup_model_and_optimizer(args) device = torch.cuda.current_device() # give a time stemp to the model cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) results_dir = os.path.join(args.results_dir, "{}-{}".format(args.model_name, cur_time)) if torch.distributed.get_rank() == 0: os.makedirs(results_dir, exist_ok=True) model.eval() all_sids = [] all_cids = [] all_losses = [] with torch.no_grad(): for batch, no_model_batch in tqdm( test_dataloader, desc="Evaluating", disable=(torch.distributed.get_rank() != 0)): for k in batch: batch[k] = batch[k].to(device) for k in no_model_batch: no_model_batch[k] = no_model_batch[k].to(device) output = model(**batch) losses = mpu.vocab_parallel_cross_entropy( output.contiguous().float(), no_model_batch["labels"]) loss_mask = no_model_batch["loss_mask"] loss = torch.sum(losses * loss_mask, dim=-1) / loss_mask.sum(dim=-1) loss_tensor_list = [ torch.zeros_like(loss).to(device) for _ in range(mpu.get_data_parallel_world_size()) ] torch.distributed.all_gather(loss_tensor_list, loss.data, group=mpu.get_data_parallel_group()) all_losses.extend(loss_tensor_list) sids = no_model_batch["sids"] sid_tensor_list = [ torch.zeros_like(sids) for _ in range(mpu.get_data_parallel_world_size()) ] torch.distributed.all_gather(sid_tensor_list, sids.data, group=mpu.get_data_parallel_group()) all_sids.extend(sid_tensor_list) cids = no_model_batch["cids"] cid_tensor_list = [ torch.zeros_like(cids) for _ in range(mpu.get_data_parallel_world_size()) ] torch.distributed.all_gather(cid_tensor_list, cids.data, group=mpu.get_data_parallel_group()) all_cids.extend(cid_tensor_list) if torch.distributed.get_rank() == 0: all_losses = torch.stack(all_losses).view(-1).cpu().detach().numpy() all_sids = torch.stack(all_sids).view(-1).cpu().detach().numpy() all_cids = torch.stack(all_cids).view(-1).cpu().detach().numpy() truth_labels = test_dataset.truth_labels preds = [[] for _ in truth_labels] for sid, cid, loss in zip(all_sids, all_cids, all_losses): preds[sid].append((cid, loss)) preds = [min(p, key=lambda x: x[1])[0] for p in preds if len(p) > 0] yprint("Acc: {}".format( sum([int(p == l) for p, l in zip(preds, truth_labels)]) / len(truth_labels))) with open(os.path.join(results_dir, "zero-shot_result.txt"), "w") as f: f.write("Acc: {}\n".format( sum([int(p == l) for p, l in zip(preds, truth_labels)]) / len(truth_labels))) torch.distributed.barrier()
parser.add_argument("--data_dir", default=None, type=str, help="The input dir of original ChID data.") parser.add_argument("--tokenizer_path", type=str, help="The tokenizer path.", default="./bpe_3w_new") parser.add_argument("--output_dir", type=str, help="The processed data output dir.") args = parser.parse_args() tokenizer = GPT2Tokenizer( os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model')) os.makedirs(args.output_dir, exist_ok=True) # for split in ["train", "dev", "test"]: for split in ["train", "dev"]: with open(os.path.join(args.data_dir, "{}.json".format(split)), "r") as f: lines = f.readlines() # with open(os.path.join(args.data_dir, "{}_answer.json".format(split)), "r") as f: # ans_d = json.load(f) num_ids, all_data = preprocess(lines, tokenizer, split) # num_ids, all_data = preprocess((lines, ans_d), tokenizer, split)
def main(): """Main training program.""" # Disable CuDNN. torch.backends.cudnn.enabled = False # Timer. timers = Timers() # Arguments. args = get_args() # Pytorch distributed. initialize_distributed(args) # Random seeds for reproducability. set_random_seed(args.seed) # get the tokenizer tokenizer = GPT2Tokenizer(os.path.join(args.tokenizer_path, 'vocab.json'), os.path.join(args.tokenizer_path, 'chinese_vocab.model')) # load train data if args.do_train: train_dataloader, _ = load_data(args, 'train', tokenizer, 1) dev_dataloader, dev_dataset = load_data(args, 'dev', tokenizer, 1) with open(args.deepspeed_config, "r") as f: deepspeed_conf = json.load(f) epoch = args.epoch grad_acc = deepspeed_conf["gradient_accumulation_steps"] args.train_iters = len(train_dataloader) * epoch / grad_acc # Model, optimizer, and learning rate. # TODO: maybe need to reinitialize optimizer elif args.do_eval: # Set an arbitrary positive integer since the optimizer and the scheduler will not be used when do eval. args.train_iters = 1 model, optimizer, lr_scheduler = setup_model_and_optimizer_C(args) device = torch.cuda.current_device() # give a time stemp to the model cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) results_dir = os.path.join(args.results_dir, "{}-{}".format(args.model_name, cur_time)) os.makedirs(results_dir, exist_ok=True) if args.do_train and torch.distributed.get_rank() == 0: with open(os.path.join(results_dir, "train_log.txt"), "w") as f: f.write("Train losses:\n") with open(os.path.join(results_dir, "dev_log.txt"), "w") as f: f.write("Dev accs:\n") torch.distributed.barrier() if args.do_train: # cand_ids = torch.tensor(dev_dataset.cand_ids).to(device) total_loss, logging_loss, best_acc = 0.0, 0.0, 0.0 global_step, total_step, best_step = 0, 0, 0 for e in range(epoch): model.train() for batch, no_model_batch in tqdm(train_dataloader, disable=(torch.distributed.get_rank() != 0)): for k in batch: batch[k] = batch[k].to(device) for k in no_model_batch: no_model_batch[k] = no_model_batch[k].to(device) output = model(**batch) # get the loss of the last token output = torch.sum(output * no_model_batch["loss_mask"].unsqueeze(-1), 1) / torch.sum(no_model_batch["loss_mask"], -1).unsqueeze(-1) # get the label of the last token # labels = no_model_batch["labels"].float() labels = no_model_batch["truth"].float() # labels = (torch.sum(labels * no_model_batch["loss_mask"], 1) / torch.sum(no_model_batch["loss_mask"], -1)).long() # cross_entropy loss # losses = mpu.vocab_parallel_cross_entropy(output.unsqueeze(1).contiguous().float(), labels.unsqueeze(1)) losses = CrossEntropyLoss(output.unsqueeze(1).contiguous().float(), labels.unsqueeze(1)) loss = torch.mean(losses) model.backward(loss) model.step() torch.distributed.all_reduce(loss.data, group=mpu.get_data_parallel_group()) loss.data = loss.data / mpu.get_data_parallel_world_size() total_loss += loss.item() / grad_acc if total_step % grad_acc == 0: global_step += 1 if global_step != 0 and global_step % args.log_interval == 0: # logging if torch.distributed.get_rank() == 0: train_log = "Epoch {}, global step {}, total step {}, train lm loss: {}".format(e, global_step, epoch * len(train_dataloader), (total_loss - logging_loss) / args.log_interval) yprint(train_log) with open(os.path.join(results_dir, "train_log.txt"), "a") as f: f.write(train_log + "\n") logging_loss = total_loss if global_step != 0 and global_step % args.eval_interval == 0: # evaluate on the dev acc, _, _ = evaluate_tnews(args, model, dev_dataloader, device, mode="dev") 
dev_results_dir = os.path.join(results_dir, "dev_step-{}".format(global_step)) if acc > best_acc: best_acc = acc best_step = global_step if torch.distributed.get_rank() == 0: # we will only write the log file once dev_log = "Epoch: {}, Global step: {}, Acc: {}".format(e, global_step, acc) yprint(dev_log) os.makedirs(dev_results_dir, exist_ok=True) with open(os.path.join(dev_results_dir, "dev_result.txt"), "w") as f: f.write(dev_log + "\n") with open(os.path.join(results_dir, "dev_log.txt"), "a") as f: f.write(dev_log + "\n") torch.distributed.barrier() args.save = dev_results_dir save_checkpoint(global_step, model, optimizer, lr_scheduler, args) total_step += 1 with open(os.path.join(dev_results_dir, "dev_log.txt"), "a") as f: f.write("Best acc: {} Best step: {}\n".format(best_acc, best_step)) if args.do_eval: # evaluate on the test test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1) cand_ids = torch.tensor(test_dataset.cand_ids).to(device) if args.do_train: # if do training, then evaluate the one with the max acc on dev set. eval_ckpt_path = os.path.join(results_dir, "dev_step-{}".format(best_step)) args.load = eval_ckpt_path else: # if only do eval, then evaluate the one specified by the user. args.load = args.eval_ckpt_path load_checkpoint(model=model, optimizer=None, lr_scheduler=None, args=args) acc, _, _ = evaluate(args, model, test_dataloader, cand_ids, device, mode="test") if torch.distributed.get_rank() == 0: eval_log = "Checkpoint from {}: Acc: {}".format(args.load, acc) yprint(eval_log) with open(os.path.join(results_dir, "eval_log"), "w") as f: f.write(eval_log + "\n") torch.distributed.barrier()
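# Toy sketch of the step bookkeeping in the training loop above: with gradient
# accumulation, the optimizer step counter (global_step) advances once every
# grad_acc micro-batches (total_step). The ordering mirrors the original loop,
# which checks the counter before incrementing total_step.
grad_acc = 4
global_step, total_step = 0, 0
for _ in range(12):              # 12 micro-batches
    if total_step % grad_acc == 0:
        global_step += 1
    total_step += 1
print(global_step)               # -> 3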