def __init__(self,
             actor_critic,
             clip_param,
             ppo_epoch,
             num_mini_batch,
             value_loss_coef,
             entropy_coef,
             lr=None,
             eps=None,
             max_grad_norm=None,
             use_clipped_value_loss=True,
             optimizer='adam',
             beta1=0.0,
             beta2=0.999):
    """Set up a PPO-style agent and its optimizer.

    Args:
        actor_critic: policy/value network whose parameters are optimized.
        clip_param: PPO surrogate-objective clipping parameter.
        ppo_epoch: number of optimization epochs per rollout.
        num_mini_batch: number of mini-batches per epoch.
        value_loss_coef: weight of the value-function loss term.
        entropy_coef: weight of the entropy bonus.
        lr: learning rate forwarded to the chosen optimizer.
        eps: numerical-stability epsilon (Adam/Lamb only).
        max_grad_norm: gradient-clipping threshold; None disables clipping.
        use_clipped_value_loss: whether to clip the value loss as in PPO.
        optimizer: one of 'adam', 'lamb', 'sgd', 'nero'.
        beta1, beta2: momentum coefficients.  BUG FIX: these were
            previously accepted but ignored — the optimizers were
            hard-coded to betas=(0.0, 0.999) (the original comment even
            said "betas not passed to optimizers").  They are now
            forwarded; the defaults reproduce the old behavior exactly.
            For SGD, beta1 doubles as the momentum coefficient (old
            hard-coded value was 0.0, matching the default).

    Raises:
        ValueError: if `optimizer` is not a recognized name (the old code
            silently left `self.optimizer` unset, deferring the failure
            to the first use).
    """
    self.actor_critic = actor_critic
    self.clip_param = clip_param
    self.ppo_epoch = ppo_epoch
    self.num_mini_batch = num_mini_batch
    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef
    self.max_grad_norm = max_grad_norm
    self.use_clipped_value_loss = use_clipped_value_loss

    if optimizer == 'adam':
        print("using adam optimizer!")
        self.optimizer = optim.Adam(actor_critic.parameters(),
                                    lr=lr,
                                    eps=eps,
                                    betas=(beta1, beta2))
    elif optimizer == 'lamb':
        print("using lamb optimizer!")
        self.optimizer = Lamb(actor_critic.parameters(),
                              lr=lr,
                              eps=eps,
                              betas=(beta1, beta2))
    elif optimizer == 'sgd':
        print("using SGD optimizer!")
        self.optimizer = optim.SGD(actor_critic.parameters(),
                                   lr=lr,
                                   momentum=beta1)
    elif optimizer == 'nero':
        print("using nero optimizer!")
        self.optimizer = Nero(actor_critic.parameters(), lr=lr)
    else:
        raise ValueError("unknown optimizer: {!r}".format(optimizer))
def create_optimizer_and_scheduler(self, num_training_steps: int):
    """
    Setup the optimizer and the learning rate scheduler.

    We provide a reasonable default that works well. If you want to use
    something else, you can pass a tuple in the Trainer's init through
    :obj:`optimizers`, or subclass and override this method in a subclass.
    """
    if self.optimizer is None:
        # Biases and LayerNorm weights are conventionally excluded from
        # weight decay; split the parameters into two groups accordingly
        # in a single pass over named_parameters().
        no_decay = ("bias", "LayerNorm.weight")
        decayed, undecayed = [], []
        for param_name, param in self.model.named_parameters():
            if any(marker in param_name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.args.weight_decay},
            {"params": undecayed, "weight_decay": 0.0},
        ]

        if self.args.optimizer_str == "adamw":
            self.optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=self.args.learning_rate,
                betas=(self.args.adam_beta1, self.args.adam_beta2),
                eps=self.args.adam_epsilon,
            )
        elif self.args.optimizer_str == "lamb":
            # Lamb manages its own betas; only lr/eps are forwarded.
            self.optimizer = Lamb(optimizer_grouped_parameters,
                                  lr=self.args.learning_rate,
                                  eps=self.args.adam_epsilon)
        else:
            raise NotImplementedError("Optimizer must be adamw or lamb")

    if self.lr_scheduler is None:
        # Linear warmup followed by linear decay to zero.
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps)
def model_fn(config):
    """Build the ALBERT pretraining graph.

    Returns a triple ``(classifier_model, albert_model, lamb_optimizer)``:
    the full pretraining model (masked-LM head + next-sentence head), the
    bare encoder, and a Lamb optimizer instance.
    """
    seq_len = config['max_seq_length']
    input_ids = keras.Input(shape=(seq_len, ), name='input_ids')
    attention_mask = keras.Input(shape=(seq_len, ), name='attention_mask')
    segment_ids = keras.Input(shape=(seq_len, ), name='segment_ids')
    # The same named inputs feed both functional models below.
    model_inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'segment_ids': segment_ids,
    }

    albert = Bert(config)
    first_tokens, output = albert([input_ids, attention_mask, segment_ids])
    # Bare encoder: pooled first-token representation + sequence output.
    albert_model = keras.Model(inputs=model_inputs,
                               outputs=[first_tokens, output])

    # Masked-LM head; ties into the embedding matrix and projection.
    lm_predictor = PreTrainLMPredictor(
        config['input_hidden'],
        config['vocab_size'],
        seq_len,
    )
    lm_predict_outputs = lm_predictor(
        [output, albert.embedding.embeddings, albert.projection])

    # Binary next-sentence-prediction head over the pooled first token.
    next_sentence_predictor = PreTrainNextSentencePredictor(2)
    next_sentence_predict_outputs = next_sentence_predictor(first_tokens)

    classifier_model = keras.Model(
        inputs=model_inputs,
        outputs=[lm_predict_outputs, next_sentence_predict_outputs])

    # Optimizer
    lamb_optimizer = Lamb()

    return classifier_model, albert_model, lamb_optimizer
def _build_optimizer(net, use_dampening=False):
    """Construct the optimizer named by ``args.optimizer`` for ``net``.

    use_dampening: when True, forward ``args.damping`` to LARS (the
        lr-decay restart path did this in the original code; the initial
        path did not).  ``args.damping`` is only read inside the LARS
        branch so other optimizers never require it.
    Unrecognized names fall back to SGD with momentum.
    """
    name = args.optimizer.lower()
    if name == 'sgd':
        # Plain 'sgd' deliberately carries no momentum.
        return optim.SGD(net.parameters(), lr=args.lr,
                         weight_decay=args.weight_decay)
    if name == 'sgdwm':
        return optim.SGD(net.parameters(), lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    if name == 'adam':
        return optim.Adam(net.parameters(), lr=args.lr,
                          weight_decay=args.weight_decay)
    if name == 'rmsprop':
        return optim.RMSprop(net.parameters(), lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=args.weight_decay)
    if name == 'adagrad':
        return optim.Adagrad(net.parameters(), lr=args.lr,
                             weight_decay=args.weight_decay)
    if name == 'radam':
        from radam import RAdam
        return RAdam(net.parameters(), lr=args.lr,
                     weight_decay=args.weight_decay)
    if name == 'lars':  # no tensorboardX
        from lars import LARS
        if use_dampening:
            return LARS(net.parameters(), lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        dampening=args.damping)
        return LARS(net.parameters(), lr=args.lr,
                    momentum=args.momentum,
                    weight_decay=args.weight_decay)
    if name == 'lamb':
        from lamb import Lamb
        return Lamb(net.parameters(), lr=args.lr,
                    weight_decay=args.weight_decay)
    if name == 'novograd':
        from novograd import NovoGrad
        return NovoGrad(net.parameters(), lr=args.lr,
                        weight_decay=args.weight_decay)
    # Fallback: momentum SGD.
    return optim.SGD(net.parameters(), lr=args.lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay)


def main(lr=0.1):
    """Train ResNet50 on CIFAR-10 with the optimizer named by args.optimizer.

    Trains for 200 epochs, reloading the best checkpoint and dividing the
    learning rate by 10 at each epoch listed in ``args.lr_decay``.
    Accuracy curves are dumped to '<optimizer><lr>log.json'.

    Returns:
        The best test accuracy (percentage) reached.

    BUG FIX: the original optimizer-selection chain began with two
    independent ``if`` statements ('sgd' then 'sgdwm') followed by the
    ``elif``/``else`` chain, so selecting 'sgd' also fell through to the
    final ``else`` and the momentum-free optimizer was silently replaced
    by momentum SGD.  The duplicated (and equally broken) chain inside
    the epoch loop had the same bug.  Both are replaced by a single
    dispatch helper, `_build_optimizer`.
    """
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2)
    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # Model
    print('==> Building model..')
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()
    optimizer = _build_optimizer(net)

    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        """Run one training epoch, appending the epoch accuracy to train_acc."""
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        """Evaluate on the test set; checkpoint whenever accuracy improves."""
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)
        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            # Restart from the best checkpoint with a 10x smaller lr and a
            # freshly built optimizer (the restart path forwards
            # args.damping to LARS, matching the original code).
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            optimizer = _build_optimizer(net, use_dampening=True)
        train(epoch)
        test(epoch)

    # `with` guarantees the log file is closed (the original leaked the
    # handle).
    with open(args.optimizer + str(lr) + 'log.json', 'w+') as log_file:
        json.dump([train_acc, valid_acc], log_file)
    return best_acc
# NUM_LAYERS = 3 NUM_DIRECTIONS = 2 # DROPOUT = 0.3 device = torch.device('cuda') model = DocumentReader(args.hidden_dim, args.emb_dim, args.layers, NUM_DIRECTIONS, args.dropout, device).to(device) model = torch.nn.DataParallel(model) writer = SummaryWriter(comment="_nlp_%s_%s_%s" % (args.optimizer, args.batch_size, args.learning_rate)) weight_decay = args.learning_rate / args.epochs if args.optimizer == 'lamb': optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay, betas=(.9, .999), adam=False, writer=writer) elif args.optimizer == 'lars': base_optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=weight_decay) optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001, writer=writer) elif args.optimizer == 'sgd': optimizer = SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=weight_decay, writer=writer) else: # use adam optimizer optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay, betas=(.9, .999), adam=True, writer=writer) print(f'The model has {count_parameters(model):,} trainable parameters') ckpt_dir_name = "%s_%s_%s" % (args.working_dir, args.optimizer, args.batch_size) model, optimizer = load_pretrained_model(model, optimizer, "%s/ckpt/%s" % (ckpt_dir_name, "best_weights.pt"))
if __name__ == "__main__":
    # Smoke-test script: build a dual-encoder, round-trip it through
    # save/load, create a Lamb optimizer, and tokenize a few
    # German/English sentence pairs.
    # Alternative setups kept from the original:
    # tokenizer = RobertaTokenizerFast.from_pretrained(
    #     "roberta-large" if not bool(int(os.environ.get("ROBERTA"))) else "xlm-roberta-base")
    # model = DualEncoderPerformer(num_tokens=tokenizer.vocab_size, max_seq_len=512, dim=512, depth=6, heads=8)
    # Candidate pretrained checkpoints:
    # distilbert-base-multilingual-cased
    # distilroberta-base
    # DeepPavlov/bert-base-multilingual-cased-sentence
    # bert-base-multilingual-cased
    tokenizer = AutoTokenizer.from_pretrained(
        "distilbert-base-multilingual-cased")
    model = DualEncoder()
    # Round-trip save/load to exercise (de)serialization.
    model.save_pretrained("./results/test.bin")
    model = DualEncoder.from_pretrained("./results/test.bin")
    optimizer = Lamb(model.parameters(), lr=0.001)  # Lamb
    # Parallel German/English batches; padded to equal length per batch.
    sentence1_tensor = tokenizer(
        ["Ich bin Andre", "Ich brauche hilfe", "Du magst tanzen?"],
        add_special_tokens=True,
        return_tensors="pt",
        padding=True)
    sentence2_tensor = tokenizer(
        ["I am Andre", "I need support", "do you like dancing?"],
        add_special_tokens=True,
        return_tensors="pt",
        padding=True)
    sentence1_test = tokenizer(["Ich bin Andre", "Ich bin Andre"],
                               add_special_tokens=True,
                               return_tensors="pt",
                               padding=True)
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.wd) elif args.optimizer == 'adam': print("using adam!") optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(args.momentum, args.beta), weight_decay=args.wd) elif args.optimizer == 'lamb': print("using lamb!") optimizer = Lamb(net.parameters(), lr=args.lr, betas=(args.momentum, args.beta), weight_decay=args.wd) elif args.optimizer == 'lambcs': print("using lambcs!") optimizer = LambCS(net.parameters(), lr=args.lr, betas=(args.momentum, args.beta), weight_decay=args.wd, constraints=True) elif args.optimizer == 'madam': print("using madam!") optimizer = Madam(net.parameters(), lr=args.lr) elif args.optimizer == 'madamcs': print("using madamcs!")
# Select the optimizer by (case-insensitive) name; unrecognized names fall
# back to plain SGD with a fixed lr of 0.01 (ignoring args.lr).
if args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    # SGD with momentum.
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    # NOTE(review): novograd is the only branch that applies weight decay
    # — confirm this asymmetry is intentional.
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)
optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'
# log = open(optname+'log.txt','w+')
log = None  # file logging disabled; training_loop receives None
criterion = nn.CrossEntropyLoss()
model, optimizer, _ = training_loop(model, criterion, optimizer, train_loader,
                                    valid_loader, N_EPOCHS, DEVICE, log)
def train(args, model, tokenizer, query_cache, passage_cache):
    """ Train the model

    ANN-style training loop: repeatedly polls ``args.ann_dir`` for freshly
    generated training data, streams it through a DataLoader, and optimizes
    with Lamb or AdamW, supporting gradient accumulation, fp16 (apex),
    DataParallel and DistributedDataParallel.  Checkpoints model/optimizer/
    scheduler every ``args.save_steps`` on the first worker.

    Returns:
        The final global optimization step.
    """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Effective batch size across accumulation steps and all workers.
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []
    for layer_name in [
            "roberta.embeddings", "score_out", "downsample1", "downsample2",
            "downsample3"
    ]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
    if getattr_recursive(model, "roberta.encoder.layer") is not None:
        for layer in model.roberta.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
    if len(optimizer_grouped_parameters) == 0:
        # Fallback when the model exposes none of the expected layers:
        # standard two-group split with decay disabled for biases and
        # LayerNorm weights.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0
            },
        ]

    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(
            os.path.join(args.model_name_or_path,
                         "optimizer.pt")) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Max steps = %d", args.max_steps)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)

    global_step = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
        if "-" in args.model_name_or_path:
            # Checkpoint dirs are named "checkpoint-<step>".
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        else:
            global_step = 0
        logger.info(
            " Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info(" Continuing training from global step %d", global_step)

    tr_loss = 0.0
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproductibility
    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0

    if args.single_warmup:
        # One warmup schedule for the whole run; otherwise re-warm on each
        # new ANN data drop (see below).
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=args.max_steps)

    while global_step < args.max_steps:
        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:
            # check if new ann training data is availabe
            ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)
            if ann_path is not None and ann_no != last_ann_no:
                logger.info("Training on new add data at %s", ann_path)
                with open(ann_path, 'r') as f:
                    ann_training_data = f.readlines()
                dev_ndcg = ndcg_json['ndcg']
                ann_checkpoint_path = ndcg_json['checkpoint']
                ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)

                # Truncate so every worker sees the same number of lines.
                aligned_size = (len(ann_training_data) //
                                args.world_size) * args.world_size
                ann_training_data = ann_training_data[:aligned_size]

                logger.info("Total ann queries: %d", len(ann_training_data))
                if args.triplet:
                    train_dataset = StreamingDataset(
                        ann_training_data,
                        GetTripletTrainingDataProcessingFn(
                            args, query_cache, passage_cache))
                else:
                    train_dataset = StreamingDataset(
                        ann_training_data,
                        GetTrainingDataProcessingFn(args, query_cache,
                                                    passage_cache))
                train_dataloader = DataLoader(train_dataset,
                                              batch_size=args.train_batch_size)
                train_dataloader_iter = iter(train_dataloader)

                # re-warmup
                if not args.single_warmup:
                    scheduler = get_linear_schedule_with_warmup(
                        optimizer,
                        num_warmup_steps=args.warmup_steps,
                        num_training_steps=len(ann_training_data))

                if args.local_rank != -1:
                    dist.barrier()

                if is_first_worker():
                    # add ndcg at checkpoint step used instead of current step
                    tb_writer.add_scalar("dev_ndcg", dev_ndcg,
                                         ann_checkpoint_no)
                    if last_ann_no != -1:
                        tb_writer.add_scalar("epoch", last_ann_no,
                                             global_step - 1)
                    tb_writer.add_scalar("epoch", ann_no, global_step)
                last_ann_no = ann_no

        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            # Dataset exhausted before max_steps: restart the iterator.
            logger.info("Finished iterating current dataset, begin reiterate")
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)

        batch = tuple(t.to(args.device) for t in batch)
        step += 1
        if args.triplet:
            # Triplet batches: query / positive passage / negative passage.
            inputs = {
                "query_ids": batch[0].long(),
                "attention_mask_q": batch[1].long(),
                "input_ids_a": batch[3].long(),
                "attention_mask_a": batch[4].long(),
                "input_ids_b": batch[6].long(),
                "attention_mask_b": batch[7].long()
            }
        else:
            inputs = {
                "input_ids_a": batch[0].long(),
                "attention_mask_a": batch[1].long(),
                "input_ids_b": batch[3].long(),
                "attention_mask_b": batch[4].long(),
                "labels": batch[6]
            }

        # sync gradients only at gradient accumulation step
        if step % args.gradient_accumulation_steps == 0:
            outputs = model(**inputs)
        else:
            with model.no_sync():
                outputs = model(**inputs)
        loss = outputs[
            0]  # model outputs are always tuple in transformers (see doc)

        if args.n_gpu > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                # Skip gradient all-reduce on accumulation sub-steps.
                with model.no_sync():
                    loss.backward()

        tr_loss += loss.item()
        if step % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                tr_loss = 0

                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

            if is_first_worker(
            ) and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          "checkpoint-{}".format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = (
                    model.module if hasattr(model, "module") else model
                )  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s",
                            output_dir)

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step
def __init__(self, G_ch=64, dim_z=128, bottom_width=4, resolution=128,
             G_kernel_size=3, G_attn='64', n_classes=1000,
             num_G_SVs=1, num_G_SV_itrs=1,
             G_shared=True, shared_dim=0, hier=False,
             cross_replica=False, mybn=False,
             G_activation=nn.ReLU(inplace=False),
             G_lr=5e-5, G_B1=0.0, G_B2=0.999, adam_eps=1e-8,
             BN_eps=1e-5, SN_eps=1e-12, G_mixed_precision=False, G_fp16=False,
             G_init='ortho', skip_init=False, no_optim=False,
             G_param='SN', norm_style='bn', lamb=False,
             **kwargs):
    """Build the Generator: architecture, weight init, and optimizer.

    Constructs the latent->image generator (GBlocks plus optional
    attention per resolution), optionally initializes weights, and — unless
    `no_optim` — attaches an optimizer (fp16 Adam, Lamb, or Adam per the
    `G_mixed_precision`/`lamb` flags).
    NOTE(review): `G_activation=nn.ReLU(inplace=False)` is a mutable
    default shared by all instances — harmless for a stateless ReLU, but
    worth knowing.
    """
    super(Generator, self).__init__()
    # Channel width mulitplier
    self.ch = G_ch
    # Dimensionality of the latent space
    self.dim_z = dim_z
    # The initial spatial dimensions
    self.bottom_width = bottom_width
    # Resolution of the output
    self.resolution = resolution
    # Kernel size?
    self.kernel_size = G_kernel_size
    # Attention?
    self.attention = G_attn
    # number of classes, for use in categorical conditional generation
    self.n_classes = n_classes
    # Use shared embeddings?
    self.G_shared = G_shared
    # Dimensionality of the shared embedding? Unused if not using G_shared
    self.shared_dim = shared_dim if shared_dim > 0 else dim_z
    # Hierarchical latent space?
    self.hier = hier
    # Cross replica batchnorm?
    self.cross_replica = cross_replica
    # Use my batchnorm?
    self.mybn = mybn
    # nonlinearity for residual blocks
    self.activation = G_activation
    # Initialization style
    self.init = G_init
    # Parameterization style
    self.G_param = G_param
    # Normalization style
    self.norm_style = norm_style
    # Epsilon for BatchNorm?
    self.BN_eps = BN_eps
    # Epsilon for Spectral Norm?
    self.SN_eps = SN_eps
    # fp16?
    self.fp16 = G_fp16
    # Architecture dict
    self.arch = G_arch(self.ch, self.attention)[resolution]

    # If using hierarchical latents, adjust z
    if self.hier:
        # Number of places z slots into
        self.num_slots = len(self.arch['in_channels']) + 1
        self.z_chunk_size = (self.dim_z // self.num_slots)
        # Recalculate latent dimensionality for even splitting into chunks
        self.dim_z = self.z_chunk_size * self.num_slots
    else:
        self.num_slots = 1
        self.z_chunk_size = 0

    #print("G PARAM: {}".format(self.G_param))
    # Which convs, batchnorms, and linear layers to use
    if self.G_param == 'SN':
        # Spectral-norm wrapped conv/linear layers.
        self.which_conv = functools.partial(layers.SNConv2d,
                                            kernel_size=3, padding=1,
                                            num_svs=num_G_SVs,
                                            num_itrs=num_G_SV_itrs,
                                            eps=self.SN_eps)
        self.which_linear = functools.partial(layers.SNLinear,
                                              num_svs=num_G_SVs,
                                              num_itrs=num_G_SV_itrs,
                                              eps=self.SN_eps)
    else:
        self.which_conv = functools.partial(nn.Conv2d, kernel_size=3,
                                            padding=1)
        self.which_linear = nn.Linear

    # We use a non-spectral-normed embedding here regardless;
    # For some reason applying SN to G's embedding seems to randomly cripple G
    self.which_embedding = nn.Embedding
    bn_linear = (functools.partial(self.which_linear, bias=False)
                 if self.G_shared else self.which_embedding)
    self.which_bn = functools.partial(
        layers.ccbn,
        which_linear=bn_linear,
        cross_replica=self.cross_replica,
        mybn=self.mybn,
        input_size=(self.shared_dim + self.z_chunk_size
                    if self.G_shared else self.n_classes),
        norm_style=self.norm_style,
        eps=self.BN_eps)

    # Prepare model
    # If not using shared embeddings, self.shared is just a passthrough
    self.shared = (self.which_embedding(n_classes, self.shared_dim)
                   if G_shared else layers.identity())
    # First linear layer
    self.linear = self.which_linear(self.dim_z // self.num_slots,
                                    self.arch['in_channels'][0] *
                                    (self.bottom_width ** 2))

    # self.blocks is a doubly-nested list of modules, the outer loop intended
    # to be over blocks at a given resolution (resblocks and/or self-attention)
    # while the inner loop is over a given block
    self.blocks = []
    for index in range(len(self.arch['out_channels'])):
        self.blocks += [[
            layers.GBlock(
                in_channels=self.arch['in_channels'][index],
                out_channels=self.arch['out_channels'][index],
                which_conv=self.which_conv,
                which_bn=self.which_bn,
                activation=self.activation,
                upsample=(functools.partial(F.interpolate, scale_factor=2)
                          if self.arch['upsample'][index] else None))
        ]]
        # If attention on this block, attach it to the end
        if self.arch['attention'][self.arch['resolution'][index]]:
            print('Adding attention layer in G at resolution %d' %
                  self.arch['resolution'][index])
            self.blocks[-1] += [
                layers.Attention(self.arch['out_channels'][index],
                                 self.which_conv)
            ]

    # Turn self.blocks into a ModuleList so that it's all properly registered.
    self.blocks = nn.ModuleList(
        [nn.ModuleList(block) for block in self.blocks])

    # output layer: batchnorm-relu-conv.
    # Consider using a non-spectral conv here
    self.output_layer = nn.Sequential(
        layers.bn(self.arch['out_channels'][-1],
                  cross_replica=self.cross_replica,
                  mybn=self.mybn), self.activation,
        self.which_conv(self.arch['out_channels'][-1], 3))

    # Initialize weights. Optionally skip init for testing.
    if not skip_init:
        self.init_weights()

    # DEBUG / TODO: remove
    print("LAMB: {}".format(lamb))

    # Set up optimizer
    # If this is an EMA copy, no need for an optim, so just return now
    if no_optim:
        return
    self.lr, self.B1, self.B2, self.adam_eps = G_lr, G_B1, G_B2, adam_eps
    if G_mixed_precision:
        print('Using fp16 adam in G...')
        import utils
        self.optim = utils.Adam16(params=self.parameters(), lr=self.lr,
                                  betas=(self.B1, self.B2), weight_decay=0,
                                  eps=self.adam_eps)
    elif lamb:
        # Lamb with adam=False, i.e. the full layerwise-adaptive update.
        self.optim = Lamb(self.parameters(), lr=self.lr, weight_decay=0,
                          betas=(self.B1, self.B2), eps=self.adam_eps,
                          adam=False)
    else:
        self.optim = optim.Adam(params=self.parameters(), lr=self.lr,
                                betas=(self.B1, self.B2), weight_decay=0,
                                eps=self.adam_eps)
# Script-level training setup: dataloaders, Lamb optimizer, and metric
# buffers.  NOTE(review): this excerpt ends inside the epoch loop — the
# loop body continues beyond what is visible here.
n_data_test = len(dataset_test)
batch_size = 2**10  # 1024
dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=4)
dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=4)
print(n_data_train)
print(n_data_test)

lr = 3e-4
betas = (0.9, 0.999)
optimizer = Lamb(model.parameters(), lr=lr, betas=betas, weight_decay=0.1)
#optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=betas)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)#, betas=betas)

n_epochs = 1000
# Per-epoch metric buffers, kept on the compute device.
lossliste = torch.zeros(n_epochs).to(device)
accliste_train = torch.zeros(n_epochs).to(device)
accliste_test = torch.zeros(n_epochs).to(device)

param_idx = 0
print("gogogo")
for epoch in range(start_epoch, n_epochs + start_epoch):
    epochstart = dt.now().timestamp()
    total_loss = 0
    acc_train = 0
    acc_test = 0
def train(args, train_dataset, model_d, model_g, tokenizer):
    """Jointly train a generator (MLM) and a discriminator, ELECTRA-style.

    The generator ``model_g`` is trained with a masked-LM loss; tokens at
    masked positions are then resampled from the generator's softmax and
    spliced into the input, and the discriminator ``model_d`` is trained to
    tell original tokens (label 1) from replaced ones (label 0).  Both models
    use their own Lamb optimizer and linear warmup schedule.

    Args:
        args: parsed command-line namespace (batch sizes, fp16 flags,
            logging/saving intervals, distributed-training settings, ...).
        train_dataset: torch Dataset of pre-tokenized examples.
        model_d: discriminator model (token-level classification head).
        model_g: generator model (masked-LM head).
        tokenizer: tokenizer used for special-token / padding masks.

    Returns:
        Tuple ``(global_step, average_training_loss)``.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) //
                   args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare one (optimizer, scheduler) pair per model.  Weight decay is
    # disabled for biases and LayerNorm weights, as is conventional for
    # transformer fine-tuning.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_d_grouped_parameters = [
        {
            "params": [
                p for n, p in model_d.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model_d.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    # optimizer_d = AdamW(optimizer_d_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer_d = Lamb(optimizer_d_grouped_parameters,
                       lr=args.learning_rate,
                       betas=(0.9, 0.999),
                       eps=1e-6)
    scheduler_d = get_linear_schedule_with_warmup(
        optimizer_d,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    optimizer_g_grouped_parameters = [
        {
            "params": [
                p for n, p in model_g.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model_g.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    # optimizer_g = AdamW(optimizer_g_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer_g = Lamb(optimizer_g_grouped_parameters,
                       lr=args.learning_rate,
                       betas=(0.9, 0.999),
                       eps=1e-6)
    scheduler_g = get_linear_schedule_with_warmup(
        optimizer_g,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Restore optimizer/scheduler state when resuming from a checkpoint dir.
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer_d.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler_d.pt")):
        optimizer_d.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer_d.pt")))
        scheduler_d.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler_d.pt")))
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer_g.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler_g.pt")):
        optimizer_g.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer_g.pt")))
        scheduler_g.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler_g.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model_d, optimizer_d = amp.initialize(model_d,
                                              optimizer_d,
                                              opt_level=args.fp16_opt_level)
        model_g, optimizer_g = amp.initialize(model_g,
                                              optimizer_g,
                                              opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_d = torch.nn.DataParallel(model_d)
        model_g = torch.nn.DataParallel(model_g)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_d = torch.nn.parallel.DistributedDataParallel(
            model_d,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )
        model_g = torch.nn.parallel.DistributedDataParallel(
            model_g,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint: global_step is parsed
    # from a "...checkpoint-<step>" style path.
    if os.path.exists(args.model_name_or_path):
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info(
            " Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info(" Continuing training from epoch %d", epochs_trained)
        logger.info(" Continuing training from global step %d", global_step)
        logger.info(" Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    model_to_resize_d = model_d.module if hasattr(
        model_d, "module") else model_d  # Take care of distributed/parallel training
    # model_to_resize_d.resize_token_embeddings(len(tokenizer))
    model_to_resize_g = model_g.module if hasattr(
        model_g, "module") else model_g  # Take care of distributed/parallel training
    # model_to_resize_g.resize_token_embeddings(len(tokenizer))
    # model_to_resize_d.bert.embeddings = model_to_resize_g.bert.embeddings

    tr_loss, logging_loss = 0.0, 0.0
    tr_loss_d, logging_loss_d = 0.0, 0.0
    tr_loss_g, logging_loss_g = 0.0, 0.0
    model_d.zero_grad()
    model_g.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model_d.train()
            model_g.train()
            # batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet", "albert"]
                    else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

            # ---- Generator step: standard MLM on masked inputs ----
            masked_input_ids, mask_labels = mask_tokens(
                inputs['input_ids'], tokenizer, args)
            outputs_g = model_g(
                input_ids=masked_input_ids.to(args.device),
                masked_lm_labels=mask_labels.to(args.device),
                attention_mask=inputs['attention_mask'].to(args.device),
                token_type_ids=inputs['token_type_ids'].to(args.device))
            masked_lm_loss, prediction_scores_g = outputs_g[0], outputs_g[1]
            prediction_g = prediction_scores_g.max(dim=-1)[1].cpu()
            # Generator accuracy on masked positions only (mask_labels >= 0).
            acc_g = (prediction_g[mask_labels >= 0] == mask_labels[
                mask_labels >= 0]).float().mean().item()

            # Sample replacement tokens from the generator distribution and
            # splice them into the original inputs at the masked positions.
            prediction_probs_g = F.softmax(prediction_scores_g, dim=-1).cpu()
            bsz, seq_len, vocab_size = prediction_probs_g.size()
            prediction_samples_g = torch.multinomial(
                prediction_probs_g.view(-1, vocab_size), num_samples=1)
            prediction_samples_g = prediction_samples_g.view(bsz, seq_len)
            input_ids_replace = inputs['input_ids'].clone()
            input_ids_replace[mask_labels >= 0] = prediction_samples_g[
                mask_labels >= 0]

            # Discriminator labels: 1 = token unchanged, 0 = replaced;
            # special tokens and padding are excluded with -100.
            labels_d = input_ids_replace.eq(inputs['input_ids']).long()
            special_tokens_mask = [
                tokenizer.get_special_tokens_mask(
                    val, already_has_special_tokens=True)
                for val in inputs['input_ids'].tolist()
            ]
            labels_d.masked_fill_(torch.tensor(special_tokens_mask,
                                               dtype=torch.bool),
                                  value=-100)
            padding_mask = inputs['input_ids'].eq(tokenizer.pad_token_id)
            labels_d.masked_fill_(padding_mask, value=-100)
            labels_d_ones = labels_d[labels_d >= 0].float().mean().item()
            # Fraction of masked positions where the sampled token actually
            # differs from the original.
            acc_replace = 1 - ((labels_d == 0).sum().float() /
                               (mask_labels >= 0).sum().float()).item()

            # ---- Discriminator step: replaced-token detection ----
            outputs_d = model_d(
                input_ids=input_ids_replace.to(args.device),
                attention_mask=inputs['attention_mask'].to(args.device),
                token_type_ids=inputs['token_type_ids'].to(args.device),
                labels=labels_d.to(args.device))
            loss_d, prediction_scores_d = outputs_d[0], outputs_d[1]
            prediction_d = prediction_scores_d.max(dim=-1)[1].cpu()
            acc_d = (prediction_d[labels_d >= 0] == labels_d[labels_d >= 0]
                     ).float().mean().item()
            acc_d_0 = (prediction_d[labels_d == 0] == labels_d[labels_d == 0]
                       ).float().mean().item()
            acc_d_1 = (prediction_d[labels_d == 1] == labels_d[labels_d == 1]
                       ).float().mean().item()

            if args.n_gpu > 1:
                loss_d = loss_d.mean(
                )  # mean() to average on multi-gpu parallel training
                masked_lm_loss = masked_lm_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss_d = loss_d / args.gradient_accumulation_steps
                masked_lm_loss = masked_lm_loss / args.gradient_accumulation_steps

            # Discriminator loss weight (50x), as in the ELECTRA objective.
            lambd = 50
            loss = loss_d * lambd + masked_lm_loss
            if args.fp16:
                # Scale each loss through its own optimizer's loss scaler.
                loss_d = loss_d * lambd
                with amp.scale_loss(loss_d, optimizer_d) as scaled_loss_d:
                    scaled_loss_d.backward()
                with amp.scale_loss(masked_lm_loss,
                                    optimizer_g) as scaled_loss_g:
                    scaled_loss_g.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            tr_loss_d += loss_d.item()
            tr_loss_g += masked_lm_loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer_d), args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer_g), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_d.parameters(),
                                                   args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(model_g.parameters(),
                                                   args.max_grad_norm)
                optimizer_d.step()
                scheduler_d.step()  # Update learning rate schedule
                model_d.zero_grad()
                optimizer_g.step()
                scheduler_g.step()  # Update learning rate schedule
                model_g.zero_grad()

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    loss_scalar_d = (tr_loss_d -
                                     logging_loss_d) / args.logging_steps
                    loss_scalar_g = (tr_loss_g -
                                     logging_loss_g) / args.logging_steps
                    learning_rate_scalar_d = scheduler_d.get_lr()[0]
                    learning_rate_scalar_g = scheduler_g.get_lr()[0]
                    logs["learning_rate_d"] = learning_rate_scalar_d
                    logs["learning_rate_g"] = learning_rate_scalar_g
                    logs["loss"] = loss_scalar
                    logs["loss_d"] = loss_scalar_d
                    logs["loss_g"] = loss_scalar_g
                    # NOTE: "acc_repalce" typo is kept on purpose so existing
                    # tensorboard dashboards keep working.
                    logs["acc_repalce"] = acc_replace
                    logs["acc_d"] = acc_d
                    logs["acc_d_0"] = acc_d_0
                    logs["acc_d_1"] = acc_d_1
                    logs["acc_g"] = acc_g
                    logs["labels_d_ones"] = labels_d_ones
                    logs["masked_ratio"] = (mask_labels >= 0).float().sum(
                    ).item() / (labels_d >= 0).sum().float().item()
                    logging_loss = tr_loss
                    logging_loss_d = tr_loss_d
                    logging_loss_g = tr_loss_g
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint: one parent dir with per-model
                    # subdirectories for the discriminator and the generator.
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    output_dir_d = os.path.join(
                        output_dir, "checkpoint-d-{}".format(global_step))
                    output_dir_g = os.path.join(
                        output_dir, "checkpoint-g-{}".format(global_step))
                    if not os.path.exists(output_dir_d):
                        os.makedirs(output_dir_d)
                    if not os.path.exists(output_dir_g):
                        os.makedirs(output_dir_g)
                    model_to_save_d = (
                        model_d.module if hasattr(model_d, "module") else
                        model_d)  # Take care of distributed/parallel training
                    model_to_save_g = (
                        model_g.module if hasattr(model_g, "module") else
                        model_g)  # Take care of distributed/parallel training
                    model_to_save_d.save_pretrained(output_dir_d)
                    model_to_save_g.save_pretrained(output_dir_g)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer_d.state_dict(),
                               os.path.join(output_dir_d, "optimizer_d.pt"))
                    torch.save(scheduler_d.state_dict(),
                               os.path.join(output_dir_d, "scheduler_d.pt"))
                    # BUG FIX: generator optimizer/scheduler states were
                    # previously written into output_dir_d (the discriminator
                    # directory); save them into the generator directory so
                    # each sub-checkpoint is self-contained for resuming.
                    torch.save(optimizer_g.state_dict(),
                               os.path.join(output_dir_g, "optimizer_g.pt"))
                    torch.save(scheduler_g.state_dict(),
                               os.path.join(output_dir_g, "scheduler_g.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
mlp_dim=param, # 64, # Dimension des MLPs im Transformer transformer_dropout=0., #1, # Dropout des MLP im Transformer num_classes=num_classes, # Anzahl Klassen use_mlp=True # benutze MLP im Encoder ).to(device) #model = VisualTransformer(inner_dim=p, transformer_depth=1, dim_head=49, attn_heads=3, mlp_dim=49, num_classes=num_classes).to(device) #model = torch.load("models/"+modelnames[param_idx]+".pt") print(sum([params.numel() for params in model.parameters()])) print("mlp_dim", param) with open("where.txt", "a+") as file: file.write("--- mlp_dim " + str(param) + ", " + str(round(starttime)) + 70 * "-" + "\n") for epoch in range(start_epoch, n_epochs + start_epoch): if epoch == 0: # warmup optimizer = Lamb(model.parameters(), lr=1e-5, betas=betas) if epoch == 1: for g in optimizer.param_groups: g["lr"] = 1e-3 epochstart = dt.now().timestamp() total_loss = 0 acc_train = 0 acc_test = 0 # Training model.train() for img, labels in dataloader_train: #img, labels = batch img, labels = img.to(device), labels.to(device) #print(labels[0]) #labelsmat = F.one_hot(labels, num_classes=10).to(device)
"BATCH_SIZE_PER_GPU", 5)), # batch size for evaluation warmup_steps=int(os.environ.get("WARMUP_NUM_STEPS", 5)), save_steps=int(os.environ.get("STEPS_PER_SAVE", 1000000)), logging_steps=int(os.environ.get( "STEPS_PER_PRINT", 1)), # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./tensorboard', # directory for storing logs evaluation_strategy=EvaluationStrategy.STEPS, eval_steps=int(os.environ.get("STEPS_PER_SAVE", int(50 / 5))), save_total_limit=5, prediction_loss_only=True, gradient_accumulation_steps=int( os.environ.get("GRADIENT_ACCUMULATION_STEPS", 1)), max_grad_norm=0.5) optimizer = Lamb(auto_encoder.parameters(), float(os.environ.get("LEARNING_RATE", 5e-3))) trainer = CustomTrainer( model=auto_encoder, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=train_dataset, # training dataset eval_dataset=test_dataset, # evaluation dataset data_collator=data_collector_huggingface, optimizers=(optimizer, None)) start_time = time.time() print(f"Starttime {datetime.now()}") output = trainer.train() print("Running final evaluation") trainer.evaluate(eval_dataset) print(f"Endtime {datetime.now()}") end_time = time.time()
# Report discriminator size before building the optimizers.
print("\nDiscriminator:")
print(f"{sum(p.numel() for p in netD.parameters())} parameters")
print(f"{len(list(netD.parameters()))} tensors")

print(args.optim)
# Build one optimizer per network from the CLI choice; any unrecognized
# value is rejected up front rather than silently falling through.
if args.optim == 'adam':
    # NOTE(review): beta1 differs between G (0.0) and D (0.9) here, unlike
    # the other branches where both nets use the same betas — presumably
    # intentional (momentum-free generator updates), but worth confirming.
    optG = optim.Adam(netG.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
    optD = optim.Adam(netD.parameters(), lr=args.initial_lr, betas=(0.9, 0.999))
elif args.optim == 'lamb':
    optG = Lamb(netG.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
    optD = Lamb(netD.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
elif args.optim == 'sgd':
    # Plain SGD: momentum explicitly disabled for both networks.
    optG = optim.SGD(netG.parameters(), lr=args.initial_lr, momentum=0.0)
    optD = optim.SGD(netD.parameters(), lr=args.initial_lr, momentum=0.0)
elif args.optim == 'nero':
    optG = Nero(netG.parameters(), lr=args.initial_lr)
    optD = Nero(netD.parameters(), lr=args.initial_lr)
else:
    raise Exception("Unsupported optim")

#########################################
#### Train ##############################
def train(args, model, tokenizer, shuffled_fh, train_fn, configObj, logger):
    """Train ``model`` on a streaming dataset with layerwise LAMB/AdamW.

    Parameters are grouped per layer (useful for LAMB's per-layer trust
    ratio), training runs with optional fp16/DDP, and during training the
    top-3 checkpoints by the ``eval_fidelity`` metric are kept on disk.
    Returns ``(global_step, average_training_loss)``.
    """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Effective global batch: per-GPU batch x accumulation x world size.
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    total_train_steps = len(
        shuffled_fh) * args.num_train_epochs // real_batch_size
    if args.warmup_steps <= 0:
        # Derive warmup from a proportion when no absolute count was given.
        args.warmup_steps = int(total_train_steps * args.warmup_proportion)

    if args.max_steps > 0:
        t_total = args.max_steps
        #args.num_train_epochs = args.max_steps // (args.expected_train_size // args.gradient_accumulation_steps) + 1
    else:
        # t_total = args.expected_train_size // real_batch_size * args.num_train_epochs
        t_total = total_train_steps
        args.max_steps = total_train_steps

    # layerwise optimization for lamb: one parameter group per named layer,
    # plus one catch-all group for everything not claimed below.
    optimizer_grouped_parameters = []
    no_decay = ["bias", "LayerNorm.weight", "layer_norm", "LayerNorm"]
    layer_optim_params = set()
    for layer_name in [
            "bert.embeddings", "score_out", "downsample1", "downsample2",
            "downsample3", "embeddingHead"
    ]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    if getattr_recursive(model, "bert.encoder.layer") is not None:
        for layer in model.bert.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    optimizer_grouped_parameters.append({
        "params":
        [p for p in model.parameters() if p not in layer_optim_params]
    })
    # NOTE(review): this fallback is unreachable as written — the append
    # above always leaves at least one group in the list; verify intent.
    if len(optimizer_grouped_parameters) == 0:
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.01
        }, {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        }]

    logger.info("len(optimizer_grouped_parameters): {}".format(
        len(optimizer_grouped_parameters)))  # 1

    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))
    if args.scheduler.lower() == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)
    elif args.scheduler.lower() == "cosine":
        scheduler = CosineAnnealingLR(optimizer, t_total, 1e-8)
    else:
        raise Exception(
            "Scheduler {0} not recognized! Can only be linear or cosine".
            format(args.scheduler))

    # Check if saved optimizer or scheduler states exist
    # TODO: we find this consume huge amount of additional GPU memory with pytorch, thus disable for now
    # if os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) and args.resume: # Load in optimizer and scheduler states
    # if is_first_worker():
    # op_state = torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))
    # print([len(x['params']) for x in op_state['param_groups']])
    # real_op_state = optimizer.state_dict()
    # print([len(x['params']) for x in real_op_state['param_groups']])
    # optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    # scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Train dataset size = %d", len(shuffled_fh))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        real_batch_size)
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    logger.info(" LR warmup steps = %d", args.warmup_steps)

    global_step = 0
    eval_cnt = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if (os.path.exists(args.model_name_or_path)
            and args.resume) or args.starting_step > 0:
        # set global_step to gobal_step of last saved checkpoint from model path
        try:
            global_step = args.starting_step
            if global_step <= 0:
                # Parse the step out of a "...checkpoint-<step>" path.
                global_step = int(
                    args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (args.expected_train_size //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                args.expected_train_size // args.gradient_accumulation_steps)
            logger.info(
                " Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d",
                        global_step)
            logger.info(" Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        # NOTE(review): bare except silently falls back to step 0 when the
        # path cannot be parsed — consider narrowing to ValueError.
        except:
            logger.info(" Start training from a pretrained model")

    tr_loss = 0.0
    tensorboard_scalars = {}  # running sums of per-step scalars, flushed every logging_steps
    model.zero_grad()

    eval_cfg = args.eval_configObj  # this is also produced in the load_model_config() method
    eval_fn = wrapped_process_fn(tokenizer, args, eval_cfg)
    ideal_path = args.eval_ideal_path
    is_first_eval = (eval_cnt == 0)
    best_checkpoints = []  # list of (global_step, fidelity); top-3 kept on disk
    set_seed(args)  # Added here for reproductibility
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    for m_epoch in train_iterator:
        # shuffle input after first epoch
        if m_epoch > 0:
            shuffled_fh.change_seed(m_epoch)
        sds = SimplifiedStreamingDataset(shuffled_fh, train_fn,
                                         configObj.ix_func)
        train_dataloader = DataLoader(sds,
                                      batch_size=args.per_gpu_train_batch_size,
                                      num_workers=4,
                                      pin_memory=True)
        acc_accum = []
        model.train()
        for step, batch in tqdm(enumerate(train_dataloader),
                                desc="Iteration",
                                disable=args.local_rank not in [-1, 0]):
            if step % 100 == 0 and step > 0:
                logger.info('train_step: {}'.format(step))
            # Skip past any already trained steps if resuming training
            # if steps_trained_in_current_epoch > 0:
            # steps_trained_in_current_epoch -= 1
            # continue
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "query_ids": batch[0].long(),
                "query_attn_mask": batch[1].long(),
                "meta_ids": batch[3].long(),
                "meta_attn_mask": batch[4].long(),
                "labels": batch[6].float()
            }
            # sync gradients only at gradient accumulation step
            if (step + 1) % args.gradient_accumulation_steps == 0:
                outputs = model(**inputs)
            else:
                # no_sync() skips DDP all-reduce on non-boundary steps.
                with model.no_sync():
                    outputs = model(**inputs)
            loss_combine = outputs[0]
            # assert len(loss_combine) == 3
            loss = loss_combine["Loss/total_loss"]
            sim_combine = outputs[1]
            # assert len(sim_combine) == 8
            acc = outputs[2]
            acc_accum.append(acc.item())
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()

            tr_loss += loss.item()
            # if is_first_worker():
            # print("unique labels: ", torch.unique(inputs["labels"]).int())
            # print("Similarity combinations: ", sim_combine)
            # Accumulate per-step scalars for tensorboard.
            for key, value in loss_combine.items():
                tensorboard_scalars[key] = tensorboard_scalars.setdefault(
                    key, 0.0) + value.item()
            for key, value in sim_combine.items():
                # print(f"{key}: {value.mean().item()}")
                value = value.mean()
                value[value != value] = 0  # zero out NaNs (NaN != NaN)
                tensorboard_scalars[key] = tensorboard_scalars.setdefault(
                    key, 0.0) + value.item()
                # print(f"tensorboardscalars: {key} : {tensorboard_scalars[key]}")

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    #for key, value in tensorboard_scalars.items():
                    # print(f"{key}: {value}")
                    if args.evaluate_during_training and global_step % (
                            args.logging_steps_per_eval *
                            args.logging_steps) == 0:
                        if is_first_worker():
                            # Step -1 marks a "latest" checkpoint, saved
                            # before evaluation.
                            save_checkpoint(args,
                                            -1,
                                            model,
                                            tokenizer,
                                            logger=logger)
                        model.eval()
                        is_first_eval = (eval_cnt == 0)
                        args.global_step = global_step
                        init_time = time()
                        fidelity = eval_fidelity(args, model, eval_fn,
                                                 eval_cfg.path, ideal_path,
                                                 args.cache_dir, is_first_eval,
                                                 args.eval_full, logger)
                        logger.info("Eval cost time: {}".format(time() -
                                                                init_time))
                        eval_cnt += 1
                        model.train()
                        if is_first_worker():
                            # Keep the top-3 checkpoints by fidelity; evict
                            # the worst one when a better score arrives.
                            if len(best_checkpoints) < 3:
                                save_checkpoint(args,
                                                global_step,
                                                model,
                                                tokenizer,
                                                optimizer,
                                                scheduler,
                                                logger=logger)
                                best_checkpoints.append(
                                    (global_step, fidelity))
                            else:
                                worst_checkpoint = sorted(
                                    best_checkpoints, key=lambda x: x[1])[0]
                                if fidelity > worst_checkpoint[1]:
                                    save_checkpoint(args,
                                                    global_step,
                                                    model,
                                                    tokenizer,
                                                    optimizer,
                                                    scheduler,
                                                    logger=logger)
                                    worst_cp_path = os.path.join(
                                        args.output_dir, "checkpoint-{}".format(
                                            str(worst_checkpoint[0])))
                                    shutil.rmtree(worst_cp_path)
                                    best_checkpoints.remove(worst_checkpoint)
                                    best_checkpoints.append(
                                        (global_step, fidelity))
                                else:
                                    logger.info("Fidelity not in top 3!")
                                assert len(best_checkpoints) == 3
                            tb_writer.add_scalar("fidelity", fidelity,
                                                 global_step)
                            logger.info("Fidelity: {0}".format(fidelity))
                        dist.barrier()
                    learning_rate_scalar = scheduler.get_lr()[0]
                    avg_acc = sum(acc_accum) * 1.0 / len(acc_accum)
                    logger.info("Train acc: {}".format(avg_acc))
                    if is_first_worker():
                        tb_writer.add_scalar("Training/learning_rate",
                                             learning_rate_scalar, global_step)
                        tb_writer.add_scalar("Training/epoch", m_epoch,
                                             global_step)
                        tb_writer.add_scalar("Training/accuracy", avg_acc,
                                             global_step)
                        for key, value in tensorboard_scalars.items():
                            # Report the mean over the logging window.
                            tb_writer.add_scalar(key,
                                                 value / args.logging_steps,
                                                 global_step)
                        logger.info(
                            json.dumps({
                                **tensorboard_scalars,
                                **{
                                    "learning_rate": learning_rate_scalar,
                                    "Accuracy": avg_acc,
                                    "step": global_step
                                }
                            }))
                    tensorboard_scalars = {}
                    dist.barrier()
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()
    return global_step, tr_loss / global_step
elif sys.argv[1] == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) elif sys.argv[1] == 'sgd': optimizer = optim.SGD(model.parameters(), lr=0.01) elif sys.argv[1] == 'sgdwm': optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) elif sys.argv[1] == 'rmsprop': optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9) elif sys.argv[1] == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=0.01) elif sys.argv[1] == 'radam': optimizer = RAdam(model.parameters()) elif sys.argv[1] == 'lars': #no tensorboardX optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9) elif sys.argv[1] == 'lamb': optimizer = Lamb(model.parameters()) elif sys.argv[1] == 'novograd': optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001) schedular = optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4) def train(train_loader, model, criterion, optimizer, schedular, device): ''' Function for the training step of the training loop ''' model.train() running_loss = 0 for X, y_true in train_loader:
def train(args, model, tokenizer, train_dataloader):
    """Fine-tune ``model`` over ``train_dataloader`` with distributed support.

    Builds layer-wise optimizer parameter groups (for LAMB), the LR
    scheduler, optional apex fp16 / DataParallel / DistributedDataParallel
    wrappers, then runs the epoch loop with gradient accumulation,
    periodic checkpointing, evaluation and TensorBoard logging.

    Args:
        args: hyperparameter / distributed-setup namespace (argparse-like).
        model: the model to train; wrapped for DDP when ``args.local_rank != -1``.
        tokenizer: saved alongside each model checkpoint.
        train_dataloader: iterable yielding tuples of tensors.

    Returns:
        Tuple ``(global_step, tr_loss / global_step)`` — steps taken and
        mean loss per optimization step.
    """
    #if args.local_rank in [-1, 0]:
    # TensorBoard writer exists only on the first worker; other ranks keep None.
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Effective global batch size: per-process batch * accumulation * world size.
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
    if args.max_steps > 0:
        t_total = args.max_steps
        #args.num_train_epochs = args.max_steps // (args.expected_train_size // args.gradient_accumulation_steps) + 1
    else:
        # Derive total optimization steps from the expected dataset size.
        t_total = args.expected_train_size // real_batch_size * args.num_train_epochs

    # layerwise optimization for lamb: one parameter group per named layer
    optimizer_grouped_parameters = []
    layer_optim_params = set()
    for layer_name in ["roberta.embeddings", "score_out", "downsample1", "downsample2", "downsample3", "embeddingHead"]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    if getattr_recursive(model, "roberta.encoder.layer") is not None:
        for layer in model.roberta.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    # Catch-all group for parameters not claimed by any named layer group.
    optimizer_grouped_parameters.append({"params": [p for p in model.parameters() if p not in layer_optim_params]})

    # NOTE(review): the append above guarantees the list is non-empty, so this
    # weight-decay fallback is dead code — confirm whether the emptiness check
    # was meant to run *before* the catch-all group was added.
    if len(optimizer_grouped_parameters) == 0:
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]

    # Optimizer selection: only LAMB and AdamW are supported.
    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    else:
        raise Exception("optimizer {0} not recognized! Can only be lamb or adamW".format(args.optimizer))

    # Scheduler selection: linear warmup+decay or cosine annealing.
    if args.scheduler.lower() == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )
    elif args.scheduler.lower() == "cosine":
        scheduler = CosineAnnealingLR(optimizer, t_total, 1e-8)
    else:
        raise Exception("Scheduler {0} not recognized! Can only be linear or cosine".format(args.scheduler))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        # if is_first_worker():
        #     op_state = torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))
        #     print([len(x['params']) for x in op_state['param_groups']])
        #     real_op_state = optimizer.state_dict()
        #     print([len(x['params']) for x in real_op_state['param_groups']])
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        # (parsed from a "checkpoint-<step>" style directory name)
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (args.expected_train_size // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (args.expected_train_size // args.gradient_accumulation_steps)
            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except:
            # NOTE(review): bare except deliberately treats any parse failure
            # as "fresh start from a pretrained model".
            logger.info(" Start training from a pretrained model")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for m_epoch in train_iterator:
        #epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in tqdm(enumerate(train_dataloader), desc="Iteration", disable=args.local_rank not in [-1, 0]):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device).long() for t in batch)
            # Only sync DDP gradients on the step that performs the optimizer
            # update; intermediate accumulation steps run under no_sync().
            if (step + 1) % args.gradient_accumulation_steps == 0:
                outputs = model(*batch)
            else:
                with model.no_sync():
                    outputs = model(*batch)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                # Same no_sync() pattern for the backward pass.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Clip, step, schedule, and reset grads once per effective batch.
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if is_first_worker() and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint (model + tokenizer + training args
                    # + optimizer/scheduler state) on the first worker only.
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
                # All ranks rendezvous after the (possible) checkpoint write.
                dist.barrier()
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.evaluate_during_training and global_step % (args.logging_steps_per_eval*args.logging_steps) == 0:
                        # Periodic distributed evaluation; runs on every rank.
                        model.eval()
                        reranking_mrr, full_ranking_mrr = passage_dist_eval(args, model, tokenizer)
                        if is_first_worker():
                            print("Reranking/Full ranking mrr: {0}/{1}".format(str(reranking_mrr), str(full_ranking_mrr)))
                            # NOTE(review): "full_raking" typo kept — it is a
                            # runtime TensorBoard key that dashboards may rely on.
                            mrr_dict = {"reranking": float(reranking_mrr), "full_raking": float(full_ranking_mrr)}
                            tb_writer.add_scalars("mrr", mrr_dict, global_step)
                            print(args.output_dir)
                    # Mean loss since the last logging point.
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    if is_first_worker():
                        for key, value in logs.items():
                            # NOTE(review): debug print of value types — leftover?
                            print(key, type(value))
                            tb_writer.add_scalar(key, value, global_step)
                        tb_writer.add_scalar("epoch", m_epoch, global_step)
                        print(json.dumps({**logs, **{"step": global_step}}))
                    dist.barrier()
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()
    return global_step, tr_loss / global_step
def train(args, model, tokenizer, query_cache, passage_cache):
    """Train the dual-encoder on a stream of refreshed ANN training data.

    Runs until ``args.max_steps``: periodically polls ``args.ann_dir`` for
    newly generated ANN data (via ``get_latest_ann_data``), rebuilds the
    streaming dataloader when a new file appears, performs gradient-accumulated
    optimization with periodic fidelity evaluation, and keeps the 10
    best-fidelity checkpoints on disk.

    Args:
        args: hyperparameter / distributed-setup namespace (argparse-like).
        model: the model to train; wrapped for DDP when ``args.local_rank != -1``.
        tokenizer: saved alongside model checkpoints.
        query_cache: passed to ``args.configObj.process_fn`` for batch construction.
        passage_cache: passed to ``args.configObj.process_fn`` for batch construction.

    Returns:
        ``global_step`` — total optimization steps performed.
    """
    #if args.local_rank in [-1, 0]:
    # TensorBoard writer exists only on the first worker; other ranks keep None.
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Effective global batch size; presumably for bookkeeping (unused below).
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []
    # Parameters matching these substrings get no weight decay.
    no_decay = ["bias", "w", "b", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]

    # Optimizer selection: only LAMB and AdamW are supported.
    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt")) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Max steps = %d", args.max_steps)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    global_step = 0
    eval_cnt = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        if "-" in args.model_name_or_path:
            try:
                global_step = int(
                    args.model_name_or_path.split("-")[-1].split("/")[0])
            except:
                # Any parse failure falls back to step 0 (fresh start).
                global_step = 0
        else:
            global_step = 0
        logger.info(
            " Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info(" Continuing training from global step %d", global_step)

    tr_loss = 0.0
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproducibility

    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0

    # Evaluation configuration: a TSV of queries/documents with fixed columns.
    eval_path = os.path.join(args.data_dir, "eval_full.tsv")
    eval_cfg = L1InputConfig("eval", model, None, eval_path, args.configObj.chunk_cfg,
                             qid=0, docid=1, query=4, title=5, anchor=6, url=7,
                             click=8, desc=9, rating=10, market=12, lang=13)

    def eval_fn(line, i):
        # Per-line preprocessing closure handed to eval_fidelity below.
        return L1_process_fn(line, i, tokenizer, args, eval_cfg.map, eval_cfg.chunk_cfg)

    model.eval()
    ideal_path = os.path.join(args.data_dir, "ideal_map_UN.tsv")
    is_first_eval = (eval_cnt == 0)
    args.global_step = global_step
    # fidelity = eval_fidelity(args, model, eval_fn, eval_cfg.path, ideal_path, args.data_cache_dir, is_first_eval)
    # eval_cnt+=1
    # print("Fidelity: {0}".format(fidelity))
    # if is_first_worker():
    #     tb_writer.add_scalar("fidelity", fidelity, global_step)

    best_checkpoints = []  # list of (global_step, fidelity) for the kept checkpoints
    acc_accum = []         # running batch accuracies since the last eval
    scheduler = None       # created lazily once the first ANN data arrives
    while global_step < args.max_steps:
        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:
            # check if new ann training data is availabe
            ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)
            # Block (retrying every 5 min) until usable ANN data exists.
            while ann_no == -1 or (ann_path is not None and ann_no != last_ann_no):
                try:
                    logger.info("Training on new add data at %s", ann_path)
                    with open(ann_path, 'r') as f:
                        ann_training_data = f.readlines()
                    dev_ndcg = ndcg_json['ndcg']
                    ann_checkpoint_path = ndcg_json['checkpoint']
                    ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)
                    # Trim so every rank gets the same number of examples.
                    aligned_size = (len(ann_training_data) // args.world_size) * args.world_size
                    ann_training_data = ann_training_data[:aligned_size]
                    logger.info("Total ann queries: %d", len(ann_training_data))
                    if len(ann_training_data) == 0:
                        time.sleep(300)
                        continue
                    train_dataset = SimplifiedStreamingDataset(
                        ann_training_data,
                        args.configObj.process_fn(args, query_cache, passage_cache))
                    train_dataloader = DataLoader(
                        train_dataset, batch_size=args.train_batch_size, num_workers=1)
                    train_dataloader_iter = iter(train_dataloader)
                    # re-warmup: scheduler is created only once, on first data.
                    if scheduler is None:
                        scheduler = get_linear_schedule_with_warmup(
                            optimizer,
                            num_warmup_steps=args.warmup_steps,
                            num_training_steps=args.max_steps)
                    if args.local_rank != -1:
                        dist.barrier()
                    if is_first_worker():
                        # add ndcg at checkpoint step used instead of current step
                        # tb_writer.add_scalar("dev_ndcg", dev_ndcg, ann_checkpoint_no)
                        if last_ann_no != -1:
                            tb_writer.add_scalar("epoch", last_ann_no, global_step - 1)
                        tb_writer.add_scalar("epoch", ann_no, global_step)
                    last_ann_no = ann_no
                    break
                except:
                    # NOTE(review): bare except retries on *any* failure
                    # (partial file writes, JSON errors, ...) — deliberate
                    # best-effort polling, but it will also hide real bugs.
                    if is_first_worker():
                        print("wait")
                    time.sleep(300)
                    ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)

        # Draw the next batch, restarting the dataloader when exhausted.
        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            logger.info("Finished iterating current dataset, begin reiterate")
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)
        batch = tuple(t.to(args.device).long() for t in batch)
        step += 1
        model.train()
        # if args.triplet:
        #     inputs = {"query_ids": batch[0].long(), "attention_mask_q": batch[1].long(),
        #               "input_ids_a": batch[3].long(), "attention_mask_a": batch[4].long(),
        #               "input_ids_b": batch[6].long(), "attention_mask_b": batch[7].long()}
        # else:
        #     inputs = {"input_ids_a": batch[0].long(), "attention_mask_a": batch[1].long(),
        #               "input_ids_b": batch[3].long(), "attention_mask_b": batch[4].long(),
        #               "labels": batch[6]}
        # sync gradients only at gradient accumulation step
        if step % args.gradient_accumulation_steps == 0:
            outputs = model(*batch)
        else:
            with model.no_sync():
                outputs = model(*batch)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        acc = outputs[2]   # assumes the model also returns a batch accuracy at index 2 — TODO confirm
        acc_accum.append(acc.item())
        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            # Same no_sync() pattern for the backward pass.
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                with model.no_sync():
                    loss.backward()
        tr_loss += loss.item()
        if step % args.gradient_accumulation_steps == 0:
            # Clip, step, schedule, and reset grads once per effective batch.
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            # print("w grad:", model.module.w.grad)
            # print("w:", model.module.w)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                if global_step % (args.logging_steps_per_eval * args.logging_steps) == 0:
                    # Fidelity evaluation + top-10 checkpoint bookkeeping.
                    print("Train acc:", sum(acc_accum) * 1.0 / len(acc_accum))
                    acc_accum = []
                    model.eval()
                    is_first_eval = (eval_cnt == 0)
                    args.global_step = global_step
                    fidelity = eval_fidelity(args, model, eval_fn, eval_cfg.path,
                                             ideal_path, args.data_cache_dir, is_first_eval)
                    eval_cnt += 1
                    if is_first_worker():
                        if len(best_checkpoints) < 10:
                            save_checkpoint(args, global_step, model, tokenizer, optimizer, scheduler)
                            best_checkpoints.append((global_step, fidelity))
                        else:
                            # Evict the worst-fidelity checkpoint if the new one beats it.
                            worst_checkpoint = sorted(best_checkpoints, key=lambda x: x[1])[0]
                            if fidelity > worst_checkpoint[1]:
                                save_checkpoint(args, global_step, model, tokenizer, optimizer, scheduler)
                                worst_cp_path = os.path.join(
                                    args.output_dir, "checkpoint-{}".format(
                                        str(worst_checkpoint[0])))
                                shutil.rmtree(worst_cp_path)
                                best_checkpoints.remove(worst_checkpoint)
                                best_checkpoints.append(
                                    (global_step, fidelity))
                            else:
                                print("Fidelity not in top 10!")
                            assert len(best_checkpoints) == 10
                        # checkpoint "-1" always holds the latest model/tokenizer.
                        save_checkpoint(args, -1, model, tokenizer)
                        print("Fidelity: {0}".format(fidelity))
                    logs["fidelity"] = fidelity
                    dist.barrier()
                loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                tr_loss = 0
                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()
    return global_step
optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'rmsprop': optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adagrad': optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars':#no tensorboardX from lars import LARS optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lamb': from lamb import Lamb optimizer = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': from novograd import NovoGrad optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay) elif args.optimizer.lower() == 'dyna': from dyna import Dyna optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay) # lr_scheduler = LambdaLR(optimizer,lrs) # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1) batch_acumulate = args.batch_size//256 batch_per_step = len(trainloader)//batch_acumulate+int(len(trainloader)%batch_acumulate>0)
def worker_fn(rank, world_size):
    """Per-process entry point for distributed WReN training on PGM data.

    Each spawned worker: initializes the process group (``setup``), builds
    its shard of the PGM dataset, constructs the WReN model with per-module
    weight-decay parameter groups and a LAMB optimizer (optionally apex
    O1 mixed precision), wraps the model in DistributedDataParallel, then
    alternates validation and training for ``epochs`` epochs. Rank 0 also
    logs accuracies to TensorBoard via a TF1 session and saves weights.

    Args:
        rank: this worker's process rank (also used as its CUDA device id).
        world_size: total number of workers.
    """
    setup(rank, world_size)
    weights_filename = "weights.pt"
    batch_size = 512       # global batch size across all workers
    epochs = 240
    warmup_epochs = 8
    use_mixed_precision = True
    batch_size = batch_size // world_size  #batch size per worker

    #Data
    all_data = os.listdir(datapath_preprocessed)
    # Preprocessed shards follow "PGM_<dataset>_{train,val}_<idx>.npz".
    train_filenames = [p for p in all_data if re.match(r'^PGM_' + re.escape(dataset_name) + r'_train_(\d+)\.npz$', p) is not None]
    val_filenames = [p for p in all_data if re.match(r'^PGM_' + re.escape(dataset_name) + r'_val_(\d+)\.npz$', p) is not None]
    train_dataset = PgmDataset(train_filenames)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=8,
                                  pin_memory=False, sampler=train_sampler)  #shuffle is done by the sampler
    # NOTE(review): every rank runs the *full* validation set (no sampler) —
    # confirm this duplication is intended.
    val_dataloader = DataLoader(PgmDataset(val_filenames), batch_size=batch_size,
                                shuffle=False, num_workers=4, pin_memory=False)

    #Model
    device_ids = [rank]
    model = WReN(2).to(device_ids[0])  #3-layer MLRN
    # Resume from an existing weights file when present; "cold start" controls
    # the LR warmup below.
    if weights_filename is not None and os.path.isfile("./" + weights_filename):
        model.load_state_dict(torch.load(weights_filename, map_location='cpu'))
        print('Weights loaded')
        cold_start = False
    else:
        print('No weights found')
        cold_start = True

    #Loss and optimizer
    final_lr = 2e-3

    def add_module_params_with_decay(module, weight_decay, param_groups):
        #adds parameters with decay unless they are bias parameters, which shouldn't receive decay
        group_with_decay = []
        group_without_decay = []
        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue
            if name == 'bias' or name.endswith('bias'):
                group_without_decay.append(param)
            else:
                group_with_decay.append(param)
        param_groups.append({"params": group_with_decay, "weight_decay": weight_decay})
        param_groups.append({"params": group_without_decay})

    optimizer_param_groups = [
    ]
    # Uniform 0.2 weight decay on each WReN sub-module (biases exempted above).
    add_module_params_with_decay(model.conv, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.post_cnn_linear, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.g, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.h, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.f, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.f_final, 2e-1, optimizer_param_groups)
    optimizer = Lamb(optimizer_param_groups, lr=final_lr)
    # Keep a handle to the unwrapped model for state_dict saving.
    base_model = model
    if use_mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  #Mixed Precision
    lossFunc = torch.nn.CrossEntropyLoss()
    softmax = torch.nn.Softmax(dim=1)

    #Parallel distributed model
    device = device_ids[0]
    torch.cuda.set_device(device)
    parallel_model = torch.nn.parallel.DistributedDataParallel(model, device_ids)

    if rank == 0:
        #accuracy logging (TensorFlow 1.x summary writer, rank 0 only)
        sess = tf.Session()
        train_acc_placeholder = tf.placeholder(tf.float32, shape=())
        train_acc_summary = tf.summary.scalar('training_acc', train_acc_placeholder)
        val_acc_placeholder = tf.placeholder(tf.float32, shape=())
        val_acc_summary = tf.summary.scalar('validation_acc', val_acc_placeholder)
        writer = tf.summary.FileWriter("log", sess.graph)

    #training loop
    acc = []
    global_step = 0
    for epoch in range(epochs):
        # Reshuffle each epoch so ranks see different shards.
        train_sampler.set_epoch(epoch)

        # Validation
        val_acc = []
        parallel_model.eval()
        with torch.no_grad():
            for i, (local_batch, local_labels) in enumerate(val_dataloader):
                local_batch, targets = local_batch.to(device), local_labels.to(device)
                #answer = model(local_batch.type(torch.float32))
                answer, _ = parallel_model(local_batch.type(torch.float32))
                #Calc accuracy
                answerSoftmax = softmax(answer)
                maxIndex = answerSoftmax.argmax(dim=1)
                correct = maxIndex.eq(targets)
                accuracy = correct.type(dtype=torch.float16).mean(dim=0)
                val_acc.append(accuracy)
                if i % 50 == 0 and rank == 0:
                    print("batch " + str(i))
        total_val_acc = sum(val_acc) / len(val_acc)
        print('Validation accuracy: ' + str(total_val_acc.item()))
        if rank == 0:
            summary = sess.run(val_acc_summary, feed_dict={val_acc_placeholder: total_val_acc.item()})
            writer.add_summary(summary, global_step=global_step)

        # Training
        parallel_model.train()
        for i, (local_batch, local_labels) in enumerate(train_dataloader):
            global_step = global_step + 1
            if cold_start and epoch < warmup_epochs:
                #linear scaling of the lr for warmup during the first few epochs
                lr = final_lr * global_step / (warmup_epochs*len(train_dataset) / (batch_size * world_size))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
            local_batch, targets = local_batch.to(device_ids[0]), local_labels.to(device_ids[0])
            optimizer.zero_grad()
            answer, activation_loss = parallel_model(local_batch.type(torch.float32))
            # Auxiliary activation penalty weighted by 2e-3.
            loss = lossFunc(answer, targets) + activation_loss * 2e-3
            #Calc accuracy
            answerSoftmax = softmax(answer)
            maxIndex = answerSoftmax.argmax(dim=1)
            correct = maxIndex.eq(targets)
            accuracy = correct.type(dtype=torch.float16).mean(dim=0)
            acc.append(accuracy)
            #Training step
            if use_mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:  #Mixed precision
                    scaled_loss.backward()
            else:
                loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(parallel_model.parameters(), 1e1)
            optimizer.step()
            if i % 50 == 0 and rank == 0:
                print("epoch " + str(epoch) + " batch " + str(i))
                print("loss", loss)
                print("activation loss", activation_loss)
                print(grad_norm)
            #logging and saving weights every 1000 batches
            if i % 1000 == 999:
                trainAcc = sum(acc) / len(acc)
                acc = []
                print('Training accuracy: ' + str(trainAcc.item()))
                if rank == 0:
                    if weights_filename is not None:
                        torch.save(base_model.state_dict(), weights_filename)
                        print('Weights saved')
                    summary = sess.run(train_acc_summary, feed_dict={train_acc_placeholder: trainAcc.item()})
                    writer.add_summary(summary, global_step=global_step)
        # Periodic numbered checkpoints during cold-start runs.
        if cold_start and weights_filename is not None and epoch % 10 == 0 and rank == 0:
            torch.save(base_model.state_dict(), weights_filename + "_cp" + str(epoch))
            print('Checkpoint saved')
    cleanup()
from lamb import Lamb # Load phase and group velocity data from an Excel file exported from # Dispersion software (1 mm aluminum plate) df_vp = pd.read_excel('alum1mm.xlsx', skiprows = 9, usecols = 'U:AN', header=None) df_vg = pd.read_excel('alum1mm.xlsx', skiprows = 9, usecols = 'A:T', header=None) # Create an instance of the same material using the Lamb class. alum = Lamb(thickness=1, nmodes_sym=5, nmodes_antisym=5, fd_max=10000, vp_max=15000, c_L=6420, c_S=3040) # Plot phase velocity using the Lamb class. fig1, ax1 = alum.plot_phase_velocity(material_velocities=False, cutoff_frequencies=False, sym_style={'color' : 'black'}, antisym_style={'color' : 'black'}) # Remove the legend that labels Symmetric and Antisymmetric modes # (we are interested in labeling only Lamb module and Dispersion). ax1.get_legend().remove()
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) criterion = nn.NLLLoss() if args.optim == 'sgd': optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) elif args.optim == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(args.momentum, args.beta)) elif args.optim == 'lamb': optimizer = Lamb(model.parameters(), lr=args.lr, betas=(args.momentum, args.beta)) elif args.optim == 'nero': optimizer = Nero(model.parameters(), lr=args.lr) ############################################################################### # Training code ############################################################################### def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" if isinstance(h, torch.Tensor): return h.detach() else:
E = 68.9e9 # E = Young's modulus, in Pa. p = 2700 # p = Density (rho), in kg/m3. v = 0.33 # v = Poisson's ratio (nu). c_L = np.sqrt(E * (1 - v) / (p * (1 + v) * (1 - 2 * v))) c_S = np.sqrt(E / (2 * p * (1 + v))) c_R = c_S * ((0.862 + 1.14 * v) / (1 + v)) # Example: A 10 mm aluminum plate. alum = Lamb(thickness=10, nmodes_sym=5, nmodes_antisym=5, fd_max=10000, vp_max=15000, c_L=c_L, c_S=c_S, c_R=c_R, material='Aluminum') # Plot phase velocity, group velocity and wavenumber. alum.plot_phase_velocity() alum.plot_group_velocity() alum.plot_wave_number() # Plot wave structure (displacement profiles across thickness) for A0 # and S0 modes at different fd values. alum.plot_wave_structure(mode='A0',
weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': from radam import RAdam optimizer = RAdam(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lars': #no tensorboardX from lars import LARS optimizer = LARS(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lamb': from lamb import Lamb optimizer = Lamb(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'novograd': from novograd import NovoGrad optimizer = NovoGrad(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay) lr_scheduler = [ optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4) ] else: optimizer = optim.SGD(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay)