def val_dataloader(self) -> DataLoader: dataloader, n_samples = create_dataloader( self.hyperparams.val_data_path, self.hyperparams.max_context, False, False, self.hyperparams.test_batch_size, self.num_workers, ) print( f"approximate number of steps for val is {ceil(n_samples / self.hyperparams.test_batch_size)}" ) return dataloader
def train_dataloader(self) -> DataLoader: dataloader, n_samples = create_dataloader( self.hyperparams.train_data_path, self.hyperparams.max_context, self.hyperparams.random_context, self.hyperparams.shuffle_data, self.hyperparams.batch_size, self.num_workers, ) print( f"approximate number of steps for train is {ceil(n_samples / self.hyperparams.batch_size)}" ) return dataloader
def run(self):
    """Encode the whole dataset once and dump latents with ground truth.

    Iterates the dataloader for exactly one epoch, collecting the encoder
    means (``mu``) per mini-batch together with the ground-truth latent
    values looked up by sample id, then stacks both and saves them to
    ``self.latent_file`` as a .npz archive (keys ``name1``/``name2``).
    """
    # prepare dataloader (iterable)
    print('Start loading data...')
    self.data_loader = create_dataloader(self.args)
    print('...done')

    # iterator from dataloader
    iterator = iter(self.data_loader)
    iter_per_epoch = len(iterator)

    self.Z = []   # encoder means, one array per batch
    self.GT = []  # ground-truth latent values aligned with self.Z

    #----#
    prn_str = 'Start going through the entire data...'
    print(prn_str)
    self.dump_to_record(prn_str)

    # Huge upper bound; the loop actually stops after one full epoch below.
    for iteration in range(1, 100000000):
        # reset data iterators for each epoch — i.e. stop after one pass
        if iteration > 1 and (iteration - 1) % iter_per_epoch == 0:
            break
        with torch.no_grad():
            # sample a mini-batch
            X, ids = next(iterator)  # (n x C x H x W)
            if self.use_cuda:
                X = X.cuda()
                ids = ids.cuda()
            # enc(X): only the mean is kept for the latent dump
            mu, std, logvar = self.encoder(X)
            self.Z.append(mu.cpu().detach().numpy())
            ids = ids.cpu().detach().numpy()
            self.GT.append(self.latent_values[ids, :])
        prn_str = 'batch iter = %d / %d done' % (iteration, iter_per_epoch)
        print(prn_str)
        self.dump_to_record(prn_str)

    self.Z = np.vstack(self.Z)
    self.GT = np.vstack(self.GT)
    np.savez(self.latent_file, name1=self.Z, name2=self.GT)
def evaluate(checkpoint: str, data: str = None):
    """Evaluate a Code2Seq checkpoint.

    When ``data`` is given, a dataloader is built for that file (relative to
    DATA_FOLDER) and used for testing; otherwise the model's default test
    set is used. Runs on one GPU when available.
    """
    seed_everything(SEED)
    model = Code2Seq.load_from_checkpoint(checkpoint_path=checkpoint)
    trainer = Trainer(gpus=1 if torch.cuda.is_available() else None)
    if data is None:
        trainer.test(model)
        return
    data_loader, n_samples = create_dataloader(
        join(DATA_FOLDER, data),
        model.config.max_context,
        False,
        False,
        model.config.test_batch_size,
        cpu_count(),
    )
    n_steps = ceil(n_samples / model.config.test_batch_size)
    print(f"approximate number of steps for test is {n_steps}")
    trainer.test(model, test_dataloaders=data_loader)
def train_distributed(replica_id, replica_count, port, args, params):
    """Entry point for one replica of distributed NU-Wave training.

    Sets up the NCCL process group on localhost:port, pins this replica to
    its CUDA device, wraps the model in DistributedDataParallel and hands
    off to the shared training implementation.
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)
    torch.distributed.init_process_group(
        "nccl", rank=replica_id, world_size=replica_count)

    device = torch.device("cuda", replica_id)
    torch.cuda.set_device(device)

    ddp_model = DistributedDataParallel(
        NUWave(params).to(device), device_ids=[replica_id])
    loader = create_dataloader(params, True, is_distributed=True)
    _train_impl(replica_id, ddp_model, loader, args, params)
def test(self):
    """Load the dataset and save a refined latent traversal.

    Uses the checkpoint iteration this object was restored from
    (``self.ckpt_load_iter``) to name the traversal output.

    Fix: the original allocated ``ones``/``zeros`` label tensors (and
    moved them to the GPU) that were never used anywhere in this method —
    dead code and a needless CUDA transfer, removed.
    """
    # prepare dataloader (iterable)
    print('Start loading data...')
    self.data_loader = create_dataloader(self.args)
    print('...done')

    # latent traversal
    prn_str = 'Start doing refined latent traversal...'
    print(prn_str)
    self.dump_to_record(prn_str)
    self.save_refined_traverse(self.ckpt_load_iter)
def evaluate(checkpoint: str, data: str = None, batch_size: int = None):
    """Evaluate a Code2Seq checkpoint on a test set.

    ``data`` and ``batch_size`` fall back to the model's stored
    hyperparameters when not supplied. Runs on one GPU when available.
    """
    seed_everything(SEED)
    model = Code2Seq.load_from_checkpoint(checkpoint_path=checkpoint)
    if batch_size is None:
        batch_size = model.hyperparams.test_batch_size
    if data is None:
        data = model.hyperparams.test_data_path
    trainer = Trainer(gpus=1 if torch.cuda.is_available() else None)
    data_loader, n_samples = create_dataloader(
        data,
        model.hyperparams.max_context,
        False,
        False,
        batch_size,
        cpu_count(),
    )
    n_steps = ceil(n_samples / batch_size)
    print(f"approximate number of steps for test is {n_steps}")
    trainer.test(model, test_dataloaders=data_loader)
def train(rank, nprocs, args):
    """Per-process entry point for distributed LocalSGD training.

    Initializes the NCCL process group over TCP, builds the data loaders
    and a ResNet-18 pinned to this rank's GPU, then runs epoch training.

    Parameters:
        rank: rank of this process in the process group.
        nprocs: number of spawned processes (not read here; the world size
            comes from ``args.world_size``).
        args: namespace providing world_size, batch_size, lr, tau, epoches.
    """
    print(rank)
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='tcp://127.0.0.1:23456',
                                         rank=rank,
                                         world_size=args.world_size)
    # seed for reproducibility
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    torch.backends.cudnn.deterministic = True
    # create dataset.
    train_loader, test_loader, train_sampler = create_dataloader(
        '../data', args.world_size, args.batch_size)
    print("loading dataset successed!")
    # create model, pinned to this rank's GPU.
    model = resnet18()
    torch.cuda.set_device(rank)
    model.cuda(rank)
    cudnn.benchmark = True
    # define the optimizer (LocalSGD with Nesterov momentum; gmf=0 disables
    # the global momentum term).
    optimizer = LocalSGD(model.parameters(),
                         lr=args.lr,
                         gmf=0,
                         tau=args.tau,
                         size=args.world_size,
                         momentum=0.9,
                         nesterov=True,
                         weight_decay=1e-4)
    # define the criterion and lr scheduler.
    criterion = nn.CrossEntropyLoss().cuda()
    for epoch in range(args.epoches):
        acc = train_one_epoch(model, optimizer, criterion, train_loader,
                              test_loader, epoch, rank)
        print(acc)
        # NOTE(review): unconditional break — only the first epoch ever
        # runs despite the range over args.epoches; confirm whether this is
        # a debugging leftover.
        break
def main():
    """Two-stage transfer learning of a pretrained model.

    Loads pretrained weights, replaces the classifier head for the
    fine-grained task, then (1) trains only the new fc layer with the
    backbone frozen and (2) unfreezes everything and trains the full net.
    """
    # parsing args
    args = parser.parse_args()

    # gpu setup
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setup model: load pretrained weights (strict=False tolerates the head
    # mismatch introduced below)
    print('load pretrained model...')
    model = create_model(args)
    state = torch.load(args.model_path, map_location='cpu')['model']
    model.load_state_dict(state, strict=False)

    print('set up the model for fine-grained task...')
    for param in model.parameters():
        param.requires_grad = False  # freeze the convnet
    num_in_fc = model.num_features
    model.head.fc = nn.Linear(num_in_fc, args.transfer_num_classes)
    model = model.to(device)
    print('done\n')

    # setup data loader
    print('creating data loader...')
    train_loader = create_dataloader(args)
    print('done\n')

    # training process
    print('start training...')
    # stage 1: only train the (freshly initialized) fc head
    print('train only fc layer:')
    model = train(model, train_loader, args, device, stage='only_fc')
    # stage 2: unfreeze and train the entire network
    print('train entire network:')
    for param in model.parameters():
        param.requires_grad = True
    train(model, train_loader, args, device, stage='full_net')
    print('finish training\n')
def train(self):
    """Main adversarial training loop (FactorVAE-style with Gamma priors).

    Each iteration performs two updates from two independent data
    iterators: (1) the VAE (encoder + decoder) minimizing reconstruction,
    KL, an alpha-prior KL and a total-correlation (TC) term estimated by
    the discriminator, and (2) the discriminator distinguishing true
    latent codes from dimension-wise permuted ones. Periodically logs,
    checkpoints, saves images and evaluates disentanglement metrics.
    """
    self.set_mode(train=True)

    # constant class labels for the discriminator's cross-entropy terms
    ones = torch.ones(self.batch_size, dtype=torch.long)
    zeros = torch.zeros(self.batch_size, dtype=torch.long)
    if self.use_cuda:
        ones = ones.cuda()
        zeros = zeros.cuda()

    # prepare dataloader (iterable)
    print('Start loading data...')
    self.data_loader = create_dataloader(self.args)
    print('...done')

    # two independent iterators: one for the VAE step, one for the D step
    iterator1 = iter(self.data_loader)
    iterator2 = iter(self.data_loader)

    iter_per_epoch = min(len(iterator1), len(iterator2))
    start_iter = self.ckpt_load_iter + 1  # resume point when loaded from ckpt
    epoch = int(start_iter / iter_per_epoch)

    for iteration in range(start_iter, self.max_iter + 1):

        # reset data iterators for each epoch
        if iteration % iter_per_epoch == 0:
            print('==== epoch %d done ====' % epoch)
            epoch += 1
            iterator1 = iter(self.data_loader)
            iterator2 = iter(self.data_loader)

        #============================================
        #          TRAIN THE VAE (ENC & DEC)
        #============================================

        # sample a mini-batch
        X, ids = next(iterator1)  # (n x C x H x W)
        if self.use_cuda:
            X = X.cuda()

        # enc(X)
        mu, std, logvar = self.encoder(X)

        # prior alpha params (Gamma prior over precision)
        a, b = self.prior_alpha()

        # posterior alpha params
        ah, bh = self.post_alpha()

        # kl loss (expected KL under the Gamma posterior on precision)
        kls = 0.5 * (
            (ah/bh)*(mu**2+std**2) - 1.0 +
            bh.log() - ah.digamma() - logvar)  # (n x z_dim)
        loss_kl = kls.sum(1).mean()

        # kl loss on alpha (Gamma-to-Gamma KL), normalized by dataset size N
        kls_alpha = (
            (ah-a)*ah.digamma() - ah.lgamma() + a.lgamma() +
            a*(bh.log()-b.log()) + (ah/bh)*(b-bh))  # z_dim-dim
        loss_kl_alpha = kls_alpha.sum() / self.N

        # reparam'ed samples
        if self.use_cuda:
            Eps = torch.cuda.FloatTensor(mu.shape).normal_()
        else:
            Eps = torch.randn(mu.shape)
        Z = mu + Eps * std

        # dec(Z)
        X_recon = self.decoder(Z)

        # recon loss (per-sample sum, averaged over the batch)
        loss_recon = F.binary_cross_entropy_with_logits(
            X_recon, X, reduction='sum').div(X.size(0))

        # dis(Z)
        DZ = self.D(Z)

        # tc loss: logit difference approximates the density ratio
        loss_tc = (DZ[:, 0] - DZ[:, 1]).mean()

        # total loss for vae
        vae_loss = loss_recon + loss_kl + loss_kl_alpha + \
            self.gamma*loss_tc

        # update vae
        self.optim_vae.zero_grad()
        vae_loss.backward()
        self.optim_vae.step()

        #============================================
        #          TRAIN THE DISCRIMINATOR
        #============================================

        # sample a mini-batch (from the second, independent iterator)
        X2, ids = next(iterator2)  # (n x C x H x W)
        if self.use_cuda:
            X2 = X2.cuda()

        # enc(X2)
        mu, std, _ = self.encoder(X2)

        # reparam'ed samples
        if self.use_cuda:
            Eps = torch.cuda.FloatTensor(mu.shape).normal_()
        else:
            Eps = torch.randn(mu.shape)
        Z = mu + Eps * std

        # dis(Z)
        DZ = self.D(Z)

        # dim-wise permutated Z over the mini-batch: breaks dependence
        # across latent dimensions, giving samples from the product of
        # marginals
        perm_Z = []
        for zj in Z.split(1, 1):
            idx = torch.randperm(Z.size(0))
            perm_zj = zj[idx]
            perm_Z.append(perm_zj)
        Z_perm = torch.cat(perm_Z, 1)
        Z_perm = Z_perm.detach()  # no gradient into the encoder here

        # dis(Z_perm)
        DZ_perm = self.D(Z_perm)

        # discriminator loss: true codes -> class 0, permuted -> class 1
        dis_loss = 0.5 * (F.cross_entropy(DZ, zeros) +
                          F.cross_entropy(DZ_perm, ones))

        # update discriminator
        self.optim_dis.zero_grad()
        dis_loss.backward()
        self.optim_dis.step()

        ##########################################

        # print the losses
        if iteration % self.print_iter == 0:
            prn_str = (
                '[iter %d (epoch %d)] vae_loss: %.3f | ' + \
                'dis_loss: %.3f\n ' + \
                '(recon: %.3f, kl: %.3f, kl_alpha: %.3f, tc: %.3f)' \
            ) % \
                (iteration, epoch, vae_loss.item(), dis_loss.item(),
                 loss_recon.item(), loss_kl.item(), loss_kl_alpha.item(),
                 loss_tc.item())
            prn_str += '\n a = {}'.format(
                a.detach().cpu().numpy().round(2))
            prn_str += '\n b = {}'.format(
                b.detach().cpu().numpy().round(2))
            prn_str += '\n ah = {}'.format(
                ah.detach().cpu().numpy().round(2))
            prn_str += '\n bh = {}'.format(
                bh.detach().cpu().numpy().round(2))
            print(prn_str)
            if self.record_file:
                record = open(self.record_file, 'a')
                record.write('%s\n' % (prn_str, ))
                record.close()

        # save model parameters
        if iteration % self.ckpt_save_iter == 0:
            self.save_checkpoint(iteration)

        # save output images (recon, synth, etc.)
        if iteration % self.output_save_iter == 0:
            # 1) save the recon images
            self.save_recon(iteration, X, torch.sigmoid(X_recon).data)
            # 2) save the synth images
            self.save_synth(iteration, howmany=100)
            # 3) save the latent traversed images (tighter range for 3dchairs)
            if self.dataset.lower() == '3dchairs':
                self.save_traverse(iteration, limb=-2, limu=2, inter=0.5)
            else:
                self.save_traverse(iteration, limb=-3, limu=3, inter=0.1)

        # (visdom) insert current line stats
        if self.viz_on and (iteration % self.viz_ll_iter == 0):
            # compute discriminator accuracy (prob of class 0)
            p_DZ = F.softmax(DZ, 1)[:, 0].detach()
            p_DZ_perm = F.softmax(DZ_perm, 1)[:, 0].detach()
            # insert line stats
            self.line_gather.insert(iter=iteration,
                                    p_DZ=p_DZ.mean().item(),
                                    p_DZ_perm=p_DZ_perm.mean().item(),
                                    recon=loss_recon.item(),
                                    kl=loss_kl.item(),
                                    kl_alpha=loss_kl_alpha.item())

        # (visdom) visualize line stats (then flush out)
        if self.viz_on and (iteration % self.viz_la_iter == 0):
            self.visualize_line()
            self.line_gather.flush()

        # evaluate metrics
        if self.eval_metrics and (iteration % self.eval_metrics_iter == 0):
            metric1, _ = self.eval_disentangle_metric1()
            metric2, _ = self.eval_disentangle_metric2()
            prn_str = (
                '********\n[iter %d (epoch %d)] ' + \
                'metric1 = %.4f, metric2 = %.4f\n********') % \
                (iteration, epoch, metric1, metric2)
            print(prn_str)
            if self.record_file:
                record = open(self.record_file, 'a')
                record.write('%s\n' % (prn_str, ))
                record.close()
            # (visdom) visualize metrics
            if self.viz_on:
                self.visualize_line_metrics(iteration, metric1, metric2)
def train(self, config, **kwargs):
    """Train a PHQ8 depression model (regression + binary) with ignite.

    Pipeline: parse config, create a timestamped+uuid output dir, fit a
    feature scaler over the Kaldi training stream, build label dicts and
    dataloaders, construct model/criterion/optimizer, then run an ignite
    training engine with per-epoch validation, early stopping, LR
    scheduling and best-model checkpointing.

    Returns:
        The output directory path (for further processing by callers).
    """
    config_parameters = parse_config_or_kwargs(config, **kwargs)
    # NOTE(review): '%H-%M-%m' ends with the month, not seconds/minutes —
    # likely a typo for '%H-%M-%S'; harmless since the uuid keeps the
    # directory name unique.
    outputdir = os.path.join(
        config_parameters['outputpath'], config_parameters['model'],
        "{}_{}".format(
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%m'),
            uuid.uuid1().hex))
    # Keep only the single best model (lowest validation loss).
    checkpoint_handler = ModelCheckpoint(
        outputdir,
        'run',
        n_saved=1,
        require_empty=False,
        create_dir=True,
        score_function=lambda engine: -engine.state.metrics['Loss'],
        save_as_state_dict=False,
        score_name='loss')
    train_kaldi_string = parsecopyfeats(
        config_parameters['trainfeatures'],
        **config_parameters['feature_args'])
    dev_kaldi_string = parsecopyfeats(config_parameters['devfeatures'],
                                      **config_parameters['feature_args'])
    logger = genlogger(os.path.join(outputdir, 'train.log'))
    logger.info("Experiment is stored in {}".format(outputdir))
    for line in pformat(config_parameters).split('\n'):
        logger.info(line)
    scaler = getattr(
        pre, config_parameters['scaler'])(**config_parameters['scaler_args'])
    inputdim = -1
    logger.info("<== Estimating Scaler ({}) ==>".format(
        scaler.__class__.__name__))
    # One pass over the training features: fit the scaler incrementally and
    # discover the input feature dimension.
    for _, feat in kaldi_io.read_mat_ark(train_kaldi_string):
        scaler.partial_fit(feat)
        inputdim = feat.shape[-1]
    assert inputdim > 0, "Reading inputstream failed"
    logger.info("Features: {} Input dimension: {}".format(
        config_parameters['trainfeatures'], inputdim))
    logger.info("<== Labels ==>")
    train_label_df = pd.read_csv(
        config_parameters['trainlabels']).set_index('Participant_ID')
    dev_label_df = pd.read_csv(
        config_parameters['devlabels']).set_index('Participant_ID')
    train_label_df.index = train_label_df.index.astype(str)
    dev_label_df.index = dev_label_df.index.astype(str)
    target_type = ('PHQ8_Score', 'PHQ8_Binary')
    n_labels = len(target_type)  # PHQ8 + Binary
    # Participant_ID -> (PHQ8_Score, PHQ8_Binary) tuples
    train_labels = train_label_df.loc[:, target_type].T.apply(
        tuple).to_dict()
    dev_labels = dev_label_df.loc[:, target_type].T.apply(tuple).to_dict()
    train_dataloader = create_dataloader(
        train_kaldi_string,
        train_labels,
        transform=scaler.transform,
        shuffle=True,
        **config_parameters['dataloader_args'])
    cv_dataloader = create_dataloader(
        dev_kaldi_string,
        dev_labels,
        transform=scaler.transform,
        shuffle=False,
        **config_parameters['dataloader_args'])
    model = getattr(models, config_parameters['model'])(
        inputdim=inputdim,
        output_size=n_labels,
        **config_parameters['model_args'])
    if 'pretrain' in config_parameters:
        logger.info("Loading pretrained model {}".format(
            config_parameters['pretrain']))
        pretrained_model = torch.load(config_parameters['pretrain'],
                                      map_location=lambda st, loc: st)
        # Attention-style models expose .lstm, others .net
        if 'Attn' in pretrained_model.__class__.__name__:
            model.lstm.load_state_dict(pretrained_model.lstm.state_dict())
        else:
            model.net.load_state_dict(pretrained_model.net.state_dict())
    logger.info("<== Model ==>")
    for line in pformat(model).split('\n'):
        logger.info(line)
    criterion = getattr(
        losses, config_parameters['loss'])(**config_parameters['loss_args'])
    # The criterion may carry learnable parameters, so optimize them too.
    optimizer = getattr(torch.optim, config_parameters['optimizer'])(
        list(model.parameters()) + list(criterion.parameters()),
        **config_parameters['optimizer_args'])
    poolingfunction = parse_poolingfunction(
        config_parameters['poolingfunction'])
    criterion = criterion.to(device)
    model = model.to(device)

    def _train_batch(_, batch):
        # One optimization step over a single batch; returns scalar loss.
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            outputs, targets = Runner._forward(model, batch, poolingfunction)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            return loss.item()

    def _inference(_, batch):
        # Forward pass only (for validation metrics).
        model.eval()
        with torch.no_grad():
            return Runner._forward(model, batch, poolingfunction)

    def meter_transform(output):
        y_pred, y = output
        # y_pred is of shape [Bx2] (0 = MSE, 1 = BCE)
        # y is of shape [Bx2]      (0 = MSE, 1 = BCE)
        # Binary head -> thresholded prediction + integer target.
        return torch.sigmoid(y_pred[:, 1]).round(), y[:, 1].long()

    precision = Precision(output_transform=meter_transform, average=False)
    recall = Recall(output_transform=meter_transform, average=False)
    F1 = (precision * recall * 2 / (precision + recall)).mean()
    metrics = {
        'Loss': Loss(criterion),
        'Recall': Recall(output_transform=meter_transform, average=True),
        'Precision': Precision(output_transform=meter_transform,
                               average=True),
        # MAE on the regression head (column 0).
        'MAE': MeanAbsoluteError(
            output_transform=lambda out: (out[0][:, 0], out[1][:, 0])),
        'F1': F1
    }
    train_engine = Engine(_train_batch)
    inference_engine = Engine(_inference)
    for name, metric in metrics.items():
        metric.attach(inference_engine, name)
    RunningAverage(output_transform=lambda x: x).attach(
        train_engine, 'run_loss')
    pbar = ProgressBar(persist=False)
    pbar.attach(train_engine, ['run_loss'])
    scheduler = getattr(torch.optim.lr_scheduler,
                        config_parameters['scheduler'])(
                            optimizer, **config_parameters['scheduler_args'])
    early_stop_handler = EarlyStopping(
        patience=5,
        score_function=lambda engine: -engine.state.metrics['Loss'],
        trainer=train_engine)
    inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       early_stop_handler)
    inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       checkpoint_handler, {
                                           'model': model,
                                           'scaler': scaler,
                                           'config': config_parameters
                                       })

    @train_engine.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Run validation after each training epoch and log all metrics.
        inference_engine.run(cv_dataloader)
        validation_string_list = [
            "Validation Results - Epoch: {:<3}".format(engine.state.epoch)
        ]
        for metric in metrics:
            validation_string_list.append("{}: {:<5.2f}".format(
                metric, inference_engine.state.metrics[metric]))
        logger.info(" ".join(validation_string_list))
        pbar.n = pbar.last_print_n = 0  # reset the progress bar

    @inference_engine.on(Events.COMPLETED)
    def update_reduce_on_plateau(engine):
        # Step the LR scheduler; ReduceLROnPlateau needs the val loss.
        val_loss = engine.state.metrics['Loss']
        if 'ReduceLROnPlateau' == scheduler.__class__.__name__:
            scheduler.step(val_loss)
        else:
            scheduler.step()

    train_engine.run(train_dataloader,
                     max_epochs=config_parameters['epochs'])
    # Return for further processing
    return outputdir
def test(self):
    """Evaluate a trained model as configured on the instance.

    Depending on the ``num_*`` settings, performs: image synthesis (pure
    and cross-modal), latent traversal, and repeated evaluation of two
    disentanglement metrics, logging everything to the record file.
    """
    # prepare dataloader (iterable)
    print('Start loading data...')
    self.data_loader = create_dataloader(self.args)
    print('...done')

    # iterator from dataloader
    iterator = iter(self.data_loader)
    iter_per_epoch = len(iterator)

    #----#

    # image synthesis
    if self.num_synth > 0:
        prn_str = 'Start doing image synthesis...'
        print(prn_str)
        self.dump_to_record(prn_str)
        for ii in range(self.num_synth):
            # save the pure-synthesis images
            self.save_synth_pure(str(self.ckpt_load_iter) + '_' + str(ii),
                                 howmany=100)
            # save the cross-modal-synthesis images
            self.save_synth_cross_modal(
                str(self.ckpt_load_iter) + '_' + str(ii))

    # latent traversal
    if self.num_trvsl > 0:
        prn_str = 'Start doing latent traversal...'
        print(prn_str)
        self.dump_to_record(prn_str)
        self.save_traverse_new(self.ckpt_load_iter, self.num_trvsl,
                               limb=-3, limu=3, inter=0.1)

    # metric1: repeated runs, collecting scores and confusion-style C arrays
    if self.num_eval_metric1 > 0:
        prn_str = 'Start evaluating metric1...'
        print(prn_str)
        self.dump_to_record(prn_str)
        #
        metric1s = np.zeros(self.num_eval_metric1)
        C1s = np.zeros(
            [self.num_eval_metric1, self.z_dim, len(self.latent_sizes)])
        for ii in range(self.num_eval_metric1):
            metric1s[ii], C1s[ii] = self.eval_disentangle_metric1()
            prn_str = 'eval metric1: %d/%d done' % \
                (ii+1, self.num_eval_metric1)
            print(prn_str)
            self.dump_to_record(prn_str)
        #
        prn_str = 'metric1:\n' + str(metric1s)
        prn_str += '\nC1:\n' + str(C1s)
        print(prn_str)
        self.dump_to_record(prn_str)

    # metric2: same protocol as metric1
    if self.num_eval_metric2 > 0:
        prn_str = 'Start evaluating metric2...'
        print(prn_str)
        self.dump_to_record(prn_str)
        #
        metric2s = np.zeros(self.num_eval_metric2)
        C2s = np.zeros(
            [self.num_eval_metric2, self.z_dim, len(self.latent_sizes)])
        for ii in range(self.num_eval_metric2):
            metric2s[ii], C2s[ii] = self.eval_disentangle_metric2()
            prn_str = 'eval metric2: %d/%d done' % \
                (ii+1, self.num_eval_metric2)
            print(prn_str)
            self.dump_to_record(prn_str)
        #
        prn_str = 'metric2:\n' + str(metric2s)
        prn_str += '\nC2:\n' + str(C2s)
        print(prn_str)
        self.dump_to_record(prn_str)
""" od = DetectionModel(opt.model_name, opt.model_version, opt) """ Init data """ with open(opt.data) as f: data = yaml.load(f, Loader=yaml.FullLoader) # model dict path = data['test'] nc = 1 if opt.single_cls else int(data['nc']) # number of classes iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for [email protected]:0.95 niou = iouv.numel() dataloader = create_dataloader(path, opt.input_size, opt.batch_size, 32, opt, pad=0.5, rect=False)[0] """ Make inference """ with mlflow.start_run(experiment_id=DETECTION_EXPERIMENT_ID, run_name=RUN_NAME): seen = 0 stats = [] for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader)): img, targets = img.to(device), targets.to(device) nb, _, height, width = img.shape # batch size, channels, height, width
def main():
    """Train a DCGAN: alternating discriminator and generator updates.

    Each batch: (1) update the discriminator on real images and on
    detached fakes (gradients from the two backward calls accumulate
    before the optimizer step), then (2) update the generator against the
    freshly updated discriminator using real labels. After each epoch,
    images from a fixed noise vector are saved for visual tracking and
    both networks are checkpointed.
    """
    # Instantiate DataLoader
    dataloader = create_dataloader()

    # Instantiate Model
    generator = Generator().to(device)
    discriminator = Discriminator().to(device)

    # Multi-gpu is desired
    if device.type == 'cuda' and NGPU > 0:
        generator = nn.DataParallel(generator, list(range(NGPU)))
        discriminator = nn.DataParallel(discriminator, list(range(NGPU)))

    # Set criterion function
    criterion = F.binary_cross_entropy

    # Set reference latent vector for training-time evaluation
    fixed_noise = torch.randn(64, LATENT_DIM, 1, 1, device=device)

    # Instantiate optimizer
    optim_generator = optim.Adam(generator.parameters(), lr=LR,
                                 betas=(BETA1, 0.999))
    optim_discriminator = optim.Adam(discriminator.parameters(), lr=LR,
                                     betas=(BETA1, 0.999))

    # Training status tracker (img_list is kept for API parity; only the
    # loss curves are consumed by display_loss below)
    img_list = []
    D_loss = []
    G_loss = []

    # Announce training
    print(f"Begin training DCGAN on {DATASET}")

    # Train
    for epoch in range(EPOCHS):
        for i, real_data in enumerate(dataloader, 0):
            # Format real image batch
            real_data = real_data[0].to(device)
            label = torch.full((BATCH_SIZE, ), REAL_LABEL, device=device)

            # Forward pass discriminator on real data
            disc_output = discriminator(real_data).squeeze()
            # Calculate discriminator real_loss
            disc_real_loss = criterion(disc_output, label)
            # Backward propagate real_loss (grads cleared first; the fake
            # pass below adds onto these gradients before the step)
            discriminator.zero_grad()
            disc_real_loss.backward()

            # Generate fake image batch
            noise = torch.randn(BATCH_SIZE, LATENT_DIM, 1, 1, device=device)
            fake_data = generator(noise)
            label.fill_(FAKE_LABEL)

            # Forward pass discriminator on fakes; detach() blocks gradient
            # flow into the generator during the discriminator update
            disc_output = discriminator(fake_data.detach()).squeeze()
            # Calculate discriminator fake_loss and total loss
            disc_fake_loss = criterion(disc_output, label)
            disc_loss = disc_real_loss + disc_fake_loss
            D_loss.append(disc_loss.item())
            # Backward propagate fake_loss
            disc_fake_loss.backward()
            # Update discriminator parameters
            optim_discriminator.step()

            # Forward pass fake_data through the *updated* discriminator
            disc_output = discriminator(fake_data).squeeze()
            # Fill label in the generator's perspective (fakes labeled real)
            label.fill_(REAL_LABEL)
            # Calculate generator loss
            gen_loss = criterion(disc_output, label)
            G_loss.append(gen_loss.item())
            # Backward propagate generator
            generator.zero_grad()
            gen_loss.backward()
            # Update generator parameters
            optim_generator.step()

            # Print training status
            if i % PRINT_EVERY == 0:
                message = f'Epochs: {epoch+1:02d}/{EPOCHS:02d}\tBatch: {i+1:04d}/{len(dataloader):04d}\tdisc_loss: {disc_loss.item():.4f}\tgen_loss: {gen_loss.item():.4f}'
                print(message, end='\r')
                sys.stdout.flush()

        # Evaluate training status by generating images on a fixed noise
        with torch.no_grad():
            fake_image = generator(fixed_noise)
        display_batch(fake_image,
                      f'fixed-noise-{DATASET}-z{LATENT_DIM}-e{epoch+1:02d}')

        # Create checkpoints (whole-module save, one pair per epoch)
        torch.save(
            discriminator,
            f'saved_model/discriminator-{DATASET}-z{LATENT_DIM}-e{epoch+1:02d}'
        )
        torch.save(
            generator,
            f'saved_model/generator-{DATASET}-z{LATENT_DIM}-e{epoch+1:02d}')

    display_loss(D_loss, G_loss)
def do_train(args):
    """Pretrain ERNIE-Health (ELECTRA-style generator + discriminator).

    Builds or restores the model (from a pretrained identifier or a local
    checkpoint directory), constructs the medical-corpus dataloader,
    optimizer (AdamW with linear warmup/decay, global-norm clipping,
    selective weight decay) and optionally AMP, then runs the training
    loop with periodic logging (VisualDL on rank 0) and checkpointing.
    """
    # Static graph unless eager mode was requested (conditional-expression
    # statement kept from the original code).
    paddle.enable_static() if not args.eager_run else None
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initialize a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models:
        # Fresh start from a known pretrained configuration.
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-generator']))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-discriminator']))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint: run_states.json tells us which pretrained
            # configuration the saved weights correspond to.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            with open(
                    os.path.join(args.model_name_or_path, 'run_states.json'),
                    'r') as f:
                config_dict = json.load(f)
                model_name = config_dict['model_name']
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-generator']))
                discriminator = ErnieHealthDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-discriminator']))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     'model_state.pdparams')))
            else:
                raise ValueError(
                    'initialize a model from ckpt need model_name '
                    'in model_config_file. The supported model_name '
                    'are as follows: {}'.format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                'initialize a model need identifier or the '
                'directory of storing model. if use identifier, the supported model '
                'identifiers are as follows: {}, if use directory, '
                'make sure set init_from_ckpt as True'.format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ErnieHealthPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config['vocab_size'],
        model.gen_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    logger.info('start load data : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    train_dataset = MedicalCorpus(data_path=args.input_dir,
                                  tokenizer=tokenizer)
    logger.info('load data done, total : %s s' %
                (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm_prob=args.mlm_prob)

    # NOTE(review): `args.device in 'gpu'` is a substring test (e.g. 'g'
    # would also match) — probably intended as `args.device == 'gpu'`.
    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        mode='train',
        use_gpu=True if args.device in 'gpu' else False,
        data_collator=data_collator)

    # max_steps wins when positive; num_epochs is then recomputed to match.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_epochs)
    args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    logger.info('start train : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    trained_global_step = global_step = 0
    # Accumulated (t_loss) and last-logged (log_loss) per-objective losses.
    t_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    log_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    loss_list = defaultdict(list)
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        # Resume optimizer state and step counter from the checkpoint.
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, 'model_state.pdopt')))
        trained_global_step = global_step = config_dict['global_step']
        if trained_global_step < num_training_steps:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s'
                % (trained_global_step, trained_global_step + 1))
        else:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !'
                % (trained_global_step, num_training_steps))
            exit(0)

    if paddle.distributed.get_rank() == 0:
        writer = LogWriter(os.path.join(args.output_dir, 'loss_log'))

    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader):
            # Fast-forward through batches already covered by the checkpoint.
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            masked_input_ids, input_ids, gen_labels = batch

            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                        input_ids=masked_input_ids,
                        raw_input_ids=input_ids,
                        generator_labels=gen_labels)
                    loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                        gen_logits, gen_labels, logits_rtd, logits_mts,
                        logits_csp, disc_labels, masks)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                    input_ids=masked_input_ids,
                    raw_input_ids=input_ids,
                    generator_labels=gen_labels)
                loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                    gen_logits, gen_labels, logits_rtd, logits_mts,
                    logits_csp, disc_labels, masks)
                loss.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                # Mean loss since the last log point, per objective.
                local_loss = dict([
                    (k, (t_loss[k] - log_loss[k]) / args.logging_steps)
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                ])
                if paddle.distributed.get_world_size() > 1:
                    # Gather losses from all ranks; rank 0 logs the mean.
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']:
                        paddle.distributed.all_gather(loss_list[k],
                                                      local_loss[k])
                    if paddle.distributed.get_rank() == 0:
                        tmp_loss = dict([
                            (k, float((paddle.stack(loss_list[k]).sum() /
                                       len(loss_list[k])).numpy()))
                            for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                        ])
                        log_str = (
                            'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                            'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                            'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                        ).format(
                            global_step, num_training_steps, epoch, step,
                            tmp_loss['loss'], tmp_loss['gen'],
                            tmp_loss['rtd'], tmp_loss['mts'],
                            tmp_loss['csp'], optimizer.get_lr(),
                            (time.time() - tic_train) / args.logging_steps)
                        logger.info(log_str)
                        log_list.append(log_str)
                        # rtd/mts are rescaled for readability on the charts.
                        writer.add_scalar('generator_loss', tmp_loss['gen'],
                                          global_step)
                        writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50,
                                          global_step)
                        writer.add_scalar('mts_loss', tmp_loss['mts'] * 20,
                                          global_step)
                        writer.add_scalar('csp_loss', tmp_loss['csp'],
                                          global_step)
                        writer.add_scalar('total_loss', tmp_loss['loss'],
                                          global_step)
                        writer.add_scalar('lr', optimizer.get_lr(),
                                          global_step)
                    loss_list = defaultdict(list)
                else:
                    local_loss = dict([(k, v.numpy()[0])
                                       for k, v in local_loss.items()])
                    log_str = (
                        'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                        'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                        'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                    ).format(global_step, num_training_steps, epoch, step,
                             local_loss['loss'], local_loss['gen'],
                             local_loss['rtd'], local_loss['mts'],
                             local_loss['csp'], optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    logger.info(log_str)
                    log_list.append(log_str)
                    loss_dict = {
                        'generator_loss': local_loss['gen'],
                        'rtd_loss': local_loss['rtd'] * 50,
                        'mts_loss': local_loss['mts'] * 20,
                        'csp_loss': local_loss['csp']
                    }
                    for k, v in loss_dict.items():
                        writer.add_scalar('loss/%s' % k, v, global_step)
                    writer.add_scalar('total_loss', local_loss['loss'],
                                      global_step)
                    writer.add_scalar('lr', optimizer.get_lr(), global_step)
                log_loss = dict(t_loss)
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    # NOTE(review): the checkpoint *directory* is named
                    # 'model_<step>.pdparams' — unusual but preserved.
                    output_dir = os.path.join(
                        args.output_dir,
                        'model_%d.pdparams' % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        'model_name': model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        'global_step': global_step,
                        'epoch': epoch,
                        'step': step,
                    }
                    with open(os.path.join(output_dir, 'model_config.json'),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, 'run_states.json'),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, 'model_state.pdparams'))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir,
                                             'model_state.pdopt'))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, 'train.log'),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            if global_step >= num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    writer.close()
                return
import warnings

# Silence warnings before the heavyweight imports so their import-time
# deprecation chatter is suppressed as well.
warnings.simplefilter("ignore")
import torchvision
from fastai.vision.all import *
from fastbook import *
from model_wrapped import FineTuneModel, ModelInterface
from dataset import create_dataloader

if __name__ == "__main__":
    # Fetch the two-class MNIST sample and anchor fastai's path display.
    mnist_path = untar_data(URLs.MNIST_SAMPLE)
    Path.BASE_PATH = mnist_path

    labels_csv = mnist_path.ls()[1]   # second entry of the dataset root
    dataset_root = mnist_path
    resume_from = ''                  # set to a saved-model path to resume
    epochs = 10

    train_loader, val_loader, n_classes = create_dataloader(
        csv_file=labels_csv, root_dir=dataset_root)

    if resume_from:
        # Resume a previously saved (already wrapped) model.
        backbone = torch.load(resume_from)
    else:
        # Start from an ImageNet-pretrained ResNeXt-50 and re-head it.
        backbone = torchvision.models.resnext50_32x4d(pretrained=True)
        backbone = FineTuneModel(backbone, num_classes=n_classes)

    wrapped = ModelInterface(model=backbone)
    wrapped.train(train_loader, val_loader, num_epochs=epochs)
def train(config=None, config_test=None):
    """Train the model end to end with augmentation, LR scheduling and
    optional per-epoch testing/checkpointing.

    Parameters
    ----------
    config : optional
        Training configuration object; when None it is loaded via
        parse_train_config().
    config_test : optional
        Configuration forwarded verbatim to test() for evaluation.
    """
    # Let cuDNN benchmark conv algorithms (fastest for fixed-size inputs).
    torch.backends.cudnn.benchmark = True
    config = parse_train_config() if not config else config
    # Augmentation pipeline: random resized crop, then occasional blur,
    # distortion and sharpen/emboss/contrast, then normalize + to-tensor.
    # additional_targets applies the same transform to the right image and
    # to both normal maps so stereo pairs stay consistent.
    transform = A.Compose([
        M.MyRandomResizedCrop(width=config.IMAGE_SIZE,
                              height=config.IMAGE_SIZE),
        A.OneOf([
            A.MotionBlur(p=0.2),
            A.MedianBlur(blur_limit=3, p=0.1),
            A.Blur(blur_limit=3, p=0.1),
        ], p=0.2),
        A.OneOf([
            M.MyOpticalDistortion(p=0.3),
            M.MyGridDistortion(p=0.1),
        ], p=0.2),
        A.OneOf([
            # NOTE(review): IAA* transforms were removed in newer
            # albumentations releases — confirm the pinned version.
            A.IAASharpen(),
            A.IAAEmboss(),
            A.RandomBrightnessContrast(),
        ], p=0.3),
        A.Normalize(),
        M.MyToTensorV2(),
    ],
        additional_targets={
            'right_img': 'image',
            'left_normal': 'normal',
            'right_normal': 'normal',
        })
    # Only the (train) dataloader is used here; the first return value is
    # discarded.
    _, dataloader = create_dataloader(config.DATASET_ROOT,
                                      config.JSON_PATH,
                                      batch_size=config.BATCH_SIZE,
                                      transform=transform,
                                      workers=config.WORKERS,
                                      pin_memory=config.PIN_MEMORY,
                                      shuffle=config.SHUFFLE)
    model = Model()
    model.apply(init_weights)
    # Optimize only parameters that require gradients (frozen ones excluded).
    solver = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=config.LEARNING_RATE,
                              betas=config.BETAS,
                              eps=config.EPS,
                              weight_decay=config.WEIGHT_DECAY)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        solver, milestones=config.MILESTONES, gamma=config.GAMMA)
    model = model.to(DEVICE)
    loss_fn = LossFunction()
    # Optionally resume: load_checkpoint returns the epoch to restart from.
    epoch_idx = 0
    if config.CHECKPOINT_FILE and config.LOAD_MODEL:
        epoch_idx, model = load_checkpoint(model, config.CHECKPOINT_FILE,
                                           DEVICE)
    # Per-run output directory named after a sanitized ISO timestamp.
    output_dir = os.path.join(
        config.OUT_PATH, re.sub("[^0-9a-zA-Z]+", "-", dt.now().isoformat()))
    for epoch_idx in range(epoch_idx, config.NUM_EPOCHS):
        metric_fn = MetricFunction(config.BATCH_SIZE)
        model.train()
        train_one_epoch(model, dataloader, loss_fn, metric_fn, solver,
                        epoch_idx)
        print_single_error(epoch_idx, loss_fn.show(), metric_fn.show())
        lr_scheduler.step()
        if config.TEST:
            test(model, config_test)
        if config.SAVE_MODEL:
            save_checkpoint(epoch_idx, model, output_dir)
    # Presumably: guarantee at least one final test/checkpoint when the
    # per-epoch flags were disabled above — confirm this is intentional.
    if not config.TEST:
        test(model, config_test)
    if not config.SAVE_MODEL:
        save_checkpoint(epoch_idx, model, output_dir)
if __name__=="__main__":
    # Parse CLI options and echo them for reproducibility.
    opt = parse_args()
    print(opt)
    print('data path: ', opt.data_path)
    # Split name is taken from the second-to-last path component,
    # e.g. ".../<split>/<file>" -> "<split>".
    data_split = opt.data_path.split('/')[-2]
    print('data_split', data_split)
    # replace_tokens = ["@R_%d@"%x for x in range(0,opt.num_replacements+1)]
    # Fixed pool of placeholder tokens @R_0@ .. @R_999@ (the commented line
    # above shows the earlier, CLI-driven sizing).
    replace_tokens = ["@R_%d@"%x for x in range(1000)]
    model = Code2Seq.load_from_checkpoint(checkpoint_path=opt.expt_dir)
    # Deterministic loader: random-context and shuffle both disabled,
    # single worker.
    data_loader, n_samples = create_dataloader(
        opt.data_path,
        model.hyperparams.max_context,
        False,
        False,
        opt.batch_size,
        1,
    )
    # Vocabulary pickle holds 'token_to_id' and 'label_to_id'; build the
    # inverse maps for decoding model output.
    # NOTE(review): the file handle given to pickle.load is never closed.
    vocab = pickle.load(open(opt.vocab, 'rb'))
    token_to_id = vocab['token_to_id']
    id_to_token = {token_to_id[t]:t for t in token_to_id}
    print('length: ', len(id_to_token))
    label_to_id = vocab['label_to_id']
    id_to_label = {label_to_id[t]:t for t in label_to_id}
    # if data_split == 'test' and opt.exact_matches:
    #     print('Reducing dataset...')
    #     li_exact_matches = get_exact_matches(data, model, input_vocab, output_vocab, opt, device)
    #     with open('/mnt/outputs/exact_matches_idxs.json', 'w') as f:
args = parser.parse_args() return args def create_datafile(data_path, exact_matches, split): new_data_path = os.path.join(data_path, 'small.{}.c2s'.format(split)) lines = open(os.path.join(data_path, 'data.{}.c2s'.format(split)), 'r') new_file = open(new_data_path, 'w') for line in lines: if line.split()[0] in exact_matches: new_file.write(line) print("Saved exact matches.") if __name__ == '__main__': args = parse_args() model = Code2Seq.load_from_checkpoint(checkpoint_path=args.checkpoint) data_loader, n_samples = create_dataloader( os.path.join(args.orig_data_path, args.split), model.hyperparams.max_context, False, False, args.batch_size, 1) vocab = pickle.load(open(args.vocab_path, 'rb')) label_to_id = vocab['label_to_id'] id_to_label = {label_to_id[l]: l for l in label_to_id} li_exact_matches = get_exact_matches(data_loader, n_samples, model, id_to_label) print(li_exact_matches) create_datafile(args.data_path, li_exact_matches, args.split)
from datetime import timedelta, datetime, tzinfo
import pytz
import pandas as pd

if __name__ == "__main__":
    batch_size = 64
    # Experiment tag used in checkpoint/run names.
    exp_id = "base_1_5_2"
    # Timestamp in Pacific time so checkpoint directory names sort
    # chronologically for this team's runs.
    start_time = datetime.now().astimezone(
        pytz.timezone("America/Los_Angeles"))
    start_time_string = start_time.strftime("%Y-%m-%d-%H-%M-%S")
    # Checkpoint to resume from; uncomment the empty string below to train
    # from scratch instead.
    checkpoint = "/home/ubuntu/Utterance2Phoneme/checkpoints/base_1_5_2_2020-03-31-17-02-51/013_9.161844"
    # checkpoint = ""
    print(f"Start Time: {start_time_string}")
    # Dev set: fixed order for reproducible validation metrics.
    valid_dataloader = create_dataloader("./data/wsj0_dev.npy",
                                         "./data/wsj0_dev_merged_labels.npy",
                                         batch_size=batch_size,
                                         shuffle=False)
    train_dataloader = create_dataloader("./data/wsj0_train",
                                         "./data/wsj0_train_merged_labels.npy",
                                         batch_size=batch_size,
                                         shuffle=True)
    # Test set has no labels (None) and is flagged test=True.
    test_dataloader = create_dataloader("./data/wsj0_test",
                                        None,
                                        batch_size=batch_size,
                                        test=True,
                                        shuffle=False)
    # BiLSTM(input_dim, hidden_dim, n_classes, n_layers) — presumably 40
    # filterbank features and 47 phoneme classes; TODO confirm against the
    # BiLSTM definition.
    model = BiLSTM(40, 256, 47, 5, use_gpu=True)
    # model = Model(40, 47, 256)
def pt_2():
    """Part 2: train a small CNN to regress 58 facial keypoints
    (116 normalized coordinates) per image, plot the loss curves, then
    visualize first-layer conv filters and per-image predictions.

    NOTE(review): indentation was lost in this chunk; the visualization code
    after the loss plot is assumed to still be inside pt_2 because it uses
    its locals (net, loss, W, H, epochs) — confirm against the original file.
    """
    epochs = 25
    H = 120  # target image height
    W = 160  # target image width
    # Samples 1-32 train, 33-40 validate.
    face_dataset = FaceDataset(1, 32, root_dir, W, H, CustomTransforms())
    training = create_dataloader(face_dataset, 5)
    validation_dataset = FaceDataset(33, 40, root_dir, W, H,
                                     CustomTransforms())
    validation = create_dataloader(validation_dataset, 1)
    #test_dataloader(training, W, H)
    net = Net()
    loss = nn.MSELoss()
    opt = Adam(net.parameters(), lr=0.001)
    training_losses = []
    validation_losses = []
    for epoch in range(epochs):
        epoch_loss = torch.zeros((1, 1))
        for i, (images, labels) in enumerate(training):
            prediction = net(images)
            # 58 keypoints x 2 coordinates = 116 regression targets.
            output = loss(prediction,
                          labels.type(torch.float32).view(-1, 116))
            # NOTE(review): accumulating the un-detached loss keeps every
            # batch's autograd graph alive for the whole epoch; consider
            # `epoch_loss += output.item()`.
            epoch_loss += output
            output.backward()
            opt.step()
            opt.zero_grad()
        epoch_loss = epoch_loss / len(face_dataset)
        # NOTE(review): prints the last batch index `i`, not `epoch`.
        print("EPOCH " + str(i) + " LOSS: " + str(epoch_loss))
        training_losses.append([epoch, epoch_loss.item() * 100])
        epoch_loss = torch.zeros((1, 1), requires_grad=False)
        for i, (images, labels) in enumerate(validation):
            prediction = net(images)
            output = loss(prediction,
                          labels.type(torch.float32).view(-1, 116))
            epoch_loss += output
            # NOTE(review): no backward() in this pass, so zero_grad() is a
            # no-op; this loop should arguably run under torch.no_grad().
            opt.zero_grad()
        # NOTE(review): normalizes by the TRAINING set size; presumably
        # len(validation_dataset) was intended — confirm.
        epoch_loss = epoch_loss / len(face_dataset)
        validation_losses.append([epoch, epoch_loss.item() * 100])
    training_losses = np.array(training_losses)
    validation_losses = np.array(validation_losses)
    plt.plot(training_losses[:, 0], training_losses[:, 1])
    plt.plot(validation_losses[:, 0], validation_losses[:, 1])
    plt.plot()
    plt.savefig('results/pt_2/epoch_loss_decrease.png')
    plt.show()
    """
    Handy visualization code copied and pasted from:
    https://colab.research.google.com/github/Niranjankumar-c/DeepLearning-PadhAI/blob/master/DeepLearning_Materials/6_VisualizationCNN_Pytorch/CNNVisualisation.ipynb#scrollTo=cWmfCalUvzbS
    as linked on the piazza.
    """
    def plot_filters_single_channel(i, t):
        # Plot every (out_channel, in_channel) kernel of weight tensor `t`
        # in a 12-column grid, standardized for display.
        # NOTE(review): parameter `i` is shadowed by the loop variable
        # below, so the savefig filename uses the last kernel row index,
        # not the conv-layer index passed in — confirm intent.
        #kernels depth * number of kernels
        nplots = t.shape[0] * t.shape[1]
        ncols = 12
        nrows = 1 + nplots // ncols
        #convert tensor to numpy image
        npimg = np.array(t.numpy(), np.float32)
        count = 0
        fig = plt.figure(figsize=(ncols, nrows))
        #looping through all the kernels in each channel
        for i in range(t.shape[0]):
            for j in range(t.shape[1]):
                count += 1
                ax1 = fig.add_subplot(nrows, ncols, count)
                # Standardize, shift by 0.5 and clamp to [0, 1] for imshow.
                npimg = np.array(t[i, j].numpy(), np.float32)
                npimg = (npimg - np.mean(npimg)) / np.std(npimg)
                npimg = np.minimum(1, np.maximum(0, (npimg + 0.5)))
                ax1.imshow(npimg)
                ax1.set_title(str(i) + ',' + str(j))
                ax1.axis('off')
                ax1.set_xticklabels([])
                ax1.set_yticklabels([])
        plt.tight_layout()
        plt.savefig(str(i) + 'weight_visualization.png')
        plt.show()
    # Visualize only the first conv layer's filters.
    for i in range(len(net.conv)):
        if i == 0:
            plot_filters_single_channel(i, net.conv[i].weight.data)
    # Re-create the validation set and overlay predicted (red) vs ground
    # truth (green) keypoints on each image.
    validation_dataset = FaceDataset(33, 40, root_dir, W, H,
                                     CustomTransforms())
    dataloader = create_dataloader(validation_dataset, 1)
    with torch.no_grad():
        for i, (image, label) in enumerate(dataloader):
            prediction = net(image)
            output = loss(prediction,
                          label.type(torch.float32).view(-1, 116))
            print("LOSS FOR IMAGE IS: " + str(output))
            # Reshape flat 116-vector back to (58 keypoints, 2 coords);
            # coordinates are normalized, so scale by W/H for plotting.
            prediction = prediction.view(-1, 58, 2)
            plt.imshow(image[0][0], cmap='gray')
            plt.scatter(prediction[0, :, 0] * W,
                        prediction[0, :, 1] * H,
                        s=10,
                        marker='o',
                        c='r')
            plt.scatter(label[0, :, 0] * W,
                        label[0, :, 1] * H,
                        marker='o',
                        color='green')
            plt.savefig('results/prediction_' + str(i) + '_' + str(epochs))
            plt.show()
    def evaluate(self,
                 experiment_path: str,
                 outputfile: str = 'results.csv',
                 **kwargs):
        """Prints out the stats for the given model ( MAE, RMSE, F1, Pre, Rec)

        Loads the saved config/model/scaler from *experiment_path*, runs the
        model over the dev set, and writes macro precision/recall/F1 plus
        MAE/RMSE of the PHQ8 score to *outputfile* (inside the experiment
        directory). Returns the one-row results DataFrame.
        """
        # Saved artifacts are matched by glob prefix; map_location forces
        # CPU deserialization regardless of the training device.
        config = torch.load(glob.glob(
            "{}/run_config*".format(experiment_path))[0],
                            map_location=lambda storage, loc: storage)
        model = torch.load(glob.glob(
            "{}/run_model*".format(experiment_path))[0],
                           map_location=lambda storage, loc: storage)
        scaler = torch.load(glob.glob(
            "{}/run_scaler*".format(experiment_path))[0],
                            map_location=lambda storage, loc: storage)
        # kwargs override whatever was stored in the run config.
        config_parameters = dict(config, **kwargs)
        dev_features = config_parameters['devfeatures']
        dev_label_df = pd.read_csv(
            config_parameters['devlabels']).set_index('Participant_ID')
        # String index so it matches the (string) participant ids in features.
        dev_label_df.index = dev_label_df.index.astype(str)
        # participant_id -> (PHQ8_Score, PHQ8_Binary)
        dev_labels = dev_label_df.loc[:, ['PHQ8_Score', 'PHQ8_Binary'
                                          ]].T.apply(tuple).to_dict()
        outputfile = os.path.join(experiment_path, outputfile)
        y_score_true, y_score_pred, y_binary_pred, y_binary_true = [], [], [], []
        poolingfunction = parse_poolingfunction(
            config_parameters['poolingfunction'])
        # batch_size=1: one participant at a time, no shuffling, features
        # scaled with the scaler fitted at training time.
        dataloader = create_dataloader(dev_features,
                                       dev_labels,
                                       transform=scaler.transform,
                                       batch_size=1,
                                       num_workers=1,
                                       shuffle=False)
        model = model.to(device).eval()
        with torch.no_grad():
            for batch in dataloader:
                # output[:, 0] is the regression head (PHQ8 score);
                # output[:, 1] is the binary-classification logit.
                output, target = Runner._forward(model, batch,
                                                 poolingfunction)
                y_score_pred.append(output[:, 0].cpu().numpy())
                y_score_true.append(target[:, 0].cpu().numpy())
                y_binary_pred.append(
                    torch.sigmoid(output[:, 1]).round().cpu().numpy())
                y_binary_true.append(target[:, 1].cpu().numpy())
        y_score_true = np.concatenate(y_score_true)
        y_score_pred = np.concatenate(y_score_pred)
        y_binary_pred = np.concatenate(y_binary_pred)
        y_binary_true = np.concatenate(y_binary_true)
        with open(outputfile, 'w') as wp:
            pre = metrics.precision_score(y_binary_true,
                                          y_binary_pred,
                                          average='macro')
            rec = metrics.recall_score(y_binary_true,
                                       y_binary_pred,
                                       average='macro')
            # NOTE(review): harmonic mean computed by hand divides by zero
            # when pre + rec == 0; metrics.f1_score would be safer.
            f1 = 2 * pre * rec / (pre + rec)
            rmse = np.sqrt(
                metrics.mean_squared_error(y_score_true,
                                           y_score_pred))
            mae = metrics.mean_absolute_error(y_score_true, y_score_pred)
            df = pd.DataFrame(
                {
                    'precision': pre,
                    'recall': rec,
                    'F1': f1,
                    'MAE': mae,
                    'RMSE': rmse
                },
                index=["Macro"])
            # NOTE(review): index=False drops the "Macro" row label set above.
            df.to_csv(wp, index=False)
            print(tabulate(df, headers='keys'))
        return df
    def test(self):
        """Evaluate a loaded checkpoint: synthesize images, run latent
        traversals, compute disentanglement metrics 1/2, then sweep one epoch
        of data accumulating VAE/discriminator losses and saving
        reconstructions. All progress strings are printed and appended to the
        run record via dump_to_record.
        """
        # Constant discriminator targets for the total-correlation (TC)
        # discriminator loss below: real joint -> class 0, permuted -> class 1.
        ones = torch.ones(self.batch_size, dtype=torch.long)
        zeros = torch.zeros(self.batch_size, dtype=torch.long)
        if self.use_cuda:
            ones = ones.cuda()
            zeros = zeros.cuda()
        # prepare dataloader (iterable)
        print('Start loading data...')
        self.data_loader = create_dataloader(self.args)
        print('...done')
        # iterator from dataloader
        iterator = iter(self.data_loader)
        iter_per_epoch = len(iterator)
        #----#
        # image synthesis
        # NOTE(review): guarded by num_trvsl but iterates num_synth — the
        # guard probably should be `self.num_synth > 0`; confirm.
        if self.num_trvsl > 0:
            prn_str = 'Start doing image synthesis...'
            print(prn_str)
            self.dump_to_record(prn_str)
            for ii in range(self.num_synth):
                self.save_synth(str(self.ckpt_load_iter) + '_' + str(ii),
                                howmany=100)
        # latent traversal
        if self.num_trvsl > 0:
            prn_str = 'Start doing latent traversal...'
            print(prn_str)
            self.dump_to_record(prn_str)
            # self.save_traverse_new( self.ckpt_load_iter, self.num_trvsl,
            #   limb=-4, limu=4, inter=0.1 )
            self.save_traverse_new(self.ckpt_load_iter, self.num_trvsl,
                                   limb=-16, limu=16, inter=0.2)
        # metric1: averaged over num_eval_metric1 repetitions; C1s holds the
        # per-repetition (z_dim x num_factors) contribution matrices.
        if self.num_eval_metric1 > 0:
            prn_str = 'Start evaluating metric1...'
            print(prn_str)
            self.dump_to_record(prn_str)
            #
            metric1s = np.zeros(self.num_eval_metric1)
            C1s = np.zeros(
                [self.num_eval_metric1, self.z_dim, len(self.latent_sizes)])
            for ii in range(self.num_eval_metric1):
                metric1s[ii], C1s[ii] = self.eval_disentangle_metric1()
                prn_str = 'eval metric1: %d/%d done' % \
                    (ii+1, self.num_eval_metric1)
                print(prn_str)
                self.dump_to_record(prn_str)
            #
            prn_str = 'metric1:\n' + str(metric1s)
            prn_str += '\nC1:\n' + str(C1s)
            print(prn_str)
            self.dump_to_record(prn_str)
        # metric2: same protocol as metric1 with the second disentanglement
        # metric.
        if self.num_eval_metric2 > 0:
            prn_str = 'Start evaluating metric2...'
            print(prn_str)
            self.dump_to_record(prn_str)
            #
            metric2s = np.zeros(self.num_eval_metric2)
            C2s = np.zeros(
                [self.num_eval_metric2, self.z_dim, len(self.latent_sizes)])
            for ii in range(self.num_eval_metric2):
                metric2s[ii], C2s[ii] = self.eval_disentangle_metric2()
                prn_str = 'eval metric2: %d/%d done' % \
                    (ii+1, self.num_eval_metric2)
                print(prn_str)
                self.dump_to_record(prn_str)
            #
            prn_str = 'metric2:\n' + str(metric2s)
            prn_str += '\nC2:\n' + str(C2s)
            print(prn_str)
            self.dump_to_record(prn_str)
        #----#
        # Accumulators for the data sweep: per-dimension KL averages, loss
        # running sums, and a countdown of reconstruction images to save.
        if self.losses or self.num_recon > 0:
            num_adds = 0
            loss_kl_inds = np.zeros(self.z_dim)
            losses = {}
            losses['vae_loss'] = 0.0
            losses['dis_loss'] = 0.0
            losses['recon'] = 0.0
            losses['kl'] = 0.0
            losses['tc'] = 0.0
            losses['pv_reg'] = 0.0
            cntdn = self.num_recon
        else:
            # Nothing left to compute.
            return
        prn_str = 'Start going through the entire data...'
        print(prn_str)
        self.dump_to_record(prn_str)
        for iteration in range(1, 100000000):
            # reset data iterators for each epoch
            # NOTE(review): breaking when iteration == iter_per_epoch means
            # the final batch of the epoch is never consumed (iterations
            # 1..iter_per_epoch-1 process data); confirm whether the
            # `(iteration - 1) %` form used elsewhere was intended.
            if iteration % iter_per_epoch == 0:
                # individual kls: report per-dimension averages
                loss_kl_inds /= num_adds
                prn_str = "Individual kl's:\n" + str(loss_kl_inds)
                print(prn_str)
                self.dump_to_record(prn_str)
                # losses: convert running sums to means
                losses['vae_loss'] /= num_adds
                losses['dis_loss'] /= num_adds
                losses['recon'] /= num_adds
                losses['kl'] /= num_adds
                losses['tc'] /= num_adds
                losses['pv_reg'] /= num_adds
                prn_str = "losses:\n" + str(losses)
                print(prn_str)
                self.dump_to_record(prn_str)
                break
            with torch.no_grad():
                # sample a mini-batch
                X, ids = next(iterator)  # (n x C x H x W)
                if self.use_cuda:
                    X = X.cuda()
                # enc(X)
                mu, std, logvar = self.encoder(X)
                # relevance vector: learned prior variances per latent dim
                pv, logpv = self.pvnet()
                # kl loss: KL(q(z|x) || N(0, pv)) per sample/dimension
                kls = 0.5 * ( \
                    -1 - logvar + logpv + (mu**2+std**2)/pv )  # (n x z_dim)
                loss_kl = kls.sum(1).mean()
                # reparam'ed samples
                if self.use_cuda:
                    Eps = torch.cuda.FloatTensor(mu.shape).normal_()
                else:
                    Eps = torch.randn(mu.shape)
                Z = mu + Eps * std
                # dec(Z)
                X_recon = self.decoder(Z)
                # recon loss: summed BCE averaged over the batch
                loss_recon = F.binary_cross_entropy_with_logits(
                    X_recon, X, reduction='sum').div(X.size(0))
                # dis(Z)
                DZ = self.D(Z)
                # tc loss: logit difference approximates the density ratio
                loss_tc = (DZ[:, 0] - DZ[:, 1]).mean()
                # prior variance regularizer: pulls learned variances to 1
                loss_pv_reg = ((pv - 1.0)**2).sum()
                # total loss for vae
                vae_loss = loss_recon + loss_kl + self.gamma*loss_tc + \
                    self.eta*loss_pv_reg
                # dim-wise permutated Z over the mini-batch: breaks
                # cross-dimension dependence while keeping marginals
                perm_Z = []
                for zj in Z.split(1, 1):
                    idx = torch.randperm(Z.size(0))
                    perm_zj = zj[idx]
                    perm_Z.append(perm_zj)
                Z_perm = torch.cat(perm_Z, 1)
                Z_perm = Z_perm.detach()
                # dis(Z_perm)
                DZ_perm = self.D(Z_perm)
                # discriminator loss: joint -> class 0, permuted -> class 1
                dis_loss = 0.5 * (F.cross_entropy(DZ, zeros) +
                                  F.cross_entropy(DZ_perm, ones))
                if self.losses:
                    # per-dimension KL averaged over the batch
                    loss_kl_ind = 0.5 * ( \
                        -1 - logvar + logpv + (mu**2+std**2)/pv ).mean(0)
                    loss_kl_inds += loss_kl_ind.cpu().detach().numpy()
                    #
                    losses['vae_loss'] += vae_loss.item()
                    losses['dis_loss'] += dis_loss.item()
                    losses['recon'] += loss_recon.item()
                    losses['kl'] += loss_kl.item()
                    losses['tc'] += loss_tc.item()
                    losses['pv_reg'] += loss_pv_reg.item()
                    #
                    num_adds += 1
            # print the losses
            if iteration % 100 == 0:
                prn_str = ( \
                    '[%d/%d] vae_loss: %.3f | dis_loss: %.3f\n' + \
                    '    (recon: %.3f, kl: %.3f, tc: %.3f, pv_reg: %.3f)' \
                  ) % \
                    ( iteration, iter_per_epoch,
                      vae_loss.item(), dis_loss.item(),
                      loss_recon.item(), loss_kl.item(), loss_tc.item(),
                      loss_pv_reg.item() )
                prn_str += '\n    pv = {}'.format(
                    pv.detach().cpu().numpy().round(2))
                print(prn_str)
                self.dump_to_record(prn_str)
            # save reconstructed images until the countdown is exhausted;
            # if per-batch loss accumulation is off, stop right after.
            if cntdn > 0:
                self.save_recon(iteration, X, torch.sigmoid(X_recon).data)
                cntdn -= 1
                if cntdn == 0:
                    prn_str = 'Completed image reconstruction'
                    print(prn_str)
                    self.dump_to_record(prn_str)
                    if not self.losses:
                        break
# ********************************************************************** # Wrap the msglogger into this two modules, one for tensorboard visualizarion # the other is just send the msglogger to connect other summary functions. # ********************************************************************** tflogger = config.TensorBoardLogger(msglogger.logdir) pylogger = config.PythonLogger(msglogger) if args.train: # Create dataloaders and the table for dataset size information: train_labels_name = os.listdir(os.path.join(args.label_dir, 'train')) test_label_name = os.listdir(os.path.join(args.label_dir, 'test')) train_labels = [] for i in train_labels_name: train_labels.append(os.path.join(args.label_dir, 'train', i)) test_labels = os.path.join(args.label_dir, 'test', test_label_name[0]) dataloaders, dataset_sizes = dataset.create_dataloader( args, args.data_dir, train_labels, test_labels) # Create the darknet and load weights here: model = darknet.Darknet(args.config_path, img_size=args.image_size) if args.pretrained: model.load_darknet_weights(args.weight_path) model.arch = args.arch model.dataset = args.dataset model.input_shape = (1, 3, args.image_size, args.image_size ) # For channel first. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Leverage {} device to run this task.".format(device)) if args.cpu: device = torch.device("cpu") model.to(device)