def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="data.joblib")
    parser.add_argument("--test_strat", type=int, default=0)
    parser.add_argument("--device_id", type=int, default=0)
    parser.add_argument("--num_epochs_s2cnn", type=int, default=30)
    parser.add_argument("--num_epochs_mlp", type=int, default=30)
    parser.add_argument("--batch_size_s2cnn", type=int, default=32)
    parser.add_argument("--batch_size_mlp", type=int, default=32)
    # learning rates are floats; declaring them as int would truncate 1e-3 to 0
    parser.add_argument("--init_learning_rate_s2cnn", type=float, default=1e-3)
    parser.add_argument("--learning_rate_mlp", type=float, default=1e-3)
    parser.add_argument("--learning_rate_decay_epochs", type=int, default=10)
    args = parser.parse_args()

    torch.cuda.set_device(args.device_id)

    print("evaluating on {}".format(args.test_strat))
    print("loading data...", end="")
    data, train_idxs, test_idxs = load_data(args.data_path, args.test_strat, cuda=args.device_id)
    print("done!")

    mlp = BaselineRegressor()
    s2cnn = S2CNNRegressor()
    if torch.cuda.is_available():
        for model in [mlp, s2cnn]:
            model.cuda(args.device_id)

    print("training baseline model")
    print("mlp #params: {}".format(count_params(mlp)))
    train_baseline(mlp, data,
                   IndexBatcher(train_idxs, args.batch_size_mlp, cuda=args.device_id),
                   IndexBatcher(test_idxs, args.batch_size_mlp, cuda=args.device_id),
                   args.num_epochs_mlp, args.learning_rate_mlp, args.device_id)

    print("training residual s2cnn model")
    print("s2cnn #params: {}".format(count_params(s2cnn)))
    train_s2cnn(mlp, s2cnn, data,
                IndexBatcher(train_idxs, args.batch_size_s2cnn, cuda=args.device_id),
                IndexBatcher(test_idxs, args.batch_size_s2cnn, cuda=args.device_id),
                args.num_epochs_s2cnn, args.init_learning_rate_s2cnn,
                args.learning_rate_decay_epochs, args.device_id)
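# NOTE (added): none of these snippets define count_params. A minimal sketch for
# the PyTorch call sites, assuming it counts trainable parameters:
def count_params(model):
    """Number of trainable parameters in a PyTorch module (assumed helper)."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)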
def test_conv(Base, nruns=100, device='cuda'):
    # Basic sanity check: a custom convolution should produce the same spatial
    # dimensions as the standard module, so the two convolution types can be
    # swapped conveniently.
    chi, cho, k, s = 8, 32, 3, 1
    x = torch.randn(16, chi, 512, 512)
    conv = Base(chi, cho, k, s, autopad(k))
    conv_ = nn.Conv2d(chi, cho, k, s, autopad(k))

    if 'cuda' in device:
        assert torch.cuda.is_available()
        conv.cuda().train()
        conv_.cuda().train()
        x = x.cuda()
        if torch.backends.cudnn.benchmark:
            # warm-up iterations are required for a fair comparison
            print('benchmark warm up...')
            for _ in range(50):
                _ = conv(x)
    else:
        conv.cpu().train()
        conv_.cpu().train()
        nruns = 1

    p = count_params(conv)
    p_ = count_params(conv_)
    # relative number of parameters in brackets w.r.t. nn.Conv2d
    print(f'Number of parameters: {p} ({p / p_ * 100:.2f}%)')

    # ensure the same behaviour as the standard module
    out = conv(x)
    out_ = conv_(x)
    assert out.shape == out_.shape, f'Shape mismatch, should be {out_.shape} but is {out.shape}'

    # g0 = torch.randn_like(out)
    # performance test without feature/target loading,
    # because that would require a significant amount of overhead
    start = time_synchronized()
    for _ in range(nruns):
        out = conv(x)
        for param in conv.parameters():
            param.grad = None
        out.mean().backward()  # out.backward(g0)
    end = time_synchronized()
    print(f'Forward + Backward time: {(end - start) * 1000 / nruns:.3f}ms')
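# NOTE (added): sketches of the two helpers test_conv assumes, in the style of
# common YOLOv5-like utilities (assumed, not shown in the source):
import time
import torch

def autopad(k):
    """'Same' padding for an odd kernel size k (assumed helper)."""
    return k // 2

def time_synchronized():
    """CUDA-accurate wall time: synchronize the GPU before reading the clock (assumed helper)."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()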
def test_generator(generator, true_count):
    tf.reset_default_graph()
    with get_session() as sess:
        y = generator(tf.ones((1, 96)), 3)
        cur_count = count_params()
        if cur_count != true_count:
            print('Incorrect number of parameters in generator. {0} instead of {1}. '
                  'Check your architecture.'.format(cur_count, true_count))
        else:
            print('Correct number of parameters in generator.')
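# NOTE (added): a plausible TF1-style count_params for the check above (assumed):
# it counts every element of every trainable variable in the default graph.
import numpy as np
import tensorflow as tf

def count_params():
    return int(sum(np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()))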
        x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True)
        x = self.tcn3(x)

        out = x
        out_channels = out.size(1)
        out = out.view(N, M, out_channels, -1)
        out = out.mean(3)  # Global Average Pooling (Spatial+Temporal)
        out = out.mean(1)  # Average pool number of bodies in the sequence

        out = self.fc(out)
        return out


if __name__ == "__main__":
    # For debugging purposes
    import sys
    sys.path.append('..')

    model = Model(num_class=60,
                  num_point=25,
                  num_person=2,
                  num_gcn_scales=13,
                  num_g3d_scales=6,
                  graph='graph.ntu_rgb_d.AdjMatrixGraph')

    N, C, T, V, M = 6, 3, 50, 25, 2
    x = torch.randn(N, C, T, V, M)
    model.forward(x)

    print('Model total # params:', count_params(model))
    def build(self):
        """Builds a multi-tower model."""
        with tf.device('/cpu:0'):
            assert self.batch_size % self.num_gpus == 0, (
                'Batch size must be divisible by number of GPUs')

            with tf.name_scope('Input_splits'):
                tower_inputs = [[] for i in range(self.num_gpus)]
                for inp in self.Inputs:
                    splits = tf.split(inp, self.num_gpus, name=inp.name[:-2])
                    for i, s in enumerate(splits):
                        tower_inputs[i].append(s)

            tower_outputs = []
            tower_losses = []
            tower_grads = []
            with tf.variable_scope(tf.get_variable_scope()):
                for i in range(self.num_gpus):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                            # Calculate the loss for one tower of the model. This function
                            # constructs the entire model but shares the variables across
                            # all towers.
                            outputs, losses, grads = self._build_train_tower(
                                tower_inputs[i],
                                int(self.batch_size / self.num_gpus),
                                reuse=i > 0 or self.model_built)
                            # Reuse variables for the next tower.
                            tf.get_variable_scope().reuse_variables()
                            # Save summaries from the first tower only.
                            if i == 0:
                                summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                            tower_outputs.append(outputs)
                            tower_losses.append(losses)
                            tower_grads.append(grads)

            with tf.name_scope('Concat_outputs'):
                outputs = [[] for _ in tower_outputs[0]]
                for t_outputs in tower_outputs:
                    for i, output in enumerate(t_outputs):
                        outputs[i].append(output)
                self.outputs = []
                for outs in outputs:
                    self.outputs.append(tf.concat(outs, 0))

            with tf.name_scope('Concat_losses'):
                losses = [[] for _ in range(len(tower_losses[0]))]
                for t_losses in tower_losses:
                    for i, loss in enumerate(t_losses):
                        losses[i].append(loss)

            with tf.name_scope('Average_grads'):
                var_grads = [[] for _ in range(len(tower_grads[0]))]
                for t_grads in tower_grads:
                    for i, grad in enumerate(t_grads):
                        var_grads[i].append(grad)
                avg_grads = []
                for v_grads in var_grads:
                    avg_grads.append(ops.average_gradients(v_grads))

            if self.grad_summ:
                # Add histograms for gradients.
                with tf.name_scope('Grad_summary'):
                    grads_summ = []
                    for var_grads in avg_grads:
                        for grad, var in var_grads:
                            if grad is not None:
                                grads_summ.append(tf.summary.histogram(
                                    self._remove_tower_name_prefix(var) + '/Grads', grad))
                    summaries.append(tf.summary.merge(grads_summ))

            if self.var_summ:
                # Add histograms for trainable variables.
                t_vars = tf.trainable_variables()
                with tf.name_scope('Var_summary'):
                    vars_summ = []
                    for var in t_vars:
                        vars_summ.append(tf.summary.histogram(
                            self._remove_tower_name_prefix(var), var))
                    summaries.append(tf.summary.merge(vars_summ))

            summaries += self.additional_summaries()

            self._tower_outputs(self.outputs)
            self._build_train_ops(losses, avg_grads)

            self.summary_op = tf.summary.merge(summaries, name='summary_op')
            self.saver = tf.train.Saver()
            self.model_built = True
            utils.count_params()
def main():
    args = parse_args()
    # args.dataset = "datasets"

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    # Data loading code
    img_paths = glob(r'D:\Project\CollegeDesign\dataset\Brats2018FoulModel2D\trainImage\*')
    mask_paths = glob(r'D:\Project\CollegeDesign\dataset\Brats2018FoulModel2D\trainMask\*')

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.2, random_state=41)
    print("train_num:%s" % str(len(train_img_paths)))
    print("val_num:%s" % str(len(val_img_paths)))

    # create model
    print("=> creating model %s" % args.arch)
    model = FCN.__dict__[args.arch](args)
    model = model.cuda()

    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay, nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=True, pin_memory=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size,
                                             shuffle=False, pin_memory=True, drop_last=False)

    log = pd.DataFrame(index=[],
                       columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f'
              % (train_log['loss'], train_log['iou'], val_log['loss'], val_log['iou']))

        tmp = pd.Series(
            [epoch, args.lr,
             train_log['loss'], train_log['iou'],
             val_log['loss'], val_log['iou']],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])
        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if args.early_stop is not None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
        inverse_model.load_state_dict(torch.load('./models/inverse_model.pth')['model_state_dict'])
        model = TandemNet(forward_model, inverse_model)
        optimizer = torch.optim.Adam(model.inverse_model.parameters(),
                                     lr=configs['learning_rate'], weight_decay=configs['weight_decay'])
    elif args.model in ['vae']:
        model = cVAE(configs['input_dim'], configs['latent_dim']).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=configs['learning_rate'], weight_decay=configs['weight_decay'])
    elif args.model in ['gan']:
        model = cGAN(configs['input_dim'], configs['output_dim'], configs['noise_dim']).to(DEVICE)
        model.apply(weights_init_normal)
        optimizer_G = torch.optim.Adam(model.generator.parameters(),
                                       lr=configs['g_learning_rate'], weight_decay=configs['weight_decay'])
        optimizer_D = torch.optim.Adam(model.discriminator.parameters(),
                                       lr=configs['d_learning_rate'], weight_decay=configs['weight_decay'])
        print('Model {}, Number of parameters {}'.format(args.model, count_params(model)))
        criterion = torch.nn.BCELoss()
        trainer = GANTrainer(model, optimizer_G, optimizer_D, train_loader, val_loader, test_loader,
                             criterion, configs['epochs'], args.model)
        trainer.fit()
        sys.exit(0)
    elif args.model in ['inn']:
        model = INN(configs['ndim_total'], configs['input_dim'], configs['output_dim'],
                    dim_z=configs['latent_dim']).to(DEVICE)
        print('Model {}, Number of parameters {}'.format(args.model, count_params(model)))
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=configs['learning_rate'], weight_decay=configs['weight_decay'])
        criterion = torch.nn.MSELoss()
        trainer = INNTrainer(model, optimizer, train_loader, val_loader, test_loader,
                             criterion, configs['epochs'], args.model)
        trainer.fit()
        self.outconvm1 = nn.Conv2d(64, out_chan, 1)

    def forward(self, x):
        x, y1 = self.down1(x)
        x, y2 = self.down2(x)
        x, y3 = self.down3(x)
        x, y4 = self.down4(x)
        x = F.dropout2d(F.relu(self.bn1(self.conv1(x))))
        x = F.dropout2d(F.relu(self.bn2(self.conv2(x))))
        x = self.up4(x, y4)
        x = self.up3(x, y3)
        x = self.up2(x, y2)
        x = self.up1(x, y1)
        x1 = self.outconv(x)
        return x1


if __name__ == '__main__':
    model = Unet(nn.Module)
    args = None

    # create model
    device = 'cpu'
    models = model.to(device)  # solution: 1
    model = models.cpu()
    summary(model, (4, 160, 160))
    # print(model)
    print(count_params(model))
def main():
    parser = argparse.ArgumentParser()

    # Model and data are required
    parser.add_argument("--dir_pretrained_model", type=str, required=True,
                        help="Dir containing pre-trained model (checkpoint), which may have been fine-tuned already.")

    # Required for certain modes (--resume, --do_train, --eval_during_training, --do_eval or --do_pred)
    parser.add_argument("--dir_train", type=str,
                        help="Dir containing training data (n files named <lang>.train containing unlabeled text)")
    parser.add_argument("--dir_output", type=str,
                        help="Directory in which model will be written (required if --do_train (but not --resume) or --do_pred)")
    parser.add_argument("--path_dev", type=str,
                        help="Path of 2-column TSV file containing labeled validation examples.")
    parser.add_argument("--path_test", type=str, required=False,
                        help="Path of text file containing unlabeled test examples.")

    # Execution modes
    parser.add_argument("--resume", action="store_true",
                        help="Resume training model in --dir_pretrained_model (note: --dir_output will be ignored)")
    parser.add_argument("--do_train", action="store_true", help="Run training")
    parser.add_argument("--eval_during_training", action="store_true",
                        help="Run evaluation on dev set during training")
    parser.add_argument("--do_eval", action="store_true", help="Evaluate model on dev set")
    parser.add_argument("--do_pred", action="store_true", help="Run prediction on test set")

    # Score to optimize on dev set (by early stopping)
    parser.add_argument("--score_to_optimize", choices=["track1", "track2", "track3"], default="track3",
                        help="Score to optimize on dev set during training (by early stopping).")

    # Hyperparameters
    parser.add_argument("--freeze_encoder", action="store_true",
                        help="Freeze weights of pre-trained encoder. (Note: in this case, we do not keep doing MLM.)")
    parser.add_argument("--no_mlm", action="store_true",
                        help="Do not keep doing masked language modeling (MLM) during fine-tuning.")
    parser.add_argument("--sampling_alpha", type=float, default=1.0,
                        help="Dampening factor for relative frequencies used to compute language sampling probabilities")
    parser.add_argument("--weight_relevant", type=float, default=1.0,
                        help="Relative sampling frequency of relevant languages wrt irrelevant languages")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for evaluation.")
    parser.add_argument("--seq_len", default=128, type=int,
                        help="Length of input sequences. Shorter seqs are padded, longer ones are truncated")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for AdamW optimizer.")
    parser.add_argument("--equal_betas", action='store_true',
                        help="Use beta1=beta2=0.9 for AdamW optimizer.")
    parser.add_argument("--correct_bias", action='store_true',
                        help="Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly).")
    parser.add_argument("--max_train_steps", default=1000000, type=int,
                        help="Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps.")
    parser.add_argument("--num_train_steps_per_epoch", default=1000, type=int,
                        help="Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps.")
    parser.add_argument('--grad_accum_steps', type=int, default=1,
                        help="Number of training steps (i.e. batches) to accumulate before performing a backward/update pass.")
    parser.add_argument("--num_gpus", type=int, default=-1,
                        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    args = parser.parse_args()

    # Distributed or parallel?
    if args.local_rank != -1 or args.num_gpus > 1:
        raise NotImplementedError("No distributed or parallel training available at the moment.")
    if torch.cuda.is_available():
        args.device = torch.device("cuda")
        args.n_gpu = 1
    else:
        args.device = torch.device("cpu")
        args.n_gpu = 0

    # Check execution mode
    assert args.resume or args.do_train or args.do_eval or args.do_pred
    if args.resume:
        assert not args.do_train
        assert not args.do_eval
        assert not args.do_pred

    # Load checkpoint. This contains a pre-trained model which may or
    # may not have been fine-tuned for language identification already
    logger.info("Loading checkpoint...")
    checkpoint_path = os.path.join(args.dir_pretrained_model, "checkpoint.tar")
    checkpoint_data = torch.load(checkpoint_path)
    if args.resume:
        # Check progress
        logger.info("Resuming training. Currently at training step %d" % checkpoint_data["global_step"])
        # Replace args with initial args for this job, except for
        # num_gpus, seed and model directory
        current_num_gpus = args.n_gpu
        current_dir_pretrained_model = args.dir_pretrained_model
        args = deepcopy(checkpoint_data["initial_args"])
        args.num_gpus = current_num_gpus
        args.dir_pretrained_model = current_dir_pretrained_model
        args.resume = True
        logger.info("Args (most have been reloaded from checkpoint): %s" % args)
    else:
        if args.eval_during_training:
            assert args.do_train
        if args.do_train or args.do_pred:
            assert args.dir_output is not None
            if os.path.exists(args.dir_output) and os.path.isdir(args.dir_output) \
                    and len(os.listdir(args.dir_output)) > 1:
                msg = "%s already exists and is not empty" % args.dir_output
                raise ValueError(msg)
            if not os.path.exists(args.dir_output):
                os.makedirs(args.dir_output)
        if args.do_train:
            assert args.dir_train is not None
            train_paths = glob.glob(os.path.join(args.dir_train, "*.train"))
            assert len(train_paths) > 0
            checkpoint_data["initial_args"] = args
        if args.do_train and args.freeze_encoder and not args.no_mlm:
            logger.warning("Setting --no_mlm to True since --freeze_encoder is True, "
                           "therefore doing MLM would be pointless.")
            args.no_mlm = True
        if args.do_eval or args.eval_during_training:
            assert args.path_dev is not None
            assert os.path.exists(args.path_dev)
        if args.do_pred:
            assert args.path_test is not None
            assert os.path.exists(args.path_test)
    if args.grad_accum_steps < 1:
        raise ValueError("Invalid grad_accum_steps parameter: {}, should be >= 1".format(args.grad_accum_steps))

    # Create list of languages we handle
    lang_list = sorted(ALL_LANGS)

    # Seed RNGs
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Load tokenizer
    logger.info("Loading tokenizer...")
    tokenizer_path = os.path.join(args.dir_pretrained_model, "tokenizer.pkl")
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    # Make encoder and model
    logger.info("Making encoder...")
    encoder_config = BertConfig.from_json_file(os.path.join(args.dir_pretrained_model, "config.json"))
    encoder = BertForMaskedLM(encoder_config)
    logger.info("Making model...")
    model = BertForLangID(encoder, lang_list)
    model.to(args.device)

    # Load model weights. First, check if we just have an encoder, or a previously fine-tuned model
    if "classifier.dense.weight" in checkpoint_data["model_state_dict"]:
        if "best_model_state_dict" in checkpoint_data and not args.resume:
            logger.info("Loading model weights from 'best_model_state_dict'")
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            logger.info("Loading model weights from 'model_state_dict'")
            model.load_state_dict(checkpoint_data["model_state_dict"])
    else:
        # Model has not previously been fine-tuned, so we only load encoder weights
        assert args.do_train
        logger.info("Loading encoder weights from 'model_state_dict'")
        model.encoder.load_state_dict(checkpoint_data["model_state_dict"])
    if (args.do_train or args.resume) and args.freeze_encoder:
        model.freeze_encoder()

    # Write encoder config and tokenizer in output directory
    if (not args.resume) and args.do_train:
        path_config = os.path.join(args.dir_output, "config.json")
        model.encoder.config.to_json_file(path_config)
        path_tokenizer = os.path.join(args.dir_output, "tokenizer.pkl")
        with open(path_tokenizer, "wb") as f:
            pickle.dump(tokenizer, f)

    # Log some info on the model
    logger.info("Encoder config: %s" % repr(model.encoder.config))
    logger.info("Model params:")
    for n, p in model.named_parameters():
        msg = " %s" % n
        if not p.requires_grad:
            msg += " ***FROZEN***"
        logger.info(msg)
    logger.info("Nb model params: %d" % count_params(model))
    logger.info("Nb params in encoder: %d" % count_params(model.encoder))
    logger.info("Nb params in pooler: %d" % count_params(model.pooler))
    logger.info("Nb params in classifier: %d" % count_params(model.classifier))

    # Get data
    max_seq_length = args.seq_len + 2  # We add 2 for CLS and SEP
    if args.resume:
        # Reload training dataset(s)
        logger.info("Reloading training data from checkpoint")
        train_dataset = checkpoint_data["train_dataset"]
        train_dataset.prep_files_for_streaming()
        dev_dataset = checkpoint_data.get("dev_dataset", None)
        unk_dataset = checkpoint_data.get("unk_dataset", None)
        if unk_dataset:
            unk_dataset.prep_files_for_streaming()
    elif args.do_train:
        # Remove unk.train if present, and create a MLM dataset for it.
        path_unk = check_for_unk_train_data(train_paths)
        if path_unk is None:
            unk_dataset = None
        else:
            train_paths.remove(path_unk)
            logger.info("Creating MLM-only training set from %s..." % path_unk)
            unk_dataset = BertDatasetForMLM([path_unk], tokenizer, max_seq_length,
                                            sampling_alpha=args.sampling_alpha,
                                            weight_relevant=args.weight_relevant,
                                            encoding="utf-8", seed=args.seed, verbose=DEBUG)
        logger.info("Creating training set from %s training files in %s..." % (len(train_paths), args.dir_train))
        train_dataset = BertDatasetForClassification(train_paths, tokenizer, max_seq_length,
                                                     include_mlm=True,
                                                     sampling_alpha=args.sampling_alpha,
                                                     weight_relevant=args.weight_relevant,
                                                     encoding="utf-8", seed=args.seed, verbose=DEBUG)
        if path_unk is not None:
            assert len(unk_dataset) == len(train_dataset)
        # Check train_dataset.lang2id: keys should contain all langs, and nothing else, like that of the model
        assert train_dataset.lang2id == model.lang2id
    if not args.resume:
        dev_dataset = None
        if args.do_eval or args.eval_during_training:
            logger.info("Loading validation data from %s..." % args.path_dev)
            dev_dataset = BertDatasetForTesting(args.path_dev, tokenizer, model.lang2id, max_seq_length,
                                                require_labels=True, encoding="utf-8", verbose=DEBUG)
            if args.do_train and args.eval_during_training:
                checkpoint_data["dev_dataset"] = dev_dataset
        if args.do_pred:
            logger.info("Loading test data from %s..." % args.path_test)
            test_dataset = BertDatasetForTesting(args.path_test, tokenizer, model.lang2id, max_seq_length,
                                                 require_labels=False, encoding="utf-8", verbose=DEBUG)

    # Compute number of epochs and steps, initialize number of training steps done.
    # (Set max_opt_steps before deriving num_epochs, so a fresh run does not use
    # the stale value from the pre-training checkpoint.)
    num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps
    if args.do_train and (not args.resume):
        checkpoint_data["global_step"] = 0
        checkpoint_data["max_opt_steps"] = args.max_train_steps // args.grad_accum_steps
    args.num_epochs = math.ceil(checkpoint_data["max_opt_steps"] / num_opt_steps_per_epoch)

    # Training
    if args.do_train or args.resume:
        # Prepare optimizer
        logger.info("Preparing optimizer...")
        np_list = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        opt_params = [
            {'params': [p for n, p in np_list if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in np_list if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        if args.equal_betas:
            betas = (0.9, 0.9)
        else:
            betas = (0.9, 0.999)
        # To reproduce BertAdam-specific behaviour, use correct_bias=False
        optimizer = AdamW(opt_params, lr=args.learning_rate, betas=betas, correct_bias=args.correct_bias)

        # Load optimizer state if resuming
        if args.resume:
            optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])

        # Log some info before training
        logger.info("*** Training info: ***")
        logger.info(" Number of training steps completed: %d" % checkpoint_data["global_step"])
        logger.info(" Max training steps: %d" % args.max_train_steps)
        logger.info(" Gradient accumulation steps: %d" % args.grad_accum_steps)
        logger.info(" Max optimization steps: %d" % checkpoint_data["max_opt_steps"])
        logger.info(" Training dataset size: %d" % len(train_dataset))
        logger.info(" Batch size: %d" % args.train_batch_size)
        logger.info(" # training steps/epoch: %d" % args.num_train_steps_per_epoch)
        logger.info(" # optimization steps/epoch: %d" % num_opt_steps_per_epoch)
        logger.info(" # epochs to do: %d" % args.num_epochs)
        if args.eval_during_training:
            logger.info("Validation dataset size: %d" % len(dev_dataset))

        # Run training
        train(model, optimizer, train_dataset, args, checkpoint_data,
              dev_dataset=dev_dataset, unk_dataset=unk_dataset)

        # Reload model
        save_to_dir = args.dir_pretrained_model if args.resume else args.dir_output
        checkpoint_data = torch.load(os.path.join(save_to_dir, "checkpoint.tar"))
        if "best_model_state_dict" in checkpoint_data:
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            model.load_state_dict(checkpoint_data["model_state_dict"])

    # Evaluate model on dev set
    if args.do_eval:
        logger.info("*** Running evaluation... ***")
        scores = evaluate(model, dev_dataset, args)
        logger.info("***** Evaluation Results *****")
        for score_name in sorted(scores.keys()):
            logger.info("- %s: %.4f" % (score_name, scores[score_name]))

    # Get model's predictions on test set
    if args.do_pred:
        logger.info("*** Running prediction... ***")
        logits = predict(model, test_dataset, args)
        pred_class_ids = np.argmax(logits.cpu().numpy(), axis=1)
        pred_labels = [test_dataset.label_list[i] for i in pred_class_ids]
        path_pred = os.path.join(args.dir_output, "pred.txt")
        logger.info("Writing predictions in %s..." % path_pred)
        with open(path_pred, 'w', encoding="utf-8") as f:
            for x in pred_labels:
                f.write("%s\n" % x)
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)
        feature_dirs, label_dirs = get_data(datadir, level, train_dataset, dev_dataset, test_dataset, mode)
        batchedData, maxTimeSteps, totalN = self.load_data(feature_dirs[0], label_dirs[0], mode, level)
        model = model_fn(args, maxTimeSteps)

        # shuffle feature_dirs and label_dirs in the same order
        FL_pair = list(zip(feature_dirs, label_dirs))
        random.shuffle(FL_pair)
        feature_dirs, label_dirs = zip(*FL_pair)
        print("Feature dirs:", feature_dirs)

        for feature_dir, label_dir in zip(feature_dirs, label_dirs):
            id_dir = feature_dirs.index(feature_dir)
            print('dir id:{}'.format(id_dir))
            batchedData, maxTimeSteps, totalN = self.load_data(feature_dir, label_dir, mode, level)
            model = model_fn(args, maxTimeSteps)
            num_params = count_params(model, mode='trainable')
            all_num_params = count_params(model, mode='all')
            model.config['trainable params'] = num_params
            model.config['all params'] = all_num_params
            print(model.config)

            with tf.Session(graph=model.graph) as sess:
                # restore from stored model
                if keep:
                    ckpt = tf.train.get_checkpoint_state(savedir)
                    if ckpt and ckpt.model_checkpoint_path:
                        model.saver.restore(sess, ckpt.model_checkpoint_path)
                        print('Model restored from:' + savedir)
                else:
                    print('Initializing')
                    sess.run(model.initial_op)

                total_cont = 0
                for epoch in range(num_epochs):
                    # training
                    start = time.time()
                    if mode == 'train':
                        print('Epoch {} ...'.format(epoch + 1))
                    batchErrors = np.zeros(len(batchedData))
                    batchRandIxs = np.random.permutation(len(batchedData))
                    for batch, batchOrigI in enumerate(batchRandIxs):
                        batchInputs, batchTargetSparse, batchSeqLengths = batchedData[batchOrigI]
                        batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                        feedDict = {model.inputX: batchInputs,
                                    model.targetIxs: batchTargetIxs,
                                    model.targetVals: batchTargetVals,
                                    model.targetShape: batchTargetShape,
                                    model.seqLengths: batchSeqLengths}
                        _, l, pre, y, er = sess.run([model.optimizer, model.loss, model.predictions,
                                                     model.targetY, model.errorRate],
                                                    feed_dict=feedDict)
                        batchErrors[batch] = er
                        print('\n{} mode, total:{},subdir:{}/{},batch:{}/{},epoch:{}/{},'
                              'train loss={:.3f},mean train CER={:.3f}\n'.format(
                                  level, totalN, id_dir + 1, len(feature_dirs), batch + 1,
                                  len(batchRandIxs), epoch + 1, num_epochs, l, er / batch_size))
                        total_cont += 1
                        if batch % 20 == 0:
                            print('Truth:\n' + output_to_sequence(y, type=level))
                            print('Output:\n' + output_to_sequence(pre, type=level))
                            checkpoint_path = os.path.join(savedir, 'model.ckpt')
                            model.saver.save(sess, checkpoint_path, global_step=total_cont)
                            print('Model has been saved in {}'.format(savedir))
                    end = time.time()
                    delta_time = end - start
                    print('Epoch ' + str(epoch + 1) + ' needs time:' + str(delta_time) + ' s')
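# NOTE (added): a plausible shape for the two-mode count_params used above
# (assumed; the real helper is not shown). 'trainable' counts
# tf.trainable_variables(), 'all' counts every variable in the model's graph.
def count_params(model, mode='trainable'):
    with model.graph.as_default():
        if mode == 'trainable':
            variables = tf.trainable_variables()
        elif mode == 'all':
            variables = tf.global_variables()
        else:
            raise ValueError("mode must be 'trainable' or 'all'")
    return int(sum(np.prod(v.get_shape().as_list()) for v in variables))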
    def train(self, args):
        """Import data, train model, save model."""
        text_parser = TextParser(args.data_dir, args.batch_size, args.seq_length)
        args.vocab_size = text_parser.vocab_size

        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if args.keep is True:
            # check if all necessary files exist
            if os.path.exists(os.path.join(args.save_dir, 'config.pkl')) and \
                    os.path.exists(os.path.join(args.save_dir, 'words_vocab.pkl')) and \
                    ckpt and ckpt.model_checkpoint_path:
                with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
                    saved_model_args = cPickle.load(f)
                with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'rb') as f:
                    saved_words, saved_vocab = cPickle.load(f)
            else:
                raise ValueError("configuration doesn't exist!")

        if args.model == 'seq2seq_rnn':
            model = Model_rnn(args)
        else:
            # TO ADD OTHER MODEL
            pass

        trainable_num_params = count_params(model, mode='trainable')
        all_num_params = count_params(model, mode='all')
        args.num_trainable_params = trainable_num_params
        args.num_all_params = all_num_params
        print(args.num_trainable_params)
        print(args.num_all_params)

        with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(args, f)
        with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
            cPickle.dump((text_parser.vocab_dict, text_parser.vocab_list), f)

        with tf.Session() as sess:
            if args.keep is True:
                print('Restoring')
                model.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('Initializing')
                sess.run(model.initial_op)

            for e in range(args.num_epochs):
                start = time.time()
                # sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
                sess.run(tf.assign(model.lr, args.learning_rate))
                model.initial_state = tf.convert_to_tensor(model.initial_state)
                state = model.initial_state.eval()
                total_loss = []
                for b in range(text_parser.num_batches):
                    x, y = text_parser.next_batch()
                    print('flag')
                    feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                    train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                    total_loss.append(train_loss)
                    print("{}/{} (epoch {}), train_loss = {:.3f}"
                          .format(e * text_parser.num_batches + b,
                                  args.num_epochs * text_parser.num_batches,
                                  e, train_loss))
                    if (e * text_parser.num_batches + b) % args.save_every == 0 or \
                            (e == args.num_epochs - 1 and b == text_parser.num_batches - 1):
                        checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                        model.saver.save(sess, checkpoint_path, global_step=e)
                        print("model has been saved in:" + str(checkpoint_path))
                end = time.time()
                delta_time = end - start
                ave_loss = np.array(total_loss).mean()
                logging(model, ave_loss, e, delta_time, mode='train')
                if ave_loss < 0.5:
                    break
def main():
    args = parse_args()

    # add model name to args
    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H%M'))
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda(args.gpu)
    else:
        criterion = losses.__dict__[args.loss]().cuda(args.gpu)

    cudnn.benchmark = True

    # Data loading code
    train_df = pd.read_csv('input/train.csv')
    img_paths = 'input/train/images/' + train_df['id'].values + '.png'
    mask_paths = 'input/train/masks/' + train_df['id'].values + '.png'

    if args.cv == 'KFold':
        kf = KFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        cv = kf.split(img_paths)
    elif args.cv == 'Cov':
        # stratify folds by mask coverage
        train_df['cov'] = 0
        for i in tqdm(range(len(train_df))):
            mask = imread('input/train/masks/' + train_df['id'][i] + '.png')
            mask = mask.astype('float32') / 255
            train_df.loc[i, 'cov'] = ((np.sum(mask > 0.5) / 101**2) * 10).astype('int')
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        cv = skf.split(img_paths, train_df['cov'])

    for fold, (train_idx, val_idx) in enumerate(cv):
        print('Fold [%d/%d]' % (fold + 1, args.n_splits))

        # create model
        print("=> creating model %s (pretrained=%s)" % (args.arch, str(args.pretrained)))
        model = archs.__dict__[args.arch](args)
        if args.freeze_bn:
            model.freeze_bn()
        if args.gpu is not None:
            model = model.cuda(args.gpu)
        else:
            model = torch.nn.DataParallel(model).cuda()

        print(count_params(model))

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                  lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay, nesterov=args.nesterov)

        if args.scheduler == 'MultiStepLR':
            if args.reduce_epoch is None:
                scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=0.1)
            else:
                scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[args.reduce_epoch], gamma=0.1)
        elif args.scheduler == 'CyclicLR':
            scheduler = CyclicLR(optimizer, step_size=800)
        elif args.scheduler == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=args.min_lr)
        elif args.scheduler == 'StepLR':
            scheduler = lr_scheduler.StepLR(optimizer, 20, gamma=0.5)

        train_img_paths, val_img_paths = img_paths[train_idx], img_paths[val_idx]
        train_mask_paths, val_mask_paths = mask_paths[train_idx], mask_paths[val_idx]

        train_dataset = Dataset(args, train_img_paths, train_mask_paths)
        val_dataset = Dataset(args, val_img_paths, val_mask_paths, False)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                                   shuffle=True, pin_memory=True, drop_last=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size,
                                                 shuffle=False, pin_memory=True, drop_last=False)

        log = pd.DataFrame(index=[],
                           columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        best_loss = float('inf')
        trigger = 0
        for epoch in range(args.epochs):
            if args.scheduler == 'CyclicLR':
                # train for one epoch (CyclicLR steps inside train())
                train_log = train(args, train_loader, model, criterion, optimizer, epoch, scheduler)
            else:
                scheduler.step()
                # train for one epoch
                train_log = train(args, train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            val_log = validate(args, val_loader, model, criterion)

            tmp = pd.Series(
                [epoch, scheduler.get_lr()[0],
                 train_log['loss'], train_log['iou'],
                 val_log['loss'], val_log['iou']],
                index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])
            log = log.append(tmp, ignore_index=True)
            log.to_csv('models/%s/log_%d.csv' % (args.name, fold + 1), index=False)

            trigger += 1

            if val_log['loss'] < best_loss:
                torch.save(model.state_dict(), 'models/%s/model_%d.pth' % (args.name, fold + 1))
                best_loss = val_log['loss']
                print("=> saved best model")
                trigger = 0

            # early stopping
            if args.early_stop is not None:
                if epoch > args.epochs * 0.8 and trigger >= args.early_stop:
                    print("=> early stopping")
                    break

            torch.cuda.empty_cache()
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)  # dotdict behaves like a dict with attribute access
        feature_dirs, label_dirs = get_data(datadir, level, train_dataset, dev_dataset, test_dataset, mode)
        # batchedData, maxTimeSteps, totalN = self.load_data(feature_dirs[0], label_dirs[0], mode, level)
        # model = model_fn(args, maxTimeSteps)
        # (original note: the purpose of these two lines is unclear; the effect of removing them is unknown)

        # shuffle feature_dirs and label_dirs in the same order
        FL_pair = list(zip(feature_dirs, label_dirs))  # zip() returns an iterator; list() materializes it
        random.shuffle(FL_pair)  # shuffle the order of elements in the list
        feature_dirs, label_dirs = zip(*FL_pair)

        for feature_dir, label_dir in zip(feature_dirs, label_dirs):
            id_dir = feature_dirs.index(feature_dir)
            print('dir id:{}'.format(id_dir))
            batchedData, maxTimeSteps, totalN = self.load_data(feature_dir, label_dir, mode, level)
            model = model_fn(args, maxTimeSteps)  # build the neural network graph
            num_params = count_params(model, mode='trainable')
            all_num_params = count_params(model, mode='all')
            model.config['trainable params'] = num_params
            model.config['all params'] = all_num_params
            print(model.config)

            with tf.Session(graph=model.graph, config=config) as sess:
                # restore from stored model (used when retraining, i.e. keep == True)
                if keep:
                    # returns CheckpointState proto from the "checkpoint" file
                    ckpt = tf.train.get_checkpoint_state(savedir)
                    if ckpt and ckpt.model_checkpoint_path:
                        model.saver.restore(sess, ckpt.model_checkpoint_path)
                        print('Model restored from:' + savedir)
                else:
                    print('Initializing')
                    sess.run(model.initial_op)

                for step in range(num_steps):
                    # training
                    start = time.time()
                    if mode == 'train':
                        print('step {} ...'.format(step + 1))
                    batchErrors = np.zeros(len(batchedData))
                    # permutation returns a shuffled copy of its argument
                    batchRandIxs = np.random.permutation(len(batchedData))
                    # enumerate pairs each element with its index, so we get both at once;
                    # this part builds the feed dict for each batch
                    for batch, batchOrigI in enumerate(batchRandIxs):
                        batchInputs, batchTargetSparse, batchSeqLengths = batchedData[batchOrigI]
                        batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                        feedDict = {model.inputX: batchInputs,
                                    model.targetIxs: batchTargetIxs,
                                    model.targetVals: batchTargetVals,
                                    model.targetShape: batchTargetShape,
                                    model.seqLengths: batchSeqLengths}
                        if level == 'cha':
                            if mode == 'train':
                                _, l, pre, y, er = sess.run([model.optimizer, model.loss, model.predictions,
                                                             model.targetY, model.errorRate],
                                                            feed_dict=feedDict)
                                batchErrors[batch] = er
                                print('\n{} mode, total:{},subdir:{}/{},batch:{}/{},step:{},train loss={:.3f},mean '
                                      'train CER={:.3f}, epoch: {}\n'.format(
                                          level, totalN, id_dir + 1, len(feature_dirs), batch + 1,
                                          len(batchRandIxs), step + 1, l, er / batch_size, num_epochs))
                            elif mode == 'dev':
                                l, pre, y, er = sess.run([model.loss, model.predictions, model.targetY,
                                                          model.errorRate],
                                                         feed_dict=feedDict)
                                batchErrors[batch] = er
                                print('\n{} mode, total:{},subdir:{}/{},batch:{}/{},dev loss={:.3f},'
                                      'mean dev CER={:.3f}\n'.format(
                                          level, totalN, id_dir + 1, len(feature_dirs), batch + 1,
                                          len(batchRandIxs), l, er / batch_size))
                            elif mode == 'test':
                                l, pre, y, er = sess.run([model.loss, model.predictions, model.targetY,
                                                          model.errorRate],
                                                         feed_dict=feedDict)
                                batchErrors[batch] = er
                                print('\n{} mode, total:{},subdir:{}/{},batch:{}/{},test loss={:.3f},'
                                      'mean test CER={:.3f}\n'.format(
                                          level, totalN, id_dir + 1, len(feature_dirs), batch + 1,
                                          len(batchRandIxs), l, er / batch_size))
                        elif level == 'seq2seq':
                            raise ValueError('level %s is not supported now' % str(level))

                        # NOTE: purpose unclear in the original
                        # if er / batch_size == 1.0:
                        #     break

                        if batch % 20 == 0:
                            print('Truth:\n' + output_to_sequence(y, type=level))
                            print('Output:\n' + output_to_sequence(pre, type=level))

                        # save the model every 20 training steps, or after the last
                        # batch of the last step
                        if mode == 'train' and ((step * len(batchRandIxs) + batch + 1) % 20 == 0 or
                                                (step == num_steps - 1 and batch == len(batchRandIxs) - 1)):
                            checkpoint_path = os.path.join(savedir, 'model.ckpt')
                            model.saver.save(sess, checkpoint_path, global_step=step)
                            print('Model has been saved in {}'.format(savedir))
                    end = time.time()
                    delta_time = end - start
                    print('subdir ' + str(id_dir + 1) + ' needs time:' + str(delta_time) + ' s')

                    if mode == 'train':
                        if (step + 1) % 1 == 0:
                            checkpoint_path = os.path.join(savedir, 'model.ckpt')
                            model.saver.save(sess, checkpoint_path, global_step=step)
                            print('Model has been saved in {}'.format(savedir))
                        epochER = batchErrors.sum() / totalN
                        # ('epoch' renamed to 'subdir' in the original)
                        print('subdir', id_dir + 1, 'mean train error rate:', epochER)
                        logging(model, logfile, epochER, id_dir, delta_time, mode='config')
                        logging(model, logfile, epochER, id_dir, delta_time, mode=mode)
                    if mode == 'test' or mode == 'dev':
                        with open(os.path.join(resultdir, level + '_result.txt'), 'a') as result:
                            result.write(output_to_sequence(y, type=level) + '\n')
                            result.write(output_to_sequence(pre, type=level) + '\n')
                            result.write('\n')
                        epochER = batchErrors.sum() / totalN
                        print(' test error rate:', epochER)
                        logging(model, logfile, epochER, mode=mode)
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    writer = SummaryWriter('models/%s/test' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    # Data loading code
    img_paths = glob('input/' + args.dataset + '/images/*')
    mask_paths = glob('input/' + args.dataset + '/masks/*')

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.2, random_state=41)

    # create model
    print("=> creating model %s" % args.arch)
    model = archs.__dict__[args.arch](args)
    model = model.cuda()
    # print(type(model))

    ######## model visualization in tensorboard ##############
    dummy_input = torch.rand(1, 3, 256, 256).cuda()
    writer.add_graph(model, (dummy_input,))

    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay, nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=True, pin_memory=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size,
                                             shuffle=False, pin_memory=True, drop_last=False)

    log = pd.DataFrame(index=[],
                       columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # adjust learning rate
        adjust_learning_rate(args, optimizer, epoch, writer)

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f'
              % (train_log['loss'], train_log['iou'], val_log['loss'], val_log['iou']))

        # log to tensorboard
        writer.add_scalar('train_loss', train_log['loss'], epoch)
        writer.add_scalar('train_iou', train_log['iou'], epoch)
        writer.add_scalar('val_loss', val_log['loss'], epoch)
        writer.add_scalar('val_iou', val_log['iou'], epoch)

        tmp = pd.Series(
            [epoch, args.lr,
             train_log['loss'], train_log['iou'],
             val_log['loss'], val_log['iou']],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])
        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # TODO: save model with best dice

        # early stopping
        if args.early_stop is not None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
CHAR_EMBEDDING_MATRIX = model.add_lookup_parameters((num_chars, char_embedding_size))

word_lstm = dy.LSTMBuilder(num_layers, embedding_size, lstm_out, model)
char_fw_lstm = dy.LSTMBuilder(1, char_embedding_size, char_lstm_out, model)
char_bw_lstm = dy.LSTMBuilder(1, char_embedding_size, char_lstm_out, model)

softmax_w = model.add_parameters((num_words, lstm_out))
softmax_b = model.add_parameters((num_words))

params = [CHAR_EMBEDDING_MATRIX, softmax_w, softmax_b]
params.extend(*word_lstm.get_parameters())
params.extend(*char_fw_lstm.get_parameters())
params.extend(*char_bw_lstm.get_parameters())
print("Number of Params: {}".format(count_params(params)))


def word_rep(word, fw_init, bw_init):
    pad = char_vocab.get('<*>')
    indices = [pad] + [char_vocab.get(c) for c in word] + [pad]
    embedded = [CHAR_EMBEDDING_MATRIX[i] for i in indices]
    forward = fw_init.transduce(embedded)
    backward = bw_init.transduce(embedded)
    return dy.concatenate([forward[-1], backward[-1]])


def calc_lm_loss(words):
    dy.renew_cg()
    sm_w = dy.parameter(softmax_w)
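# NOTE (added): a plausible count_params for the DyNet snippet above (assumed):
# sum the element counts of each parameter's shape.
import numpy as np

def count_params(params):
    return int(sum(np.prod(p.shape()) for p in params))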
def evaluate(SMASH, which_dataset, batch_size, seed, validate, num_random,
             num_perturb, num_markov, perturb_prob, arch_SGD, fp16, parallel):
    # Random seeds
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

    num_runs = num_random + num_perturb + num_markov
    random_sample = True
    perturb = False
    markov = False

    net = torch.load('weights/' + SMASH + '.pth')
    net.eval()

    # Backwards compatibility hack; if you're trying to understand this code,
    # ignore this line.
    if not hasattr(net, 'factors'):
        net.factors = factors(net.N)

    _, test_loader = get_data_loader(which_dataset=which_dataset, augment=False,
                                     validate=validate, batch_size=batch_size)

    # Prepare lists that hold errors
    ensemble_err, err, flops, params = [], [], [], []

    # Array to which we save configurations and errors
    save_archs = []

    # Prepare ensemble predictions
    ensemble_out = torch.zeros(len(test_loader.dataset), net.fc.out_features).cuda()

    # Start the stopwatch and begin testing
    start_time = time.time()
    mode = 'training' if net.training else 'testing'
    print('Evaluating %s in %s mode...' % (SMASH, mode))
    for test in range(num_runs):
        # If we've done all our random samples, switch to random perturbation mode
        if test == num_random:
            sorted_archs = sorted(save_archs, key=lambda item: item[-1])
            print('Random sampling complete with best error of %f, starting perturbation...'
                  % (sorted_archs[0][-1]))
            base_arch = sorted_archs[0][:10]
            perturb = True
            random_sample = False
        # If we've done all our perturbations, switch to markov chain mode
        elif test == num_random + num_perturb:
            sorted_archs = sorted(save_archs, key=lambda item: item[-1])
            print('Random perturbation complete with best error of %f, starting markov chain...'
                  % (sorted_archs[0][-1]))
            base_arch = sorted_archs[0][:10]
            current_error = sorted_archs[0][-1]
            markov = True

        # Sample a random architecture, as in training
        if random_sample:
            arch = net.sample_architecture()
        # Slightly change a sampled (and, presumably, high-scoring) architecture
        elif perturb:
            arch = perturb_arch.perturb_architecture(net, deepcopy(base_arch), perturb_prob)

        # Sample weights
        w1x1 = net.sample_weights(*arch)

        # Error counters
        e, ensemble_e = 0, 0

        # Loop over validation set
        for i, (x, y) in enumerate(test_loader):
            # Get outputs
            o = net(V(x.cuda(), volatile=True), w1x1, *arch)
            # Get predictions ensembled across multiple configurations
            ensemble_out[i * batch_size:(i + 1) * batch_size] += o.data
            # Update error
            e += o.data.max(1)[1].cpu().ne(y).sum()
            # Update ensemble error
            ensemble_e += ensemble_out[i * batch_size:(i + 1) * batch_size].max(1)[1].cpu().ne(y).sum()

        # Save ensemble error thus far
        ensemble_err.append(float(ensemble_e) / ensemble_out.size(0))
        # Save individual error thus far
        err.append(float(e) / ensemble_out.size(0))

        # While in markov mode, update the base arch if we get a better SMASH score.
        if markov and err[-1] < float(current_error):
            print('Error of %f superior to error of %f, accepting new architecture...'
                  % (err[-1], current_error))
            base_arch = arch
            current_error = err[-1]

        # Save relevant architectural details along with error
        save_archs.append(arch + (net.N, net.N_max, net.bottleneck, net.max_bottleneck,
                                  net.in_channels, 0, err[-1]))
        params.append(count_params(save_archs[-1]))
        flops.append(count_flops(save_archs[-1], which_dataset))
        print('For run #%d/%d, Individual Error %2.2f Ensemble Err %2.2f, params %e, flops %e, Time Elapsed %d.'
              % (test, num_runs, 100 * err[-1], 100 * ensemble_err[-1],
                 params[-1], flops[-1], time.time() - start_time))

    best_acc = sorted(err)[0]
    worst_acc = sorted(err)[-1]
    least_flops = sorted(flops)[0]
    most_flops = sorted(flops)[-1]
    least_params = sorted(params)[0]
    most_params = sorted(params)[-1]
    print('Best accuracy is ' + str(best_acc) + ', Worst accuracy is ' + str(worst_acc))

    # Save results
    # np.savez(filename[:-4] + '_' + mode + '_errors.npz', **{'err': err, 'ensemble_err': ensemble_err})
    # save_archs = sorted(save_archs, key=lambda item: item[-1])
    np.savez(SMASH + '_archs.npz',
             **{'archs': sorted(save_archs, key=lambda item: item[-1]),
                'unsorted_archs': save_archs})
    def __init__(self, model, vocab):
        assert isinstance(model, dict) or isinstance(model, str)
        assert isinstance(vocab, tuple) or isinstance(vocab, str)

        # dataset
        logger.info('-' * 100)
        logger.info('Loading test dataset')
        self.dataset = data.CodePtrDataset(mode='test')
        self.dataset_size = len(self.dataset)
        logger.info('Size of test dataset: {}'.format(self.dataset_size))
        logger.info('The dataset is successfully loaded')
        # note: collate_fn captures self, so the vocabularies are looked up
        # lazily at batch time, after they are assigned below
        self.dataloader = DataLoader(dataset=self.dataset,
                                     batch_size=config.test_batch_size,
                                     collate_fn=lambda *args: utils.collate_fn(args,
                                                                               source_vocab=self.source_vocab,
                                                                               code_vocab=self.code_vocab,
                                                                               ast_vocab=self.ast_vocab,
                                                                               nl_vocab=self.nl_vocab,
                                                                               raw_nl=True))

        # vocab
        logger.info('-' * 100)
        if isinstance(vocab, tuple):
            logger.info('Vocabularies are passed from parameters')
            assert len(vocab) == 4
            self.source_vocab, self.code_vocab, self.ast_vocab, self.nl_vocab = vocab
        else:
            logger.info('Vocabularies are read from dir: {}'.format(vocab))
            self.source_vocab = utils.load_vocab(vocab, 'source')
            self.code_vocab = utils.load_vocab(vocab, 'code')
            self.ast_vocab = utils.load_vocab(vocab, 'ast')
            self.nl_vocab = utils.load_vocab(vocab, 'nl')

        # vocabulary sizes
        self.source_vocab_size = len(self.source_vocab)
        self.code_vocab_size = len(self.code_vocab)
        self.ast_vocab_size = len(self.ast_vocab)
        self.nl_vocab_size = len(self.nl_vocab)

        logger.info('Size of source vocabulary: {} -> {}'.format(self.source_vocab.origin_size, self.source_vocab_size))
        logger.info('Size of code vocabulary: {} -> {}'.format(self.code_vocab.origin_size, self.code_vocab_size))
        logger.info('Size of ast vocabulary: {}'.format(self.ast_vocab_size))
        logger.info('Size of nl vocabulary: {} -> {}'.format(self.nl_vocab.origin_size, self.nl_vocab_size))
        logger.info('Vocabularies are successfully built')

        # model
        logger.info('-' * 100)
        logger.info('Building model')
        self.model = models.Model(source_vocab_size=self.source_vocab_size,
                                  code_vocab_size=self.code_vocab_size,
                                  ast_vocab_size=self.ast_vocab_size,
                                  nl_vocab_size=self.nl_vocab_size,
                                  is_eval=True,
                                  model=model)
        # model device
        logger.info('Model device: {}'.format(next(self.model.parameters()).device))
        # log model statistics
        logger.info('Trainable parameters: {}'.format(utils.human_format(utils.count_params(self.model))))
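# NOTE (added): a sketch of the utils.human_format helper used above (assumed):
# it renders a large count such as 59337472 as '59.34M'.
def human_format(num):
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '{:.2f}{}'.format(num, ['', 'K', 'M', 'B', 'T'][magnitude])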
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)
        batchedData, maxTimeSteps, totalN = self.load_data(args, mode=mode, type=level)
        model = model_fn(args, maxTimeSteps)

        # count the number of params
        num_params = count_params(model, mode='trainable')
        all_num_params = count_params(model, mode='all')
        model.config['trainable params'] = num_params
        model.config['all params'] = all_num_params
        print(model.config)

        # with tf.Session(graph=model.graph) as sess:
        with tf.Session() as sess:
            # restore from stored model
            if keep == True:
                ckpt = tf.train.get_checkpoint_state(savedir)
                if ckpt and ckpt.model_checkpoint_path:
                    model.saver.restore(sess, ckpt.model_checkpoint_path)
                    print('Model restored from:' + savedir)
            else:
                print('Initializing')
                sess.run(model.initial_op)

            for epoch in range(num_epochs):
                # training
                start = time.time()
                if mode == 'train':
                    print('Epoch', epoch + 1, '...')
                batchErrors = np.zeros(len(batchedData))
                batchRandIxs = np.random.permutation(len(batchedData))
                for batch, batchOrigI in enumerate(batchRandIxs):
                    batchInputs, batchTargetSparse, batchSeqLengths = batchedData[batchOrigI]
                    batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                    feedDict = {model.inputX: batchInputs,
                                model.targetIxs: batchTargetIxs,
                                model.targetVals: batchTargetVals,
                                model.targetShape: batchTargetShape,
                                model.seqLengths: batchSeqLengths}
                    if level == 'cha':
                        if mode == 'train':
                            _, l, pre, y, er = sess.run([model.optimizer, model.loss, model.predictions,
                                                         model.targetY, model.errorRate],
                                                        feed_dict=feedDict)
                            batchErrors[batch] = er
                            print('\n{} mode, total:{},batch:{}/{},epoch:{}/{},'
                                  'train loss={:.3f},mean train CER={:.3f}\n'.format(
                                      level, totalN, batch + 1, len(batchRandIxs),
                                      epoch + 1, num_epochs, l, er / batch_size))
                        elif mode == 'test':
                            l, pre, y, er = sess.run([model.loss, model.predictions, model.targetY,
                                                      model.errorRate],
                                                     feed_dict=feedDict)
                            batchErrors[batch] = er
                            print('\n{} mode, total:{},batch:{}/{},test loss={:.3f},'
                                  'mean test CER={:.3f}\n'.format(
                                      level, totalN, batch + 1, len(batchRandIxs), l, er / batch_size))
                    elif level == 'phn':
                        if mode == 'train':
                            _, l, pre, y = sess.run([model.optimizer, model.loss, model.predictions,
                                                     model.targetY],
                                                    feed_dict=feedDict)
                            er = get_edit_distance([pre.values], [y.values], True, level)
                            print('\n{} mode, total:{},batch:{}/{},epoch:{}/{},'
                                  'train loss={:.3f},mean train PER={:.3f}\n'.format(
                                      level, totalN, batch + 1, len(batchRandIxs),
                                      epoch + 1, num_epochs, l, er))
                            batchErrors[batch] = er * len(batchSeqLengths)
                        elif mode == 'test':
                            l, pre, y = sess.run([model.loss, model.predictions, model.targetY],
                                                 feed_dict=feedDict)
                            er = get_edit_distance([pre.values], [y.values], True, level)
                            print('\n{} mode, total:{},batch:{}/{},test loss={:.3f},'
                                  'mean test PER={:.3f}\n'.format(
                                      level, totalN, batch + 1, len(batchRandIxs), l, er))
                            batchErrors[batch] = er * len(batchSeqLengths)

                    # NOTE:
                    # if er / batch_size == 1.0:
                    #     break

                    if batch % 30 == 0:
                        print('Truth:\n' + output_to_sequence(y, type=level))
                        print('Output:\n' + output_to_sequence(pre, type=level))

                    if mode == 'train' and ((epoch * len(batchRandIxs) + batch + 1) % 20 == 0 or
                                            (epoch == num_epochs - 1 and batch == len(batchRandIxs) - 1)):
                        checkpoint_path = os.path.join(savedir, 'model.ckpt')
                        model.saver.save(sess, checkpoint_path, global_step=epoch)
                        print('Model has been saved in {}'.format(savedir))

                end = time.time()
                delta_time = end - start
                print('Epoch ' + str(epoch + 1) + ' needs time:' + str(delta_time) + ' s')

                if mode == 'train':
                    if (epoch + 1) % 1 == 0:
                        checkpoint_path = os.path.join(savedir, 'model.ckpt')
                        model.saver.save(sess, checkpoint_path, global_step=epoch)
                        print('Model has been saved in {}'.format(savedir))
                    epochER = batchErrors.sum() / totalN
                    print('Epoch', epoch + 1, 'mean train error rate:', epochER)
                    logging(model, logfile, epochER, epoch, delta_time, mode='config')
                    logging(model, logfile, epochER, epoch, delta_time, mode=mode)

                if mode == 'test':
                    with open(os.path.join(resultdir, level + '_result.txt'), 'a') as result:
                        result.write(output_to_sequence(y, type=level) + '\n')
                        result.write(output_to_sequence(pre, type=level) + '\n')
                        result.write('\n')
                    epochER = batchErrors.sum() / totalN
                    print(' test error rate:', epochER)
                    logging(model, logfile, epochER, mode=mode)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model_or_config_file",
        default=None,
        type=str,
        required=True,
        help=("Path of configuration file (if starting from scratch) or directory"
              " containing checkpoint (if resuming) or directory containing a"
              " pretrained model and tokenizer (if re-training)."))

    # Use for resuming from checkpoint
    parser.add_argument("--resume",
                        action='store_true',
                        help="Resume from checkpoint")

    # Required if not resuming
    parser.add_argument(
        "--dir_train_data",
        type=str,
        help="Path of a directory containing training files (names must all match <lang>.train)")
    parser.add_argument(
        "--path_vocab",
        type=str,
        help="Path of a 2-column TSV file containing the vocab of chars and their frequency.")
    parser.add_argument(
        "--output_dir",
        type=str,
        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--mlm_only",
        action="store_true",
        help=("Use only masked language modeling, no sentence pair classification"
              " (e.g. if you only have unk.train in your training directory)"))
    parser.add_argument(
        "--avgpool_for_spc",
        action="store_true",
        help=("Use average pooling of all last hidden states, rather than just the"
              " last hidden state of CLS, to do SPC. Note that in either case, the"
              " pooled vector passes through a square linear layer and a tanh"
              " before the classification layer."))
    parser.add_argument(
        "--sampling_alpha",
        type=float,
        default=1.0,
        help="Dampening factor for relative frequencies used to compute language sampling probabilities")
    parser.add_argument(
        "--weight_relevant",
        type=float,
        default=1.0,
        help="Relative sampling frequency of relevant languages wrt irrelevant languages")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        "--seq_len",
        default=128,
        type=int,
        help="Length of input sequences. Shorter seqs are padded, longer ones are truncated")
    parser.add_argument(
        "--min_freq",
        default=1,
        type=int,
        help="Minimum character frequency. Characters whose frequency is under this threshold will be mapped to <UNK>")
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for AdamW optimizer.")
    parser.add_argument("--equal_betas",
                        action='store_true',
                        help="Use beta1=beta2=0.9 for AdamW optimizer.")
    parser.add_argument(
        "--correct_bias",
        action='store_true',
        help="Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly).")
    parser.add_argument(
        "--max_train_steps",
        default=1000000,
        type=int,
        help="Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps.")
    parser.add_argument(
        "--num_train_steps_per_epoch",
        default=1000,
        type=int,
        help="Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps.")
    parser.add_argument(
        "--num_warmup_steps",
        default=10000,
        type=int,
        help="Number of optimization steps (i.e. training steps / accumulation steps) to perform linear learning rate warmup for.")
    parser.add_argument(
        '--grad_accum_steps',
        type=int,
        default=1,
        help="Number of training steps (i.e. batches) to accumulate before performing a backward/update pass.")
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=-1,
        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # These args are required if we are not resuming from checkpoint
    if not args.resume:
        assert args.dir_train_data is not None
        assert args.path_vocab is not None
        assert args.output_dir is not None

    # Check whether we are starting from scratch, resuming from a checkpoint,
    # or retraining a pretrained model
    from_scratch = (not args.resume) and (not os.path.isdir(args.bert_model_or_config_file))
    retraining = (not args.resume) and (not from_scratch)

    # Load config. Load or create checkpoint data.
    if from_scratch:
        logger.info("***** Starting pretraining job from scratch *******")
        config = BertConfig.from_json_file(args.bert_model_or_config_file)
        checkpoint_data = {}
    elif retraining:
        logger.info("***** Starting pretraining job from pre-trained model *******")
        logger.info("Loading pretrained model...")
        model = BertModelForMaskedLM.from_pretrained(args.bert_model_or_config_file)
        config = model.config
        checkpoint_data = {}
    elif args.resume:
        logger.info("***** Resuming pretraining job *******")
        logger.info("Loading checkpoint...")
        checkpoint_path = os.path.join(args.bert_model_or_config_file, "checkpoint.tar")
        checkpoint_data = torch.load(checkpoint_path)
        # Make sure we haven't already done the maximum number of optimization steps
        if checkpoint_data["global_step"] >= checkpoint_data["max_opt_steps"]:
            msg = "We have already done %d optimization steps." % checkpoint_data["global_step"]
            raise RuntimeError(msg)
        logger.info("Resuming from global step %d" % checkpoint_data["global_step"])
        # Replace args with initial args for this job, except for num_gpus,
        # seed and model directory
        current_num_gpus = args.num_gpus
        current_seed = args.seed
        checkpoint_dir = args.bert_model_or_config_file
        args = deepcopy(checkpoint_data["initial_args"])
        args.num_gpus = current_num_gpus
        args.seed = current_seed
        args.bert_model_or_config_file = checkpoint_dir
        args.resume = True
        logger.info("Args (most have been reloaded from checkpoint): %s" % args)
        # Load config
        config_path = os.path.join(args.bert_model_or_config_file, "config.json")
        config = BertConfig.from_json_file(config_path)

    # Check args
    assert args.sampling_alpha >= 0 and args.sampling_alpha <= 1
    assert args.weight_relevant > 0
    if args.grad_accum_steps < 1:
        raise ValueError("Invalid grad_accum_steps parameter: {}, should be >= 1".format(args.grad_accum_steps))
    train_paths = glob.glob(os.path.join(args.dir_train_data, "*.train"))
    assert len(train_paths) > 0
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if (not args.resume) and len(os.listdir(args.output_dir)) > 0:
        msg = "Directory %s is not empty" % args.output_dir
        raise ValueError(msg)

    # Make or load tokenizer
    if args.resume or retraining:
        logger.info("Loading tokenizer...")
        tokenizer_path = os.path.join(args.bert_model_or_config_file, "tokenizer.pkl")
        with open(tokenizer_path, "rb") as f:
            tokenizer = pickle.load(f)
    elif from_scratch:
        logger.info("Making tokenizer...")
        assert os.path.exists(args.path_vocab)
        tokenizer = CharTokenizer(args.path_vocab)
        if args.min_freq > 1:
            tokenizer.trim_vocab(args.min_freq)
        # Adapt vocab size in config
        config.vocab_size = len(tokenizer.vocab)
        # Save tokenizer
        fn = os.path.join(args.output_dir, "tokenizer.pkl")
        with open(fn, "wb") as f:
            pickle.dump(tokenizer, f)
    logger.info("Size of vocab: {}".format(len(tokenizer.vocab)))

    # Copy config in output directory
    if not args.resume:
        config_path = os.path.join(args.output_dir, "config.json")
        config.to_json_file(config_path)

    # What GPUs do we use?
    if args.num_gpus == -1:
        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
        device_ids = None
    else:
        args.device = torch.device("cuda" if torch.cuda.is_available() and args.num_gpus > 0 else "cpu")
        args.n_gpu = args.num_gpus
        if args.n_gpu > 1:
            device_ids = list(range(args.n_gpu))
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Seed RNGs
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    if from_scratch or args.resume:
        model = BertForMaskedLM(config)
        if args.resume:
            model.load_state_dict(checkpoint_data["model_state_dict"])
    model.to(args.device)

    # Prepare pooler (if we are doing SPC)
    if args.mlm_only:
        pooler = None
    else:
        pooler = Pooler(model.config.hidden_size, cls_only=(not args.avgpool_for_spc))
        if args.resume:
            pooler.load_state_dict(checkpoint_data["pooler_state_dict"])
        pooler.to(args.device)

    # Distributed or parallel?
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed training.")
        model = DDP(model)
        if not args.mlm_only:
            pooler = DDP(pooler)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)
        pooler = torch.nn.DataParallel(pooler, device_ids=device_ids)

    # Log some info on the model
    logger.info("Model config: %s" % repr(model.config))
    logger.info("Nb params: %d" % count_params(model))
    if not args.mlm_only:
        logger.info("Nb params in pooler: %d" % count_params(pooler))

    # Check if there is unk training data.
    path_unk = check_for_unk_train_data(train_paths)

    # Get training data
    max_seq_length = args.seq_len + 2  # We add 2 for CLS and SEP
    logger.info("Preparing dataset using data from %s" % args.dir_train_data)
    if args.mlm_only:
        # We only want to do MLM
        train_dataset_spc = None
        train_dataset_mlm = BertDatasetForMLM(train_paths,
                                              tokenizer,
                                              max_seq_length,
                                              sampling_alpha=args.sampling_alpha,
                                              weight_relevant=args.weight_relevant,
                                              encoding="utf-8",
                                              seed=args.seed)
    else:
        # We want to do SPC and MLM. If unk data is present, we remove
        # it from the paths provided to BertLabeledDataset.
        if path_unk is not None:
            train_paths.remove(path_unk)
        train_dataset_spc = BertDatasetForSPCAndMLM(train_paths,
                                                    tokenizer,
                                                    max_seq_length,
                                                    sampling_alpha=args.sampling_alpha,
                                                    weight_relevant=args.weight_relevant,
                                                    encoding="utf-8",
                                                    seed=args.seed)
        if path_unk is None:
            train_dataset_mlm = None
        else:
            # In this case we use a BertDatasetForMLM for the unk data.
            # Both datasets will be of the same size. The latter is used
            # for MLM only.
            train_dataset_mlm = BertDatasetForMLM([path_unk],
                                                  tokenizer,
                                                  max_seq_length,
                                                  sampling_alpha=args.sampling_alpha,
                                                  weight_relevant=args.weight_relevant,
                                                  encoding="utf-8",
                                                  seed=args.seed)
            assert len(train_dataset_spc) == len(train_dataset_mlm)

    # Check length of dataset
    dataset_length = len(train_dataset_spc) if train_dataset_spc is not None else len(train_dataset_mlm)

    # Store optimization steps performed and maximum number of optimization steps
    if not args.resume:
        checkpoint_data["global_step"] = 0
        checkpoint_data["max_opt_steps"] = args.max_train_steps // args.grad_accum_steps

    # Compute number of optimization steps per epoch
    num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps

    # Compute number of epochs necessary to reach the maximum number of optimization steps
    opt_steps_left = checkpoint_data["max_opt_steps"] - checkpoint_data["global_step"]
    args.num_epochs = math.ceil(opt_steps_left / num_opt_steps_per_epoch)

    # Log some info before training
    logger.info("*** Training info: ***")
    logger.info("Max training steps: %d" % args.max_train_steps)
    logger.info("Gradient accumulation steps: %d" % args.grad_accum_steps)
    logger.info("Max optimization steps: %d" % checkpoint_data["max_opt_steps"])
    if args.resume:
        logger.info("Nb optimization steps done so far: %d" % checkpoint_data["global_step"])
    logger.info("Total dataset size: %d examples" % dataset_length)
    logger.info("Batch size: %d" % args.train_batch_size)
    logger.info("# training steps/epoch: %d" % args.num_train_steps_per_epoch)
    logger.info("# optimization steps/epoch: %d" % num_opt_steps_per_epoch)
    logger.info("# epochs to do: %d" % args.num_epochs)

    # Prepare optimizer
    logger.info("Preparing optimizer...")
    np_list = list(model.named_parameters())
    if not args.mlm_only:
        np_list += list(pooler.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    opt_params = [{
        'params': [p for n, p in np_list if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in np_list if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.equal_betas:
        betas = (0.9, 0.9)
    else:
        betas = (0.9, 0.999)
    # To reproduce BertAdam-specific behaviour, use correct_bias=False
    optimizer = AdamW(opt_params,
                      lr=args.learning_rate,
                      betas=betas,
                      correct_bias=args.correct_bias)
    if args.resume:
        optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])

    # Prepare scheduler
    logger.info("Preparing learning rate scheduler...")
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=checkpoint_data["max_opt_steps"])
    if args.resume:
        scheduler.load_state_dict(checkpoint_data["scheduler_state_dict"])
    logger.info("Current learning rate: %f" % scheduler.get_last_lr()[0])

    # Save initial training args
    if not args.resume:
        checkpoint_data["initial_args"] = args

    # Prepare training log
    time_str = datetime.now().strftime("%Y%m%d%H%M%S")
    train_log_path = os.path.join(args.output_dir, "%s.train.log" % time_str)
    args.train_log_path = train_log_path

    # Train
    if args.mlm_only:
        train(model, None, tokenizer, optimizer, scheduler, train_dataset_mlm,
              args, checkpoint_data, extra_mlm_dataset=None)
    else:
        train(model, pooler, tokenizer, optimizer, scheduler, train_dataset_spc,
              args, checkpoint_data, extra_mlm_dataset=train_dataset_mlm)
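# For reference (not part of the script above): get_linear_schedule_with_warmup
# scales the base learning rate by a factor that rises linearly from 0 to 1
# over num_warmup_steps, then falls linearly back to 0 at num_training_steps.
# A self-contained sketch of that factor:
def linear_warmup_factor(step, num_warmup_steps, num_training_steps):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - step) /
               max(1, num_training_steps - num_warmup_steps))

# e.g. with num_warmup_steps=10000 and num_training_steps=1000000:
#   linear_warmup_factor(5000, 10000, 1000000)   -> 0.5  (mid-warmup)
#   linear_warmup_factor(10000, 10000, 1000000)  -> 1.0  (peak learning rate)
#   linear_warmup_factor(505000, 10000, 1000000) -> 0.5  (mid-decay)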
# NOTE: fragment shown without its enclosing context. The run loop and the
# accumulator initialisation below are reconstructed so that the
# `except KeyboardInterrupt` has a matching `try`; the body is unchanged.
total_loss = total_acc = total_time = 0.0
for run in range(1, args.runs + 1):
    try:
        run_tstart = time.time()
        for epoch in range(args.epochs):
            train(model, optimizer, epoch)
        run_time = time.time() - run_tstart
        run_loss, run_acc = test(model, optimizer)
        if args.runs > 1:
            print("Run #{run} Test -- time: {time}s acc: {acc:.2f}%".format(
                run=run, time=run_time, acc=100 * run_acc),
                  flush=True)
    except KeyboardInterrupt:
        args.runs = run
        break
    total_loss += run_loss
    total_acc += run_acc
    total_time += run_time

total_loss, total_acc, total_time = map(lambda x: x / args.runs,
                                        [total_loss, total_acc, total_time])
print("Optimization on dataset \"{dataset}\" Finished!".format(dataset=args.dataset))
print("#Parameters: {param_count}".format(param_count=count_params(model)))
print("Average time elapsed: {:.4f}s".format(total_time))

# Testing
print("Test set results:",
      "avg loss= {:.4f}".format(total_loss),
      "avg accuracy= {:.4f}".format(total_acc))
def train():
    tf.global_variables_initializer().run()
    could_load, checkpoint_counter = load()
    if could_load:
        start_epoch = int(checkpoint_counter / num_batches)
        start_batch_id = checkpoint_counter - start_epoch * num_batches
        counter = checkpoint_counter
        print("Checkpoint load succeeded")
    else:
        start_epoch = 0
        start_batch_id = 0
        counter = 1
        print("Training from scratch...")

    train_iter = []
    train_loss = []
    utils.count_params()
    print("Total images: {}".format(len(train_img)))
    print("Total epochs: {}".format(args.num_epochs))
    print("Batch size: {}".format(args.batch_size))
    print("Learning rate: {}".format(args.learning_rate))
    print("Checkpoint step: {}".format(args.checkpoint_step))
    print("Data augmentation:")
    print("h_flip: {}".format(args.h_flip))
    print("v_flip: {}".format(args.v_flip))
    print("rotate: {}".format(args.rotation))
    print("clip size: {}".format(args.clip_size))

    for i in range(start_epoch, args.num_epochs):
        id_list = np.random.permutation(len(train_img))
        epoch_time = time.time()
        for j in range(start_batch_id, num_batches):
            img_d = []
            lab_d = []
            for ind in range(args.batch_size):
                id = id_list[j * args.batch_size + ind]
                img_d.append(train_img[id])
                lab_d.append(train_label[id])
            x_batch, y_batch = load_batch(img_d, lab_d, args)
            feed_dict = {img: x_batch, label: y_batch}
            loss_tmp = []
            _, loss, pred1 = sess.run(
                [train_step, sigmoid_cross_entropy_loss, pred],
                feed_dict=feed_dict)
            loss_tmp.append(loss)
            if counter % 100 == 0:
                tmp = np.mean(loss_tmp)
                train_iter.append(counter)
                train_loss.append(tmp)
                print('Epoch', i, '|Iter', counter, '|Loss', tmp)
            counter += 1
        start_batch_id = 0
        print('Time:', time.time() - epoch_time)

        # if (i + 1) % 10 == 0:  # decay lr from epoch 10, every 10 epochs, by 0.1
        #     learning_rate = 0.1 * learning_rate
        # last_checkpoint_name = "checkpoint/latest_model_epoch_" + "_pspet.ckpt"
        # print("Saving latest checkpoint")
        # saver.save(sess, last_checkpoint_name)

        if (i + 1) % args.checkpoint_step == 0:  # save checkpoints at epochs 20, 30, 40, 50, ...
            args.learning_rate = 0.1 * args.learning_rate
            print(args.learning_rate)
            saver.save(sess, './checkpoint/model.ckpt', global_step=counter,
                       write_meta_graph=True)

        # host = host_subplot(111)
        # plt.subplots_adjust(right=0.8)
        # p1, = host.plot(train_iter, train_loss, label="training loss")
        # host.legend(loc=5)
        # host.axis["left"].label.set_color(p1.get_color())
        # host.set_xlim([0, counter])
        # plt.draw()
        # plt.show()

        fig1, ax1 = plt.subplots(figsize=(11, 8))
        ax1.plot(train_iter, train_loss)
        ax1.set_title("Training loss vs Iter")
        ax1.set_xlabel("Iter")
        ax1.set_ylabel("Training loss")
        plt.savefig('Training loss_vs_Iter.png')
        plt.clf()

        remain_time = (args.num_epochs - 1 - i) * (time.time() - epoch_time)
        m, s = divmod(remain_time, 60)
        h, m = divmod(m, 60)
        print("Remaining training time = %d hours %d minutes %d seconds\n" % (h, m, s))
def num_params(self):
    return count_params(self)
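# The PyTorch call sites in this corpus (count_params(model), count_params(self))
# suggest a helper that sums parameter element counts. A minimal sketch under
# that assumption, with the optional mode switch some call sites use:
import torch.nn as nn

def count_params(module, mode='all'):
    if mode == 'trainable':
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    return sum(p.numel() for p in module.parameters())

# Consistent with the unit test further down: nn.Linear(123, 42) has
# 123 * 42 weights plus 42 biases, i.e. 5208 parameters.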
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)
    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    DATA_PATH = '../../Datasets/'
    img_paths = []
    mask_paths = []
    for class_folder in os.listdir(DATA_PATH):
        FOLDER_PATH = os.path.join(DATA_PATH, class_folder)
        for patient_folder in os.listdir(FOLDER_PATH):
            patient_folder = os.path.join(FOLDER_PATH, patient_folder)
            if os.path.isdir(patient_folder):
                if os.path.isfile(os.path.join(patient_folder, 'AP/Ap_Pedicle.png')):
                    mask_paths.append(os.path.join(patient_folder, 'AP/Ap_Pedicle.png'))
                    img_paths.append(os.path.join(patient_folder, "AP.jpg"))

    c = list(zip(img_paths, mask_paths))
    random.shuffle(c)
    img_paths, mask_paths = zip(*c)
    img_paths = np.array(img_paths)
    mask_paths = np.array(mask_paths)

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.05, random_state=41)

    # create model
    print("=> creating model %s" % args.arch)
    model = archs.__dict__[args.arch](args)
    model = model.cuda()
    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             drop_last=False)

    log = pd.DataFrame(index=[],
                       columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'], val_log['iou']))

        tmp = pd.Series(
            [epoch, args.lr, train_log['loss'], train_log['iou'],
             val_log['loss'], val_log['iou']],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])
        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1
        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), './models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if args.early_stop is not None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
import data
import models
import utils

import torch.nn as nn
import torch.optim as optim

from deepsense import neptune

ctx = neptune.Context()
model_name = ctx.params['model']
epochs = ctx.params['epochs']
learning_rate = ctx.params['learning_rate']
ctx.tags.append(model_name)

# data
dataloaders = data.get_dataloaders('/input', batch_size=128)

# network
model = models.MODELS[model_name]
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# size_average=False sums the loss over the batch instead of averaging it
# (spelled reduction='sum' in newer PyTorch versions)
criterion = nn.CrossEntropyLoss(size_average=False)

print("Network created. Number of parameters:")
print(utils.count_params(model))

# training
trained_model = utils.train_model(model, criterion, optimizer, dataloaders,
                                  num_epochs=epochs)
utils.save_all(trained_model)
def _spec(net, xentPerExample, is_accum=False, nohess=False, randvec=False):
    """Returns the principal eigenvalue of the hessian."""

    if nohess:
        net.valtotEager = net.bzEager = net.valEager = \
            net.valtotAccum = net.bzAccum = net.valAccum = tf.constant(0, tf.float32)
        net.projvec = net.projvec_op = net.projvec_corr = tf.constant(0, tf.float32)
        return

    batchsize = tf.shape(xentPerExample)[0]
    xent = tf.reduce_sum(xentPerExample)

    # decide weights from which to compute the spectral radius
    print('Number of trainable weights: ' + str(utils.count_params(tf.trainable_variables())))
    if not net.args.specreg_bn:  # don't include batch norm weights
        net.regularizable = []
        for var in tf.trainable_variables():
            if var.op.name.find('logit/dense/kernel') > -1 or var.op.name.find(r'DW') > -1:
                net.regularizable.append(var)
        print('Number of regularizable weights: ' + str(utils.count_params(net.regularizable)))
    else:
        net.regularizable = tf.trainable_variables()  # do include bn weights
        print('Still zeroing out bias and bn variables in hessian calculation in utils.filtnorm function')

    # create initial projection vector (random and normalized)
    projvec_init = [np.random.randn(*r.get_shape().as_list()) for r in net.regularizable]
    magnitude = np.sqrt(np.sum([np.sum(p**2) for p in projvec_init]))
    projvec_init = [p / magnitude for p in projvec_init]

    # projection vector tensor variable
    net.count = net.count + 1 if hasattr(net, 'count') else 0
    with tf.variable_scope('projvec/' + str(net.count)):
        net.projvec = [tf.get_variable(name=r.op.name,
                                       dtype=tf.float32,
                                       shape=r.get_shape(),
                                       trainable=False,
                                       initializer=tf.constant_initializer(p))
                       for r, p in zip(net.regularizable, projvec_init)]

    # compute filter normalization
    print('normalization scheme: ' + net.args.normalizer)
    if net.args.normalizer is None or net.args.normalizer == 'None':
        projvec_mul_normvalues = net.projvec
    else:
        if net.args.normalizer == 'filtnorm':
            normalizer = utils.filtnorm
        elif net.args.normalizer == 'layernorm':
            normalizer = utils.layernorm
        elif net.args.normalizer == 'layernormdev':
            normalizer = utils.layernormdev
        net.normvalues = normalizer(net.regularizable)
        projvec_mul_normvalues = [n * p for n, p in zip(net.normvalues, net.projvec)]

    # get gradient of loss wrt the selected weights
    tstart = time.time()
    gradLoss = tf.gradients(xent, net.regularizable)
    print('Built gradLoss: ' + str(time.time() - tstart) + ' s')

    # get hessian-vector product
    tstart = time.time()
    hessVecProd = tf.gradients(gradLoss, net.regularizable, projvec_mul_normvalues)
    # hessVecProd = [h*n for h,n in zip(hessVecProd, net.normvalues)]
    print('Built hessVecProd: ' + str(time.time() - tstart) + ' s')

    # build graph for full-batch hessian calculations which require accum ops
    # and storage variables (for validation)
    if is_accum:
        # create op to accumulate gradients
        with tf.variable_scope('accum'):
            hessvecprodAccum = [tf.Variable(tf.zeros_like(h), trainable=False, name=h.op.name)
                                for h in hessVecProd]
            batchsizeAccum = tf.Variable(0, trainable=False, name='batchsizeAccum')
            net.zero_op = [a.assign(tf.zeros_like(a)) for a in hessvecprodAccum] + \
                          [batchsizeAccum.assign(0)]
            net.accum_op = [a.assign_add(g) for a, g in zip(hessvecprodAccum, hessVecProd)] + \
                           [batchsizeAccum.assign_add(batchsize)]

        # compute the next projection vector using the accumulated hvps
        nextProjvec = compute_nextProjvec(net.projvec, hessvecprodAccum,
                                          net.projvec_beta, randvec=randvec)
        print('nextProjvec using accumed hvp')

        # hooks for total eigenvalue, batch size, and eigenvalue
        net.valtotAccum = utils.list2dotprod(net.projvec, hessvecprodAccum)
        net.bzAccum = tf.to_float(batchsizeAccum)
        net.valAccum = net.valtotAccum / net.bzAccum

    # build graph for on-the-fly per-batch hessian calculations (for training)
    else:
        # compute the next projection vector using the instantaneous hvp
        nextProjvec = compute_nextProjvec(net.projvec, hessVecProd,
                                          net.projvec_beta, randvec=randvec)
        print('nextProjvec using instant hvp and randvec is', randvec)

        # hooks for total eigenvalue, batch size, and eigenvalue
        net.valtotEager = utils.list2dotprod(net.projvec, hessVecProd)
        net.bzEager = tf.to_float(batchsize)
        net.valEager = net.valtotEager / net.bzEager

    # dot product (correlation) of the new projection vector with the previous one
    net.projvec_corr = utils.list2dotprod(nextProjvec, net.projvec)

    # op to assign the new projection vector for the next iteration
    with tf.control_dependencies([net.projvec_corr]):
        with tf.variable_scope('projvec_op'):
            net.projvec_op = [tf.assign(p, n) for p, n in zip(net.projvec, nextProjvec)]
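# _spec above builds one step of power iteration on the Hessian: a
# hessian-vector product obtained by differentiating the gradient a second
# time (tf.gradients(gradLoss, ..., projvec)), then a renormalized projection
# vector for the next iteration. For reference, the same double-backprop trick
# in PyTorch, as an illustrative sketch (not the author's code):
import torch

def top_hessian_eigenvalue(loss, params, iters=20):
    grads = torch.autograd.grad(loss, params, create_graph=True)
    # random unit starting vector, one block per parameter tensor
    v = [torch.randn_like(p) for p in params]
    norm = torch.sqrt(sum((x ** 2).sum() for x in v))
    v = [x / norm for x in v]
    eigval = None
    for _ in range(iters):
        # Hessian-vector product: differentiate <grad, v> w.r.t. the params
        hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)
        # Rayleigh quotient v^T H v (v is unit-norm)
        eigval = sum((x * y).sum() for x, y in zip(v, hv))
        norm = torch.sqrt(sum((x ** 2).sum() for x in hv))
        v = [x / norm for x in hv]
    return eigval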
def train(self, args):
    '''import data, train model, save model'''
    args.data_dir = args.data_dir + args.style + '/'
    args.save_dir = args.save_dir + args.style + '/'
    print(args)
    if args.attention is True:
        print('attention mode')

    text_parser = TextParser(args)
    args.vocab_size = text_parser.vocab_size

    if args.pretrained is True:
        raise ValueError("pretrained mode has a bug at the moment, so don't set it to True!")
        # unreachable while pretrained mode is disabled above
        if args.keep is False:
            raise ValueError('when pretrained is True, keep must be True!')
        print("pretrained and keep mode...")
        print("restoring pretrained model file")
        ckpt = tf.train.get_checkpoint_state("/home/pony/github/jaylyrics_generation_tensorflow/data/pre-trained/")
        if os.path.exists(os.path.join("./data/pre-trained/", 'config.pkl')) and \
                os.path.exists(os.path.join("./data/pre-trained/", 'words_vocab.pkl')) and \
                ckpt and ckpt.model_checkpoint_path:
            with open(os.path.join("./data/pre-trained/", 'config.pkl'), 'rb') as f:
                saved_model_args = cPickle.load(f)
            with open(os.path.join("./data/pre-trained/", 'words_vocab.pkl'), 'rb') as f:
                saved_words, saved_vocab = cPickle.load(f)
        else:
            raise ValueError("configuration doesn't exist!")
    else:
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if args.keep is True and args.pretrained is False:
            # check if all necessary files exist
            if os.path.exists(os.path.join(args.save_dir, 'config.pkl')) and \
                    os.path.exists(os.path.join(args.save_dir, 'words_vocab.pkl')) and \
                    ckpt and ckpt.model_checkpoint_path:
                with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
                    saved_model_args = cPickle.load(f)
                with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'rb') as f:
                    saved_words, saved_vocab = cPickle.load(f)
            else:
                raise ValueError("configuration doesn't exist!")

    if args.model == 'seq2seq_rnn':
        model = Model_rnn(args)
    else:
        # TODO: add other models
        pass

    trainable_num_params = count_params(model, mode='trainable')
    all_num_params = count_params(model, mode='all')
    args.num_trainable_params = trainable_num_params
    args.num_all_params = all_num_params

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((text_parser.vocab_dict, text_parser.vocab_list), f)

    with tf.Session() as sess:
        if args.keep is True:
            print('Restoring')
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Initializing')
            sess.run(model.initial_op)
        sess.run(tf.assign(model.lr, args.learning_rate))

        for e in range(args.num_epochs):
            start = time.time()
            model.initial_state = tf.convert_to_tensor(model.initial_state)
            state = model.initial_state.eval()
            total_loss = []
            for b in range(text_parser.num_batches):
                x, y = text_parser.next_batch()
                if args.attention is True:
                    attention_states = sess.run(
                        tf.truncated_normal(
                            [args.batch_size, model.attn_length, model.attn_size],
                            stddev=0.1, dtype=tf.float32))
                    feed = {model.input_data: x,
                            model.targets: y,
                            model.initial_state: state,
                            model.attention_states: attention_states}
                else:
                    feed = {model.input_data: x,
                            model.targets: y,
                            model.initial_state: state}
                train_loss, state, _ = sess.run(
                    [model.cost, model.final_state, model.train_op], feed)
                total_loss.append(train_loss)
                print("{}/{} (epoch {}), train_loss = {:.3f}".format(
                    e * text_parser.num_batches + b,
                    args.num_epochs * text_parser.num_batches,
                    e, train_loss))
                if (e * text_parser.num_batches + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    model.saver.save(sess, checkpoint_path, global_step=e)
                    print("model has been saved in: " + str(checkpoint_path))

            end = time.time()
            delta_time = end - start
            ave_loss = np.array(total_loss).mean()
            logging(model, ave_loss, e, delta_time, mode='train')
            if ave_loss < 0.1:
                checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                model.saver.save(sess, checkpoint_path, global_step=e)
                print("model has been saved in: " + str(checkpoint_path))
                break
if args.class_balancing:  # branch head reconstructed; the original fragment begins mid-branch
    losses = unweighted_loss * class_weights
else:
    if args.loss_func == "cross_entropy":
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=network,
                                                         labels=net_output)
    elif args.loss_func == "lovasz":
        losses = utils.lovasz_softmax(probas=network, labels=net_output)
loss = tf.reduce_mean(losses)

opt = tf.train.AdamOptimizer(0.0001).minimize(
    loss, var_list=[var for var in tf.trainable_variables()])

saver = tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())

utils.count_params()

# If a pre-trained ResNet is required, load the weights.
# This must be done AFTER the variables are initialized with
# sess.run(tf.global_variables_initializer())
if init_fn is not None:
    init_fn(sess)

# Load a previous checkpoint if desired
model_checkpoint_name = "checkpoints/latest_model_" + args.model + "_" + args.dataset + ".ckpt"
if args.continue_training or not args.mode == "train":
    print('Loading latest model checkpoint')
    saver.restore(sess, model_checkpoint_name)

avg_scores_per_epoch = []

# Load the data
tf.summary.scalar("lambda", decayed_reg)
summary_op = tf.summary.merge_all()

name = str(lr) + '_' + str(bs) + '_' + str(nn)
train_writer = tf.summary.FileWriter(tensorboard_log + name + '/train/',
                                     graph=sess.graph)
val_writer = tf.summary.FileWriter(tensorboard_log + name + '/val/')

# Initialize all variables
sess.run(tf.global_variables_initializer())

# Load the pretrained weights into the non-trainable layers
model_rgb.load_params(sess, params_dir_rgb, trainable=False)
model_depth.load_params(sess, params_dir_depth, trainable=False)

print("\nHyper-parameters: lr={}, #neurons={}, bs={}, l2={}, max_norm={}, dropout_rate={}"
      .format(lr, nn, bs, aa, mn, do))
print("Number of trainable parameters = {}".format(
    count_params(trainable_variables_rnn) + count_params(trainable_variables_conv1x1)))

print("\n{} Generate features from training set".format(datetime.now()))

tb_train_count = 0
tb_val_count = 0

# Loop over number of epochs
num_samples = 0

# Training set
sess.run(training_init_op)

# Progress bar setting
bar = progressbar.ProgressBar(maxval=tr_batches_per_epoch,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                       progressbar.Percentage()])
def test_count_params(self):
    linear = nn.Linear(123, 42)
    n_weights = 123 * 42
    n_bias = 42
    n_total = n_weights + n_bias
    self.assertEqual(n_total, count_params(linear))
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)
    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    DATA_PATH = '../../Datasets/'
    img_paths = []
    mask_paths = []
    for class_folder in os.listdir(DATA_PATH):
        FOLDER_PATH = os.path.join(DATA_PATH, class_folder)
        for patient_folder in os.listdir(FOLDER_PATH):
            patient_folder = os.path.join(FOLDER_PATH, patient_folder)
            if os.path.isdir(patient_folder):
                if os.path.isfile(os.path.join(patient_folder, 'LAT/Lat_Vertebra.png')):
                    mask_paths.append(os.path.join(patient_folder, 'LAT/Lat_Vertebra.png'))
                    img_paths.append(os.path.join(patient_folder, "LAT.jpg"))

    c = list(zip(img_paths, mask_paths))
    random.shuffle(c)
    img_paths, mask_paths = zip(*c)
    img_paths = np.array(img_paths)
    mask_paths = np.array(mask_paths)

    k = 10
    kf = KFold(n_splits=k)
    fold_num = 0
    mean_ious = []
    for train_index, test_index in kf.split(img_paths):
        train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
            train_test_split(img_paths[train_index], mask_paths[train_index],
                             test_size=0.08, random_state=41)

        # create model
        print("=> creating model %s for fold %s" % (args.arch, fold_num))
        fold_num += 1
        model = archs.__dict__[args.arch](args)
        model = model.cuda()
        print(count_params(model))

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=args.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
                nesterov=args.nesterov)

        train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
        val_dataset = Dataset(args, val_img_paths, val_mask_paths)

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   pin_memory=True,
                                                   drop_last=True)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 pin_memory=True,
                                                 drop_last=False)

        log = pd.DataFrame(index=[],
                           columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        best_iou = 0
        trigger = 0
        for epoch in range(args.epochs):
            print('Epoch [%d/%d]' % (epoch, args.epochs))

            # train for one epoch
            train_log = train(args, train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            val_log = validate(args, val_loader, model, criterion)

            print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
                  (train_log['loss'], train_log['iou'], val_log['loss'], val_log['iou']))

            tmp = pd.Series(
                [epoch, args.lr, train_log['loss'], train_log['iou'],
                 val_log['loss'], val_log['iou']],
                index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])
            log = log.append(tmp, ignore_index=True)
            log.to_csv('models/%s/log.csv' % args.name, index=False)

            trigger += 1
            if val_log['iou'] > best_iou:
                torch.save(model.state_dict(), './models/%s/model.pth' % args.name)
                best_iou = val_log['iou']
                print("=> saved best model")
                trigger = 0

            # early stopping
            if args.early_stop is not None:
                if trigger >= args.early_stop:
                    print("=> early stopping")
                    break

            torch.cuda.empty_cache()

        args = joblib.load('models/%s/args.pkl' % args.name)
        if not os.path.exists('output/%s' % args.name):
            os.makedirs('output/%s' % args.name)
        joblib.dump(args, 'models/%s/args.pkl' % args.name)

        # create model
        print("=> Testing model %s" % args.arch)
        model = archs.__dict__[args.arch](args)
        model = model.cuda()

        test_img_paths, test_mask_paths = img_paths[test_index], mask_paths[test_index]
        input_paths = test_img_paths

        model.load_state_dict(torch.load('models/%s/model.pth' % args.name))
        model.eval()

        test_dataset = Dataset(args, test_img_paths, test_mask_paths)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  drop_last=False)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            with torch.no_grad():
                for i, (input, target) in tqdm(enumerate(test_loader),
                                               total=len(test_loader)):
                    input = input.cuda()
                    target = target.cuda()

                    # compute output
                    if args.deepsupervision:
                        output = model(input)[-1]
                    else:
                        output = model(input)

                    output = torch.sigmoid(output).data.cpu().numpy()
                    test_img_paths = test_img_paths[args.batch_size * i:
                                                    args.batch_size * (i + 1)]

                    imsave(os.path.join("./output/%s" % args.name, str(i) + ".png"),
                           (output[0, 0, :, :] * 255).astype('uint8'))

            torch.cuda.empty_cache()

        # IoU
        ious = []
        for i in tqdm(range(len(test_mask_paths))):
            input_img = cv2.imread(input_paths[i], 1)[:, :, 0]
            input_img = cv2.resize(input_img, (256, 256))
            mask = np.zeros((256, 256))
            _mask = cv2.imread(test_mask_paths[i])[:, :, 0]
            _mask = cv2.resize(_mask, (256, 256))
            mask = np.maximum(mask, _mask)

            pb = imread('output/%s/' % args.name + str(i) + ".png")

            mask = mask.astype('float32') / 255
            pb = pb.astype('float32') / 255

            iou = iou_score(pb, mask)
            ious.append(iou)

        mean_ious.append(np.mean(ious))
        print("\n")

    print(mean_ious)
    print(np.mean(mean_ious))
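# The iou_score helper used above is not included in this corpus. A minimal
# sketch consistent with the float-valued inputs in [0, 1], assuming both
# prediction and mask are binarized at 0.5 (threshold and eps are
# illustrative choices):
import numpy as np

def iou_score(pred, target, threshold=0.5, eps=1e-7):
    pred_bin = pred > threshold
    target_bin = target > threshold
    intersection = np.logical_and(pred_bin, target_bin).sum()
    union = np.logical_or(pred_bin, target_bin).sum()
    return (intersection + eps) / (union + eps)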