def main():
    """Predict test-set masks for every requested fold of a trained model.

    Reads test image ids from ``args.test_folder`` (files ending in ``.png``),
    then for each fold listed in ``args.fold`` loads that fold's weights and
    writes per-image predictions (with flip TTA) into a per-fold directory
    under the model directory.
    """
    # Ids are the filenames without the 4-char ".png" extension.
    test_ids = np.array([x[:-4] for x in os.listdir(args.test_folder) if x[-4:] == '.png'])
    MODEL_PATH = os.path.join(args.models_dir, args.network + args.alias)
    folds = [int(f) for f in args.fold.split(',')]
    print('Predicting Model:', args.network + args.alias)
    for fold in folds:
        # Drop the previous fold's graph so models don't accumulate in memory.
        K.clear_session()
        print('***************************** FOLD {} *****************************'.format(fold))
        # Initialize Model
        weights_path = os.path.join(MODEL_PATH, args.prediction_weights.format(fold))
        model, preprocess = get_model(args.network,
                                      input_shape=(args.input_size, args.input_size, 3),
                                      freeze_encoder=args.freeze_encoder)
        # Compile only so the model is built for load_weights/predict; the
        # optimizer and loss are never used during inference.
        model.compile(optimizer=RMSprop(lr=args.learning_rate),
                      loss=make_loss(args.loss_function),
                      metrics=[Kaggle_IoU_Precision])
        model.load_weights(weights_path)
        # Save test predictions to disk.
        dir_path = os.path.join(MODEL_PATH, args.prediction_folder.format(fold))
        # os.makedirs instead of os.system("mkdir ..."): portable, no shell,
        # creates parent dirs, and doesn't fail when the folder already exists.
        os.makedirs(dir_path, exist_ok=True)
        predict_test(model=model,
                     preds_path=dir_path,
                     ids=test_ids,
                     batch_size=args.batch_size * 2,  # inference can afford a larger batch
                     TTA='flip',
                     preprocess=preprocess)
        gc.collect()
def _init_params(self):
    """Build the network, wrapping model, loss, optimizer and LR schedule from config."""
    model_cfg = self.config['model']
    self.net = get_net(model_cfg, self.config['load_weights'])
    self.net.cuda()
    self.model = get_model(model_cfg)
    self.criterion = get_loss(model_cfg)
    self.optimizer = self._get_optim()
    # Halve the learning rate at each of the listed epoch milestones.
    self.scheduler = optim.lr_scheduler.MultiStepLR(
        self.optimizer,
        milestones=[40, 55, 70, 95],
        gamma=0.5,
    )
def load_model(self):
    """Construct and return the global model for the configured dataset.

    Returns:
        The model instance produced by ``models.get_model`` for
        ``self.config.dataset``.
    """
    dataset = self.config.dataset
    logging.info('dataset: {}'.format(dataset))
    # Set up global model. (The previously computed `self.config.paths.model`
    # was never used in this method and has been removed.)
    model = models.get_model(dataset)
    logging.debug(model)
    return model
def _init_params(self):
    """Wire up the GAN: nets, losses, adversarial trainer, optimizers, schedulers."""
    model_cfg = self.config['model']
    self.criterionG, criterionD = get_loss(model_cfg)
    self.netG, netD = get_nets(model_cfg)
    self.netG.to(self.device)
    # The discriminator and its loss are owned by the adversarial trainer.
    self.adv_trainer = self._get_adversarial_trainer(model_cfg['d_name'], netD, criterionD)
    self.model = get_model(model_cfg)
    # Only optimize generator parameters that require gradients.
    trainable = filter(lambda p: p.requires_grad, self.netG.parameters())
    self.optimizer_G = self._get_optim(trainable)
    self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
    self.scheduler_G = self._get_scheduler(self.optimizer_G)
    self.scheduler_D = self._get_scheduler(self.optimizer_D)
def _init_params(self):
    """Set up GAN training state, then warm-start the generator from best_fpn.h5."""
    model_cfg = self.config['model']
    self.criterionG, criterionD = get_loss(model_cfg)
    self.netG, netD = get_nets(model_cfg)
    self.netG.cuda()
    self.adv_trainer = self._get_adversarial_trainer(model_cfg['d_name'], netD, criterionD)
    self.model = get_model(model_cfg)
    # Optimize only generator parameters with gradients enabled.
    trainable_params = filter(lambda p: p.requires_grad, self.netG.parameters())
    self.optimizer_G = self._get_optim(trainable_params)
    self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
    self.scheduler_G = self._get_scheduler(self.optimizer_G)
    self.scheduler_D = self._get_scheduler(self.optimizer_D)
    # Restore pretrained generator weights; mapping to CPU first makes the
    # load work no matter which device produced the checkpoint.
    # NOTE(review): checkpoint filename is hard-coded — confirm it exists at runtime.
    self.netG.load_state_dict(torch.load("best_fpn.h5", map_location='cpu')['model'])
def _init_params(self):
    """Initialize GAN components, resuming the generator from its best checkpoint."""
    cfg = self.config['model']
    self.criterionG, criterionD = get_loss(cfg)
    netG, netD = get_nets(cfg)
    # Resume training: restore the generator weights saved for this experiment.
    checkpoint = torch.load('best_{}.h5'.format(self.config['experiment_desc']))
    generator = netG.cuda()
    generator.load_state_dict(checkpoint['model'])
    self.netG = generator
    self.adv_trainer = self._get_adversarial_trainer(cfg['d_name'], netD, criterionD)
    self.model = get_model(cfg)
    self.optimizer_G = self._get_optim(
        filter(lambda p: p.requires_grad, self.netG.parameters()))
    self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
    self.scheduler_G = self._get_scheduler(self.optimizer_G)
    self.scheduler_D = self._get_scheduler(self.optimizer_D)
def main():
    """Evaluate the most recently saved binary MRI-quality model on the ACDC test split."""
    arg_parser = argparse.ArgumentParser(
        description='Binary MRI Quality Classification')
    arg_parser.add_argument('--yaml_path', type=str, metavar='YAML',
                            default="config/acdc_binary_classification.yaml",
                            help='Enter the path for the YAML config')
    cli_args = arg_parser.parse_args()
    # Register the "!join" YAML tag used by the config files.
    yaml.add_constructor("!join", yaml_var_concat)
    with open(cli_args.yaml_path, 'r') as cfg_file:
        train_args = yaml.load(cfg_file, Loader=yaml.Loader)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # ImageNet normalization stats, as expected by pretrained backbones.
    transform_pipeline = transforms.Compose([
        Resize((224, 224)),
        OneToThreeDimension(),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    test_set = ACDCDataset(train_args["pos_samps_test"],
                           train_args["neg_samps_test"],
                           transform=transform_pipeline)
    test_loader = DataLoader(test_set,
                             batch_size=train_args["batch_size"],
                             shuffle=False,
                             num_workers=4)
    model_ft = get_model(train_args["model"], device,
                         pretrained=train_args["pretrained"])
    # Restore the newest checkpoint saved for this architecture.
    state = get_most_recent_model(train_args["model"], train_args["model_save_dir"])
    model_ft.load_state_dict(state)
    test(model_ft, test_loader, len(test_set), device=device)
def keras_fit_generator():
    """Fine-tune the segmentation network fold-by-fold using augmented generators."""
    kfolds = [0]
    # kfolds = [2, 3, 4]
    for fold in kfolds:
        # Reset the backend graph so successive folds don't accumulate state.
        K.clear_session()
        print('fold = {}'.format(fold))
        print('begin load data')
        X_train, y_train, X_val, y_val = load_data(fold)
        print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
        # (1250, 256, 256, 1) (1250, 256, 256, 1) (127, 256, 256, 1) (127, 256, 256, 1)
        print('load data over')
        input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
        model, preprocess_fn = get_model(network=network,
                                         input_shape=input_shape,
                                         freeze_encoder=False)
        # Warm-start from this fold's pretrained weights.
        model.load_weights(pretrain_weight + str(fold) + '.hdf5')
        # Materialize one fixed (augmented) validation batch covering the full val set.
        val_gen = Generator(X_val, y_val, batch_size=len(y_val),
                            shuffle=True, aug=True, process=preprocess_fn)
        X_val_steps, y_val_steps = next(val_gen.generator)
        train_gen = Generator(X_train, y_train, batch_size=batch_size,
                              shuffle=True, aug=True, process=preprocess_fn)
        model.compile(optimizer=Adam(lr=learning_rate),
                      loss=make_loss(loss_name=loss_function),
                      metrics=[dice_coef])
        model.summary()
        c_backs = get_callback(callback, fold, num_sample=len(X_train))
        # steps_per_epoch is doubled relative to a single pass over the data.
        model.fit_generator(train_gen.generator,
                            steps_per_epoch=(len(X_train) // batch_size) * 2,
                            epochs=epochs,
                            verbose=1,
                            shuffle=True,
                            validation_data=(X_val_steps, y_val_steps),
                            callbacks=c_backs,
                            use_multiprocessing=False)
        gc.collect()
def main(args):
    """Train an image-classification model through a parameter server.

    Builds data loaders and the model, optionally resumes from a checkpoint,
    then runs the train/validate loop, collecting statistics each epoch.

    Returns:
        (train_statistics, val_statistics) accumulated over all epochs.
    """
    # Timestamp suffix used to disambiguate checkpoint names.
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available() is True:
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')
    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)
    # Gradient accumulation: split oversized batches into chunks of 256.
    # NOTE(review): the dataset/model restriction is commented out here but a
    # sibling variant of this function keeps it — confirm which is intended.
    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            # Effective step size is 256 when accumulating.
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        # presumably validation runs in batches of 1024 — TODO confirm.
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024
    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint (restores server and statistics too)
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    # Use the model's own criterion attribute if it defines one, else CrossEntropyLoss.
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    # criterion = nn.CrossEntropyLoss().cuda()
    if args.bar is True:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch,
                                        args.workers_num, args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics, args.client)
        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0
        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server,
                                           val_statistics, val_bar)
        # Record per-epoch losses plus parameter-server weight/gradient statistics.
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0
        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                      epoch + 1, train_time, train_loss, train_error, val_time, val_loss, val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        # Periodic checkpoint (skips epoch 0).
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server},
                            sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()
    return train_statistics, val_statistics
def train_model(args):
    """Train (or load) the video-classification model and persist its state.

    If trained weights already exist and optimizer-state loading is not
    requested, the weights are simply loaded and training is skipped.
    Otherwise the model is compiled, optionally restored together with its
    optimizer state, trained with fit_generator, and then the weights,
    optimizer state, history plots and a settings summary are saved.

    Returns:
        (model, generator_train_batch, generator_val_batch, generator_test_batch)
    """
    train_samples, val_samples = get_num_samples(args.test_file, args.train_file)
    img_path = ''
    m.doFlip = args.doFlip
    m.doScale = args.doScale
    generator_test_batch, generator_train_batch, generator_val_batch, model = m.get_model(args)
    # Fast path: weights already on disk and no optimizer-state resume requested.
    if os.path.isfile(args.save_path + '/weights.h5') and args.load_opt_train == 0:
        # model.summary()
        model.load_weights(args.save_path + '/weights.h5')
        return model, generator_train_batch, generator_val_batch, generator_test_batch
    # lr = 0.0005  # org (dead assignment removed; 0.05 was always the effective value)
    lr = 0.05  # OK for C3D
    # lr = 0.005
    # opt = SGD(lr=lr, momentum=0.9, nesterov=True)
    opt = Adam(lr=lr, decay=0.9)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    # model.summary()
    # Restore weights plus optimizer state to resume training.
    if args.load_opt_train == 1:
        model.load_weights(args.save_path + '/weights.h5')
        # Private Keras API: forces the optimizer to build its weights so
        # set_weights below has something to fill.
        model._make_train_function()
        # FIX: the load path was `args.save_path + 'optimizer.pkl'` (missing
        # the '/'), which never matched the save path used below.
        with open(args.save_path + '/optimizer.pkl', 'rb') as f:
            weight_values = pickle.load(f)
        model.optimizer.set_weights(weight_values)
    print(args)
    history = model.fit_generator(
        generator_train_batch(args.train_file, args.batch_size, args.num_classes, img_path),
        steps_per_epoch=train_samples // args.batch_size,
        epochs=args.epochs,
        # callbacks=[onetenth_4_8_12(lr)],
        validation_data=generator_val_batch(args.test_file, args.batch_size,
                                            args.num_classes, img_path),
        validation_steps=val_samples // args.batch_size,
        verbose=1)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    plot_history(history, args.save_path)
    save_history(history, args.save_path)
    # Save model
    model.save_weights(args.save_path + '/weights.h5')
    # Save optimizer state so training can be resumed exactly.
    if args.save_opt_train == 1:
        symbolic_weights = getattr(model.optimizer, 'weights')
        weight_values = K.batch_get_value(symbolic_weights)
        with open(args.save_path + '/optimizer.pkl', 'wb') as f:
            pickle.dump(weight_values, f)
    # Human-readable record of the run configuration.
    with open(args.save_path + '/settings.txt', 'w') as outfile:
        outfile.write('num_classes:\t' + str(args.num_classes) + '\n')
        outfile.write('batch_size:\t' + str(args.batch_size) + '\n')
        outfile.write('epochs:\t' + str(args.epochs) + '\n')
        outfile.write('img_path:\t' + str(img_path) + '\n')
        outfile.write('train_file:\t' + str(args.train_file) + '\n')
        outfile.write('test_file:\t' + str(args.test_file) + '\n')
        outfile.write('lr:\t' + str(lr) + '\n')
    return model, generator_train_batch, generator_val_batch, generator_test_batch
# NOTE(review): fragment starts mid option-parsing loop; the enclosing
# function/loop header is outside this view. Python 2 syntax (print statements).
elif o in ("-t", "--tag"):
    settings['tag'] = a
else:
    assert False, "unhandled option"
# Resolve the requested failure model by name.
# NOTE(review): eval on a user-supplied string is unsafe — consider
# getattr(Model, model.upper()) instead; flagged, not changed.
model_eval = None
try:
    model_eval = eval("Model." + model.upper())
except AttributeError:
    print "You have selected a model that doesn't exist. Defaulting to RANDOM"
    model_eval = Model.RANDOM
nova = client.Client(username, api_key, project_id, auth_url)
all_servers = nova.servers.list()
# Only consider servers that are currently running.
servers = [x for x in all_servers if x.status == "ACTIVE"]
# Optionally restrict to servers carrying the opt-in tag set to '1'.
if 'tag' in settings:
    servers = [x for x in servers
               if settings['tag'] in x.metadata and x.metadata[settings['tag']] == '1']
if len(servers) == 0:
    print "No servers found. Exiting now..."
    sys.exit()
# Instantiate the selected failure model and unleash it on the servers.
FailureModel = get_model(model_eval)
m = FailureModel(nova, servers, settings)
m.anarchy()

if __name__ == "__main__":
    main()
# Top-level training setup: datasets, loaders, model, optimizer, callbacks.
# Names defined here (trainloader, net, optimizer, ...) are used further down
# the file, outside this view.
print('==> Preparing data..')
data = ds[args.dataset]
meta = dsmeta[args.dataset]
classes, nc, size = meta['classes'], meta['nc'], meta['size']
trainset, valset, testset = data(args)

# Toxic comments uses its own data loaders — NLP datasets are passed through
# unchanged instead of being wrapped in a DataLoader.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=args.batch_size, shuffle=True,
    num_workers=args.num_workers) if (trainset is not None) and (args.dataset not in nlp_data) else trainset
valloader = torch.utils.data.DataLoader(
    valset, batch_size=args.batch_size, shuffle=True,
    num_workers=args.num_workers) if (valset is not None) and (args.dataset not in nlp_data) else valset
testloader = torch.utils.data.DataLoader(
    testset, batch_size=args.batch_size, shuffle=True,
    num_workers=args.num_workers) if (args.dataset not in nlp_data) else testset

print('==> Building model..')
net = get_model(args, classes, nc)
net = nn.DataParallel(net) if args.parallel else net
# Default optimizer is SGD with momentum; NLP and ModelNet tasks use Adam instead.
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)
if (args.dataset in nlp_data) or ('modelnet' in args.dataset):
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

print('==> Setting up callbacks..')
# Unique run name combining timestamp and run id for TensorBoard logs.
current_time = datetime.now().strftime('%b%d_%H-%M-%S') + "-run-" + str(args.run_id)
tboard = TensorBoard(write_graph=False, comment=current_time, log_dir=args.log_dir)
tboardtext = TensorBoardText(write_epoch_metrics=False, comment=current_time, log_dir=args.log_dir)

# NOTE(review): decorated callback body continues past this view.
@torchbearer.callbacks.on_start
def write_params(_):
def main():
    """Train the segmentation network on each requested fold of the fold CSV."""
    # Read in the data (the folds CSV); each row is one sample entry.
    train = pd.read_csv(args.folds_csv)
    # Configured model save path.
    MODEL_PATH = os.path.join(args.models_dir, args.network + args.alias)
    # Fold split: the input data is already pre-assigned to folds; this is the
    # list of stored fold indices (e.g. [1, 2, 3, 4, 5]).
    folds = [int(f) for f in args.fold.split(',')]
    print('Training Model:', args.network + args.alias)
    for fold in folds:
        # Reset backend state between folds.
        K.clear_session()
        print('***************************** FOLD {} *****************************'.format(fold))
        # Refuse to overwrite an existing model directory on the first fold.
        if fold == 0:
            if os.path.isdir(MODEL_PATH):
                raise ValueError('Such Model already exists')
            os.system("mkdir {}".format(MODEL_PATH))
        # Train/Validation sampling: hold out the current fold for validation.
        df_train = train[train.fold != fold].copy().reset_index(drop=True)
        df_valid = train[train.fold == fold].copy().reset_index(drop=True)
        # Train on pseudolabels only (shuffled deterministically).
        if args.pseudolabels_dir != '':
            pseudolabels = pd.read_csv(args.pseudolabels_csv)
            df_train = pseudolabels.sample(frac=1, random_state=13).reset_index(drop=True)
        # Keep only non-black images (more than one unique pixel value).
        ids_train, ids_valid = df_train[df_train.unique_pixels > 1].id.values, df_valid[
            df_valid.unique_pixels > 1].id.values
        print('Training on {} samples'.format(ids_train.shape[0]))
        print('Validating on {} samples'.format(ids_valid.shape[0]))
        # Initialize model
        weights_path = os.path.join(MODEL_PATH, 'fold_{fold}.hdf5'.format(fold=fold))
        # Get the model
        model, preprocess = get_model(args.network,
                                      input_shape=(args.input_size, args.input_size, 3),
                                      freeze_encoder=args.freeze_encoder)

        # LB metric: lovasz loss operates on logits, so its threshold is 0.
        def lb_metric(y_true, y_pred):
            return Kaggle_IoU_Precision(y_true, y_pred,
                                        threshold=0 if args.loss_function == 'lovasz' else 0.5)
        model.compile(optimizer=RMSprop(lr=args.learning_rate),
                      loss=make_loss(args.loss_function),
                      metrics=[lb_metric])
        if args.pretrain_weights is None:
            print('No weights passed, training from scratch')
        else:
            wp = args.pretrain_weights.format(fold)
            print('Loading weights from {}'.format(wp))
            model.load_weights(wp, by_name=True)
        # Get augmentations
        augs = get_augmentations(args.augmentation_name, p=args.augmentation_prob)
        # Data generator
        dg = SegmentationDataGenerator(input_shape=(args.input_size, args.input_size),
                                       batch_size=args.batch_size,
                                       augs=augs,
                                       preprocess=preprocess)
        train_generator = dg.train_batch_generator(ids_train)
        validation_generator = dg.evaluation_batch_generator(ids_valid)
        # Get callbacks
        callbacks = get_callback(args.callback, weights_path=weights_path, fold=fold)
        # Fit the model with generators (wrapped for thread-safe multi-worker use).
        model.fit_generator(generator=ThreadsafeIter(train_generator),
                            steps_per_epoch=ids_train.shape[0] // args.batch_size * 2,
                            epochs=args.epochs,
                            callbacks=callbacks,
                            validation_data=ThreadsafeIter(validation_generator),
                            validation_steps=np.ceil(ids_valid.shape[0] / args.batch_size),
                            workers=args.num_workers)
        gc.collect()
# NOTE(review): fragment starts inside a transforms.Compose([...]) call whose
# opening lines are outside this view.
    OneToThreeDimension(),
    ToTensor(),
    # ImageNet normalization stats for pretrained backbones.
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# One dataset per split, built from the positive/negative sample lists.
acdc_dataset = {x: ACDCDataset(train_args["pos_samps_"+x],
                               train_args["neg_samps_"+x],
                               transform=composed)
                for x in ["train", "val", "test"]}
dataloader = {x: DataLoader(acdc_dataset[x],
                            batch_size=train_args["batch_size"],
                            shuffle=True,
                            num_workers=4,
                            # sampler=sampler[x]
                            )
              for x in ["train", "val", "test"]}
dataset_sizes = {x: len(acdc_dataset[x]) for x in ["train", "val", "test"]}
model_ft = get_model(train_args["model"], device, pretrained=train_args["pretrained"])
criterion = get_loss(train_args["loss_name"])
optimizer_ft = optim.Adam(model_ft.parameters(), lr=1e-5)
# Train, then evaluate on the held-out test split.
model_ft = train(model_ft, criterion, optimizer_ft, num_epochs=train_args["epoch"])
test(model_ft, dataloader["test"], dataset_sizes["test"])
def _init_params(self):
    """Create the generator (moved to GPU) and the wrapping model from config.

    The discriminator returned by ``get_nets`` is not used by this trainer,
    so it is discarded instead of being bound to an unused local.
    """
    model_cfg = self.config['model']
    self.netG, _ = get_nets(model_cfg)  # discriminator intentionally unused here
    self.netG.cuda()
    self.model = get_model(model_cfg)
def train(rank, args, shared_model, optimizer, env_conf):
    """A3C training worker: rolls out episodes and pushes gradients to the shared model.

    Each worker syncs from ``shared_model``, collects up to ``args.num_steps``
    transitions, computes GAE-style policy/value losses, backpropagates, and
    copies gradients back to the shared model. Worker 0 also logs progress.
    Runs forever (infinite while-loop).
    """
    ptitle('Train {0}: {1}'.format(args.env, rank))
    print('Start training agent: ', rank)
    # Only rank 0 keeps a logger and a step counter for periodic logging.
    if rank == 0:
        logger = Logger(args.log_dir + '_losses/')
        train_step = 0
    # Round-robin GPU assignment across workers; negative means CPU.
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = database_env(env_conf, seed=0)
    # Build a per-worker optimizer over the *shared* parameters if none given.
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, env_conf["num_actions"], gpu_id=0,
                             lstm_feats=args.lstm_feats)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
    while True:
        # Sync the local copy with the latest shared weights.
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                if train_step > 0:
                    # Pin the last completed episode's total reward for logging.
                    pinned_eps_reward = player.env.sum_reward
                    eps_reward = 0
            # Fresh LSTM state at episode start; otherwise detach to cut the graph.
            if args.lstm_feats:
                player.cx, player.hx = init_linear_lstm(args.lstm_feats, gpu_id)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        # Roll out up to num_steps transitions (or until episode end).
        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward
            if player.done:
                break
        if player.done:
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                    # print ("rewards: ", player.env.rewards)
                    # print ("actions: ", player.actions)
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        # Bootstrap value R: zero at episode end, otherwise V(s_T) from the model.
        R = torch.zeros(1, 1, 1, 1)
        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1, 1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # Backward pass over the rollout: discounted returns + GAE advantages.
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])
            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            value_loss = value_loss + (0.5 * advantage * advantage).mean()
            # TD residual feeding the generalized advantage estimate.
            delta_t = player.values[
                i + 1].data * args.gamma + reward_i - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            # Policy gradient term plus entropy bonus (weighted by entropy_alpha).
            policy_loss = policy_loss - \
                (player.log_probs[i] * Variable(gae)).mean () - \
                (args.entropy_alpha * player.entropies[i]).mean ()
        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        # Copy local gradients onto the shared model before stepping.
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'sum_loss': sum_loss,
                    'value_loss': value_loss,
                    'policy_loss': policy_loss,
                    # NOTE(review): 'advanage' key is misspelled but kept —
                    # it is a runtime string consumed by the logger.
                    'advanage': advantage,
                    'train eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
# Top-level experiment setup: seeds, loaders, model, optimizer, scheduler, loss.
# Names defined here (model, optimizer, criterion, ...) are used further down
# the file, outside this view.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
set_global_seed(args.seed)
prepare_cudnn(deterministic=True)
# Run name encodes task, architecture, encoder, batch size and date.
sub_name = f'Model_{args.task}_{args.model_type}_{args.encoder}_bs_{args.bs}_{str(datetime.datetime.now().date())}'
logdir = f"./logs/{sub_name}" if args.logdir is None else args.logdir
# Encoder-specific normalization matching the pretrained weights.
preprocessing_fn = smp.encoders.get_preprocessing_fn(args.encoder, args.encoder_weights)
loaders = prepare_loaders(path=args.path, bs=args.bs,
                          num_workers=args.num_workers,
                          preprocessing_fn=preprocessing_fn,
                          preload=args.preload,
                          image_size=(args.height, args.width),
                          augmentation=args.augmentation,
                          task=args.task)
# Pull the test loader out so only train/val loaders remain for training.
test_loader = loaders['test']
del loaders['test']
model = get_model(model_type=args.segm_type,
                  encoder=args.encoder,
                  encoder_weights=args.encoder_weights,
                  activation=None,
                  task=args.task)
optimizer = get_optimizer(optimizer=args.optimizer,
                          lookahead=args.lookahead,
                          model=model,
                          separate_decoder=args.separate_decoder,
                          lr=args.lr,
                          lr_e=args.lr_e)
# Both branches use ReduceLROnPlateau; only the decay factor differs
# (0.6 when explicitly requested, otherwise a steeper 0.3) — presumably
# intentional; confirm with the experiment owner.
if args.scheduler == 'ReduceLROnPlateau':
    scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=3)
else:
    scheduler = ReduceLROnPlateau(optimizer, factor=0.3, patience=3)
if args.loss == 'BCEDiceLoss':
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
elif args.loss == 'BCEJaccardLoss':
    criterion = smp.utils.losses.BCEJaccardLoss(eps=1.)
elif args.loss == 'FocalLoss':
    criterion = FocalLoss()
def test(args, shared_model, env_conf):
    """A3C validation worker: repeatedly evaluates the shared model and checkpoints it.

    Runs forever: after each finished episode it logs the reward, tracks a
    100-episode running mean, saves the best-scoring model, and periodically
    saves numbered checkpoints.
    """
    ptitle('Valid agent')
    # Use the dedicated validation GPU if configured, else the last listed GPU.
    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id
    log = {}
    logger = Logger(args.log_dir)
    # Snapshot the source files into the log directory for reproducibility.
    create_dir(args.log_dir + "models/")
    os.system("cp *.sh " + args.log_dir)
    os.system("cp *.py " + args.log_dir)
    os.system("cp models/models.py " + args.log_dir + "models/")
    os.system("cp models/basic_modules.py " + args.log_dir + "models/")
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    # Record every CLI argument and env-config entry in the log file.
    d_args = vars(args)
    env_conf_log = env_conf
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    for k in env_conf_log.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(
            k, env_conf_log[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = database_env(env_conf, seed=0, dstype="test")
    # Cap validation episodes at 900 environment steps.
    env.max_step = 900
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, env_conf["num_actions"], gpu_id=0,
                             lstm_feats=args.lstm_feats)
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    player.model.eval()
    # flag: re-sync weights from the shared model at the start of each episode.
    flag = True
    create_dir(args.save_model_dir)
    recent_episode_scores = ScalaTracker(100)
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False
        player.action_test()
        reward_sum += player.reward.mean()
        if player.done:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))
            recent_episode_scores.push(reward_sum)
            # Save the best model by the 100-episode running mean.
            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = {}
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                'best_model_' + args.env))
            # Periodic numbered checkpoint.
            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save, '{0}{1}.dat'.format(
                                args.save_model_dir,
                                args.env + '_' + str(num_tests)))
            if num_tests % args.log_period == 0:
                print("------------------------------------------------")
                print(args.env)
                print("Log test #:", num_tests)
                print("sum rewards: ", player.env.sum_reward)
                print("action_history\n", player.env.action_his)
                print()
                print("------------------------------------------------")
                log_info = {
                    'mean_reward': reward_mean,
                    '100_mean_reward': recent_episode_scores.mean()
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)
            # Reset episode accumulators and the environment; the sleep throttles
            # validation so training workers get most of the compute.
            reward_sum = 0
            player.eps_len = 0
            player.clear_actions()
            state = player.env.reset()
            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
# In[8]: def sig_iou_score(y_true, y_pred): return iou_score(y_true,tf.math.sigmoid(y_pred)) # tf.math.sigmoid(y_pred) def sigm_binary_accuracy(y_true, y_pred): return binary_accuracy(y_true, tf.math.sigmoid(y_pred)) #tf.math.sigmoid(y_pred) loss_function ='seloss' # Get the model model = get_model(network = 'unet_resnext_50_margo',input_shape=(512, 512, 3), freeze_encoder=False) Adam_opt =Adam(lr=0.00005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)#, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.001, amsgrad=True) # In[ ]: loss_history = [] weight_path = "/data/margokat/models_saved/inria/{}_weights.best.hdf5".format('resnext50_unet_margo_se') checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True) reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=2, verbose=1, mode='auto', epsilon=0.0001, cooldown=4, min_lr=0.0001) early = EarlyStopping(monitor="val_loss",
# Entry point: build the shared A3C model and (optionally) a shared optimizer.
# NOTE(review): the final call continues past the end of this view.
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    # gpu_ids == -1 means CPU-only; otherwise seed CUDA and use spawn start
    # method, which is required for CUDA with multiprocessing.
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    env_conf = setup_env_conf(args)
    # The shared model lives on CPU (gpu_id=-1) so all workers can sync from it.
    shared_model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, env_conf["num_actions"], gpu_id=-1,
                             lstm_feats=args.lstm_feats)
    if args.load:
        # map_location keeps loaded tensors on CPU regardless of where saved.
        saved_state = torch.load(args.load,
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()
    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
def main(model='resnet18', rep_dim=490, dataset='curated', base_path=None,
         unzip=False, ae_train=True, clf_train=True, ae_epochs=100,
         clf_epochs=100, batch_size=4, accumulation_steps=32, ae_loadfile=None,
         clf_loadfile=None, save_model=True, ae_test=True, accumulate=False):
    '''
    model : CNN architecture to use ['LeNet', 'VGG', ...]
    data : 'curated' or 'full'
    base_path : path/to/ChestXRay eg. /home/paperspace/ChestXRay

    Pipeline: optionally pretrain an autoencoder, transfer its weights into a
    classifier, find the hypersphere center, train the classifier, and test.
    '''
    if base_path is None:
        raise ValueError('Please point base_path to ChestXRay/')
    if ae_train and (ae_loadfile or clf_loadfile):
        raise ValueError(
            'Please either set ae_train to True or specify a loadfile but not both.'
        )
    filename = setup_logging(base_path=base_path, model=model, rep_dim=rep_dim)
    logger = logging.getLogger()
    logging.info('Architecture : {}'.format(model))
    logging.info('Representaion Dimensionality : {}'.format(rep_dim))
    logging.info('Dataset : {}'.format(dataset))
    if unzip:
        unzip_data(base_path)
    trainloader = get_dataloader(dataset=dataset, set_='train',
                                 batch_size=batch_size)
    testloader = get_dataloader(dataset=dataset, set_='test',
                                batch_size=batch_size)
    #autoencoder = resnet18(num_classes=490, autoencoder=True)
    autoencoder = get_model(model=model, kind='autoencoder', rep_dim=rep_dim)
    if ae_loadfile is not None:
        ae_load_path = os.path.join(base_path,
                                    'models/saved_models/') + ae_loadfile
        autoencoder.load_state_dict(torch.load(ae_load_path), strict=False)
    # Track where (if anywhere) freshly pretrained AE weights were written.
    # FIX: previously `save_path` was read unconditionally below, raising
    # NameError whenever ae_train or save_model was False.
    ae_save_path = None
    if ae_train:
        autoencoder = pretrain(trainloader=trainloader,
                               autoencoder=autoencoder,
                               ae_epochs=ae_epochs,
                               accumulation_steps=accumulation_steps,
                               accumulate=accumulate)
        if save_model:
            ae_save_path = os.path.join(
                base_path, 'models/saved_models/') + 'ae: ' + filename + '.pt'
            torch.save(autoencoder.state_dict(), ae_save_path)
    if ae_test:
        pretest(testloader=testloader, autoencoder=autoencoder)
    # Free the autoencoder before building the classifier to limit peak memory.
    del autoencoder
    classifier = get_model(model=model, kind='classifier', rep_dim=rep_dim)
    # Transfer the pretrained encoder weights into the classifier, but only if
    # they were actually saved above (strict=False skips mismatched layers).
    if ae_save_path is not None:
        classifier.load_state_dict(torch.load(ae_save_path), strict=False)
    if clf_loadfile is not None:
        clf_load_path = os.path.join(base_path,
                                     'models/saved_models/') + clf_loadfile
        classifier.load_state_dict(torch.load(clf_load_path), strict=False)
    c = find_center(trainloader=trainloader, classifier=classifier,
                    rep_dim=rep_dim)
    if clf_train:
        classifier = train(trainloader=trainloader,
                           classifier=classifier,
                           clf_epochs=clf_epochs,
                           accumulation_steps=accumulation_steps,
                           c=c,
                           accumulate=accumulate)
        if save_model:
            save_path = os.path.join(
                base_path, 'models/saved_models/') + 'clf: ' + filename + '.pt'
            torch.save(classifier.state_dict(), save_path)
    test(testloader=testloader, classifier=classifier, c=c)
    return
def main(args):
    """Train an image-classification model through a parameter server,
    validating and checkpointing every epoch.

    Returns:
        (train_statistics, val_statistics): Statistics objects accumulated
        over the whole run.
    """
    # Short numeric run identifier appended to checkpoint names.
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available() is True:
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')
    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)
    # Large ImageNet/ResNet batches are split into 256-sample micro-batches
    # whose gradients are accumulated before each optimizer step.
    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # Derive iterations-per-epoch and validation length from the dataset size.
    # NOTE(review): the `// 1024` presumably reflects a fixed validation batch
    # size of 1024 — confirm against load_data/validate.
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(
                train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(
            train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024
    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint; note the server and both statistics
    # objects are restored wholesale, replacing the fresh ones created above.
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    # Disabled experiment: re-initializing a synchronously-trained server for
    # asynchronous workers (per-shard weights, scaled/warm-up learning rate).
    # Kept for reference.
    # # Synchronous to Asynchronous Adjustments
    # print('Resetting Parameter Server to Asynchronous Mode')
    # logging.info('Resetting Parameter Server to Asynchronous Mode', extra=args.client)
    # server._shards_weights = list()
    # weights = server._get_model_weights()
    # for i in range(0, args.workers_num):
    #     server._shards_weights.append(deepcopy(weights))
    # server._workers_num = args.workers_num
    # # learning rate initialization
    # batch_baseline = args.baseline
    # server._lr = args.lr * np.sqrt((args.workers_num * args.batch_size) // batch_baseline) / (args.workers_num)
    # server._fast_im = args.fast_im
    # server._lr_warm_up = args.lr_warm_up
    # server._current_lr = args.lr
    # server._m_off = args.m_off
    # server._current_momentum = args.momentum
    # server._iterations_per_epoch = args.iterations_per_epoch
    # server._momentum = args.momentum
    # server._client = args.client
    # if args.fast_im is True:
    #     end_lr = args.lr * ((args.workers_num * args.batch_size) // batch_baseline) / np.sqrt(args.workers_num)
    #     start_lr = args.lr / (args.workers_num)
    #     server._lr = end_lr
    #     server._start_lr = start_lr
    #     server._lr_increment_const = (end_lr - start_lr) / (args.iterations_per_epoch * 5)
    #     log_str = 'Fast ImageNet Mode - Warm Up [{:.5f}]->[{:.5f}] In 5 Epochs'.format(start_lr, end_lr)
    #     logging.info(log_str, extra=args.client)
    #     print(log_str)
    # else:
    #     server._start_lr = 0
    #     server._lr_increment_const = 0
    # for param_group in server._optimizer.param_groups:
    #     param_group['lr'] = start_lr
    #     param_group['momentum'] = server._momentum
    # # Synchronous to Asynchronous Adjustments - End
    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.bar is True:
        train_bar = IncrementalBar('Training ',
                                   max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len,
                                 suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    # train_time doubles as the epoch start timestamp and, after subtraction,
    # as the elapsed training time; it is re-armed at the end of each epoch.
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server,
                                        epoch, args.workers_num,
                                        args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics,
                                        args.client)
        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0
        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion,
                                           server, val_statistics, val_bar)
        # Persist per-epoch metrics plus server-side weight/gradient
        # diagnostics (worker-vs-mean and worker-vs-master distances).
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(
            server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(
            server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0
        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
            'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                          train_error, val_time, val_loss,
                                                                          val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        # Periodic checkpoint: stores model weights plus the full server and
        # statistics objects so a resume restores the exact training state.
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()
    return train_statistics, val_statistics
# Build the static TF1 inference graph: placeholder inputs, the detection
# model in 'test' mode, and the tensors fetched each session run.
num_levels = len(config['runtime_graph_gen_kwargs']['level_configs'])
t_initial_vertex_features = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# One coordinate placeholder per graph level plus the base level.
t_vertex_coord_list = [
    tf.placeholder(dtype=tf.float32, shape=[None, 3])
    for _ in range(num_levels + 1)
]
t_edges_list = [
    tf.placeholder(dtype=tf.int32, shape=[None, 2])
    for _ in range(num_levels)
]
t_keypoint_indices_list = [
    tf.placeholder(dtype=tf.int32, shape=[None, 1])
    for _ in range(num_levels)
]
t_is_training = tf.placeholder(dtype=tf.bool, shape=[])

model = get_model(config['model_name'])(num_classes=NUM_CLASSES,
                                        box_encoding_len=BOX_ENCODING_LEN,
                                        mode='test',
                                        **config['model_kwargs'])
t_logits, t_pred_box = model.predict(t_initial_vertex_features,
                                     t_vertex_coord_list,
                                     t_keypoint_indices_list,
                                     t_edges_list,
                                     t_is_training)
t_probs = model.postprocess(t_logits)
t_predictions = tf.argmax(t_probs, axis=1, output_type=tf.int32)

# optimizers ==================================================================
global_step = tf.Variable(0, dtype=tf.int32, trainable=False)
fetches = {
    'step': global_step,
    'predictions': t_predictions,
    'probs': t_probs,
    'pred_box': t_pred_box
}
def get_models(img_rows, img_cols, fold):
    """Build the network for one CV fold, restore its weights, and compile it.

    Returns the compiled model together with its preprocessing function.
    """
    net, preprocess_fn = get_model(network=network,
                                   input_shape=(img_rows, img_cols, 1),
                                   freeze_encoder=False)
    checkpoint_file = weights_path + str(fold) + '.hdf5'
    net.load_weights(checkpoint_file)
    net.compile(optimizer=Adam(lr=learning_rate),
                loss=make_loss(loss_name=loss_function),
                metrics=[dice_coef])
    return net, preprocess_fn