def build_and_train(game="pong", run_ID=0, cuda_idx=None):
    """Wire up an rlpyt DQN pipeline for an Atari game and run training.

    Args:
        game: Atari game name passed to the environment.
        run_ID: run identifier forwarded to the logger context.
        cuda_idx: GPU index, or ``None`` for CPU.
    """
    # Serial sampler: 4 time-steps x 1 environment per iteration, with a
    # 10-env evaluation phase capped at 10k steps / 5 trajectories.
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    runner = MinibatchRlEval(
        algo=DQN(min_steps_learn=1e3),  # otherwise default DQN settings
        agent=AtariDqnAgent(),
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    # Log under the Polyaxon run's outputs directory.
    with logger_context(get_outputs_path(), run_ID, "dqn_" + game,
                        dict(game=game), snapshot_mode="last"):
        runner.train()
def test(experiment, model, test_loader, cuda):
    """Evaluate *model* on *test_loader*, log loss/accuracy, and save the
    model weights into the Polyaxon outputs directory.

    Args:
        experiment: Polyaxon ``Experiment`` used for metric logging.
        model: torch module producing log-probabilities (NLL-loss compatible).
        test_loader: DataLoader over the test set.
        cuda: move batches to GPU when True.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # torch.no_grad() replaces the deprecated Variable(..., requires_grad=False)
    # wrappers and avoids building the autograd graph during evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # reduction='sum' is the modern spelling of the removed
            # size_average=False argument: accumulate the summed batch loss.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability is the predicted class.
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).cpu().sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = correct / len(test_loader.dataset)
    logging.info(
        'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), 100. * accuracy))
    # Persist weights where Polyaxon collects run outputs.
    output_path = get_outputs_path()
    model_path = os.path.join(output_path, "model.dat")
    torch.save(model.state_dict(), model_path)
    experiment.log_metrics(loss=test_loss, accuracy=accuracy)
def run_experiment(data_path, glove_path):
    """Train the Twitter bot model end-to-end and save it to the Polyaxon
    outputs directory as ``trump_bot.h5``.

    Args:
        data_path: path to the raw tweet data.
        glove_path: path to pretrained GloVe embeddings.
    """
    log_level = get_log_level() or logging.INFO
    logger.info("Starting experiment")
    experiment = Experiment()
    logging.basicConfig(level=log_level)

    logging.info("Loading data")
    pipeline = Datapipeline(data_path=data_path)
    pipeline.transform()
    train, val = pipeline.split_data()
    logging.info("Data loaded")

    bot = twitter_model(glove_path=glove_path)
    bot.build_model(train.values)
    bot.get_train_data(train.values)
    trained = bot.train()

    # Persist the trained network where Polyaxon collects run outputs.
    trained.save(os.path.join(get_outputs_path(), "trump_bot.h5"))
    logger.info("Experiment completed")
def get_output_path(alternative):
    """Resolve an output directory, preferring the Polyaxon outputs path.

    Off-cluster, *alternative* is returned untouched. On-cluster, the
    Polyaxon outputs path is used (falling back to *alternative* when it is
    None) and the directory is created if it does not exist yet.
    """
    if not is_in_cluster():
        return alternative
    candidate = get_outputs_path()
    output_path = alternative if candidate is None else candidate
    # Make sure the directory exists before callers write into it.
    if not tf.gfile.Exists(output_path):
        tf.gfile.MakeDirs(output_path)
    return output_path
def define_prepare_tb_path():
    """Return the TensorBoard scalar-log directory.

    Under Polyaxon the base path is nested inside the run's outputs
    directory; locally it is suffixed with a timestamp instead.
    """
    base = os.path.join(".", "tf_logs", "scalars")  # ".\\tf_logs\\scalars\\"
    outputs_path = get_outputs_path()
    if outputs_path is not None:
        # polyaxon behavior
        return outputs_path + "/" + base
    # local behavior
    return base + datetime.now().strftime("%Y%m%d-%H%M%S")
def main():
    """Parse CLI arguments, set seeds, build tasks/model/trainer from a
    parameter file, and launch training.

    Side effects: creates the serialization directory, writes ``config.json``
    into it, and seeds numpy and torch globally.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--serialization_dir', type=str,
                        help='The directory where to save trained models, etc.')
    parser.add_argument('--params', type=str,
                        help='path to the parameter file describing the tasks to train.')
    parser.add_argument('--seed', type=int, default=1,
                        help='The random seed to use for the initialization of PyTorch and numpy.')
    # NOTE(review): --recover is parsed but never read in this function —
    # presumably consumed elsewhere; confirm.
    parser.add_argument('--recover', action='store_true',
                        help='Recover from a previous experiment?')
    args = parser.parse_args()

    # Import user defined modules
    utils.import_user_module(args)

    # If we are in polyaxon, redirect serialization to the run's outputs path
    if IN_CLUSTER:
        args.serialization_dir = get_outputs_path()

    # Set the random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Read the parameter file
    params = Params.from_file(args.params)
    serialization_dir = args.serialization_dir

    # Create the serialization directory
    create_serialization_dir(serialization_dir)

    # Write the parameter file to the output directory for reproducibility
    with open(os.path.join(serialization_dir, 'config.json'), 'w') as fout:
        json.dump(deepcopy(params).as_dict(quiet=True), fout, indent=2)

    # Build the tasks and shared vocabulary from the parameters
    tasks, vocab = tasks_and_vocab_from_params(
        params=params, serialization_dir=serialization_dir)

    # Create the model
    model_params = params.pop("model")
    model = BaseFairseqModel.from_params(vocab=vocab, params=model_params)
    LOGGER.info("created model")
    print("created model: {}".format(model))

    # Finally, create an instance of the required trainer
    trainer_params = params.pop("trainer")
    # TODO(naetherm): Dependent on the trainer type ...
    trainer = BaseTrainer.from_params(model=model,
                                      task_list=tasks,
                                      serialization_dir=serialization_dir,
                                      params=trainer_params)

    # Everything is set up, start the training
    train(trainer)
def main(argv=None):
    """Run the CartPole client under Polyaxon tracking and log its score.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]`` read at
            call time.
    """
    # BUG FIX: the original default `argv=sys.argv[1:]` was evaluated once at
    # import time and then mutated with .extend(), so repeated calls (or a
    # caller-supplied list) accumulated '-f' flags. Copy at call time instead.
    args = list(sys.argv[1:] if argv is None else argv)
    experiment = Experiment()
    # Point the client's output flag at the Polyaxon outputs directory.
    args.extend(['-f', get_outputs_path()])
    cartpole_client.main(args)
    experiment.log_metrics(score=cartpole_client.RESULTS[0]['score'])
def __init__(self, param):
    """Configure the lung-segmentation training run.

    Args:
        param: 3-tuple ``(loss_function, network, routing_type)``.
    """
    super().__init__()
    # polyaxon: dataset root from the first mounted data path, outputs from the run
    data_dir = os.path.join(
        list(get_data_paths().values())[0], "lung/JSRT/preprocessed/")
    logging.info('DATA DIR = ' + data_dir)
    output_path = get_outputs_path()
    self.loss_function = param[0]
    self.network = param[1]
    self.routing_type = param[2]
    self.batch_size = 1
    self.learning_rates = [1, 1]
    self.max_iter = 300000
    self.test_iter = 10000
    self.disp_iter = 100
    self.snapshot_iter = self.test_iter
    self.test_initialization = False
    self.current_iter = 0
    self.num_labels = 6
    self.data_format = 'channels_first'  # WARNING: Capsule might not work with channel last!
    self.channel_axis = 1
    self.save_debug_images = False
    self.base_folder = data_dir  # input folder
    self.image_size = [128, 128]
    self.image_spacing = [1, 1]
    # output save location: <outputs>/<network name>_<timestamp>
    self.output_folder = output_path + self.network.__name__ + '_' + \
        self.output_folder_timestamp()
    self.dataset = Dataset(image_size=self.image_size,
                           image_spacing=self.image_spacing,
                           num_labels=self.num_labels,
                           base_folder=self.base_folder,
                           data_format=self.data_format,
                           save_debug_images=self.save_debug_images)
    self.dataset_train = self.dataset.dataset_train()
    self.dataset_train.get_next()
    self.dataset_val = self.dataset.dataset_val()
    self.dice_names = list(
        map(lambda x: 'dice_{}'.format(x), range(self.num_labels)))
    self.additional_summaries_placeholders_val = dict([
        (name, create_summary_placeholder(name)) for name in self.dice_names
    ])
    # BUG FIX: the original used `is` to compare __name__ with string
    # literals; identity of equal strings is an interning accident, not a
    # guarantee. Use `==` for string equality.
    if self.network.__name__ == 'network_ud':
        self.net_file = './Lung_Segmentation/LungSeg/cnn_network.py'
    elif self.network.__name__ == 'SegCaps_multilabels':
        self.net_file = './Lung_Segmentation/LungSeg/SegCaps/SegCaps.py'
    else:
        self.net_file = './Lung_Segmentation/LungSeg/capsule_network.py'
    self.files_to_copy = ['main_train_and_test.py', self.net_file]
def main(args): """ Runs dataLayer processing scripts to turn raw dataLayer from (../raw) into cleaned dataLayer ready to be analyzed (saved in ../processed). """ ## Talk to Rune about how dataLayer is handle. config = TrainingConfig() config = update_config(args, config) ## For polyaxon if config.run_polyaxon: input_root_path = Path(get_data_paths()['data']) #'data' output_root_path = Path(get_outputs_path()) inpainting_data_path = input_root_path / 'inpainting' os.environ['TORCH_HOME'] = str(input_root_path / 'pytorch_cache') config.data_path = inpainting_data_path config.output_path = output_root_path config.polyaxon_experiment = Experiment() pathToData = str(input_root_path / '/workspace/data_landset8/testImages') else: pathToData = Path(r"C:\Users\Morten From\PycharmProjects\testDAta") logger = logging.getLogger(__name__) logger.info('making final dataLayer set from raw dataLayer') logger.info(pathToData) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") B_size = 1 beta_test_path_list = glob(str(pathToData) + "/*/") ImageDict = get_dataset(beta_test_path_list, batch_size=B_size) train = ImageDict['train_dataloader'] test = ImageDict['test_dataloader'] genPath = r'C:\Users\Morten From\PycharmProjects\Speciale\Master_Satelite_Image_Inpainting\models\New_400.pth' outputPathImages = Path( r'C:\Users\Morten From\PycharmProjects\Speciale\Master_Satelite_Image_Inpainting\images' ) testGen = UnetGenerator(3, 3, 8) testGen.load_state_dict(torch.load(genPath)) testGen = testGen.to(device) testGen.eval() iterater = 0 for real, SAR in tqdm(train, position=0, leave=True, disable=True): batchOfImages = real.to(device) batchOfImagesSAR = SAR.to(device) outputs = testGen(batchOfImagesSAR) modelHelper.save_tensor_batchSAR( batchOfImages, batchOfImagesSAR, outputs, B_size, Path.joinpath(outputPathImages, 'iter' + str(iterater))) iterater = iterater + 1
def get_polyaxon_resume_file(models_dir_name):
    """Locate the newest resumable ``.pth`` checkpoint in the outputs dir.

    Expected layout: ``<outputs>/<models_dir_name>/<experiments dir>/<runs>``.
    ``model_best.pth`` is deliberately excluded.

    Args:
        models_dir_name: sub-directory of the Polyaxon outputs path.

    Returns:
        Path of the checkpoint file to resume from.
    """
    models_path = Path(get_outputs_path()) / models_dir_name
    # Single experiments directory is assumed under models_path.
    experiments_root = [d for d in models_path.iterdir() if d.is_dir()][0]
    # Run-directory names sort lexicographically; newest first.
    runs = sorted((d for d in experiments_root.iterdir() if d.is_dir()),
                  reverse=True)
    latest_run = runs[0]
    # First suffix check (not .suffix) keeps e.g. "ckpt.pth.tar" matching.
    candidates = [
        f for f in latest_run.iterdir()
        if f.suffixes[0] == ".pth" and f.name != "model_best.pth"
    ]
    return candidates[0]
def run(config, logger=None, local_rank=0, **kwargs):
    """Distributed (NCCL) train/val entry point driven by py_config_runner.

    Initializes the process group, finalizes the manually-loaded config,
    logs parameters to Polyaxon from rank 0 only, runs ``training`` and
    tears the process group down on exit or failure.
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert (torch.backends.cudnn.enabled
            ), "Nvidia/Amp requires cudnn backend to be enabled."
    dist.init_process_group("nccl", init_method="env://")
    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")
    config = config.setup()
    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(
        config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(
        config.script_filepath, Path)
    # All artifacts go to the Polyaxon run's outputs directory.
    config.output_path = Path(get_outputs_path())
    # Only rank 0 talks to the Polyaxon tracking API.
    if dist.get_rank() == 0:
        plx_exp = Experiment()
        plx_exp.log_params(
            **{
                "pytorch version": torch.__version__,
                "ignite version": ignite.__version__,
            })
        plx_exp.log_params(**get_params(config, TRAINVAL_CONFIG))
    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=False,
            with_plx_logging=True,
        )
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        # Tear down the process group before propagating the failure.
        dist.destroy_process_group()
        raise e
    dist.destroy_process_group()
def main(args):
    """Train a text classifier on CSV data and persist it with joblib.

    Args:
        args: parsed CLI namespace (data, columns, delimiter, use_hashing,
            features, sample_size, algorithm).

    Raises:
        ValueError: if ``args.algorithm`` is not 'logistic', 'pagressive'
            or 'sgd'.
    """
    print("args: " + str(args))
    print("preparing data ...")
    trainX, trainY = utils.getDataFromCsv(args.data,
                                          columns=args.columns,
                                          delimiter=args.delimiter)
    print("loaded {} records".format(len(trainX)))
    # create the transform
    xtrain, _ = utils.getVectorizer(trainX,
                                    useHashing=args.use_hashing,
                                    features=args.features)
    n_samples = xtrain.shape[0]
    # sample_size is a percentage of the dataset used for training.
    train_len = (n_samples // 100) * args.sample_size
    print("training samples: " + str(train_len))
    if args.algorithm == "logistic":
        clf = LogisticRegression(random_state=0,
                                 solver="lbfgs",
                                 multi_class='multinomial',
                                 max_iter=2000,
                                 verbose=0)
    elif args.algorithm == "pagressive":
        clf = PassiveAggressiveClassifier(max_iter=50, tol=1e-3)
    elif args.algorithm == "sgd":
        # BUG FIX: sklearn expects the lowercase 'l2' penalty name;
        # penalty="L2" raised ValueError at construction/fit time.
        clf = SGDClassifier(alpha=.0001, max_iter=50, penalty="l2")
    else:
        # Previously an unknown algorithm crashed later with NameError on clf.
        raise ValueError("unknown algorithm: {}".format(args.algorithm))
    clf.fit(xtrain[:train_len], trainY[:train_len])
    print("predicting ...")
    predicted = clf.predict(xtrain[train_len:])
    expected = trainY[train_len:]
    print("Classification report for classifier %s:\n%s\n" %
          (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" %
          metrics.confusion_matrix(expected, predicted))
    # Polyaxon: prefer the run's outputs path; fall back to CWD locally.
    try:
        output_path = os.path.join(get_outputs_path(), "model.joblib")
    except Exception:  # narrowed from bare except: don't mask SystemExit etc.
        output_path = "model.joblib"
    print("dumping model parameters into '{}'".format(output_path))
    joblib.dump(clf, output_path)
def define_prepare_mdl_path(plx):
    """Build (creating it if needed) the checkpoint directory and return the
    trained / untrained model file paths.

    Args:
        plx: mapping providing 'mdl_architecture' and 'eng_kind'.

    Returns:
        Tuple ``(file_path_mdl, file_path_raw_mdl)``.
    """
    chkpt_dir = "mdl_chkpts/"
    outputs_path = get_outputs_path()
    # Under Polyaxon, nest the checkpoint dir inside the run's outputs path.
    if outputs_path is not None:
        chkpt_dir = outputs_path + "/" + chkpt_dir
    if not os.path.exists(chkpt_dir):
        try:
            os.mkdir(chkpt_dir)
        except OSError:
            print("Creation of the directory %s failed" % chkpt_dir)
        else:
            print("Successfully created the directory %s " % chkpt_dir)
    arch = plx.get('mdl_architecture')
    file_path_mdl = chkpt_dir + arch + '_' + plx.get('eng_kind') + ".hdf5"
    # Untrained (raw) model path, added per @sp.
    file_path_raw_mdl = chkpt_dir + arch + '_' + 'untrained' + ".hdf5"
    return file_path_mdl, file_path_raw_mdl
def get_callbacks(model_type):
    """Assemble Keras training callbacks.

    Checkpointing goes to the Polyaxon outputs directory; learning rate is
    driven by an epoch schedule plus plateau-based reduction.
    """
    # Checkpoint file name embeds the model type and the epoch number.
    model_name = 'cifar10_%s_model.{epoch:03d}.h5' % model_type
    checkpoint = ModelCheckpoint(
        filepath=os.path.join(get_outputs_path(), model_name),
        monitor='val_acc',
        verbose=1,
        save_best_only=True)
    # Learning rate adjustment
    lr_scheduler = LearningRateScheduler(lr_schedule)
    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                                   cooldown=0,
                                   patience=5,
                                   min_lr=0.5e-6)
    return [checkpoint, lr_reducer, lr_scheduler]
def main(args):
    """Resolve model/output paths (Polyaxon or local) and run evaluation."""
    config = update_config(args, TrainingConfig())
    logger = logging.getLogger(__name__)
    if config.run_polyaxon:
        # On-cluster: data from the mounted 'data' path, results to the
        # run's outputs path.
        input_root = Path(get_data_paths()['data'])
        inpainting_root = input_root / 'inpainting'
        os.environ['TORCH_HOME'] = str(input_root / 'pytorch_cache')
        config.data_path = inpainting_root
        config.output_path = Path(get_outputs_path())
        model_out_dir = Path.joinpath(inpainting_root / 'models',
                                      'OutputModels')
        stores_dir = config.output_path / 'data' / 'storedData'
    else:
        local_root = Path().absolute().parent
        model_out_dir = Path.joinpath(local_root, 'OutputModels')
        stores_dir = local_root / 'data' / 'storedData'
    # Import test data and evaluate.
    evaluator = eval_model(config)
    evaluator.run_eval(model_out_dir, stores_dir)
def train(experiment, max_features, maxlen, embedding_size, kernel_size,
          optimizer, filters, pool_size, lstm_output_size, log_learning_rate,
          batch_size, epochs):
    """Build, train and evaluate the CNN+LSTM sentiment model.

    Metrics stream to Polyaxon and checkpoints land in the outputs dir.

    Returns:
        Tuple ``(score, accuracy)`` from evaluation on the test split.
    """
    # Same architecture as before, declared as a single layer list.
    model = Sequential([
        Embedding(max_features, embedding_size, input_length=maxlen),
        Dropout(0.25),
        Conv1D(filters,
               kernel_size,
               padding='valid',
               activation='relu',
               strides=1),
        MaxPooling1D(pool_size=pool_size),
        LSTM(lstm_output_size),
        Dense(1),
        Activation('sigmoid'),
    ])
    model.compile(OPTIMIZERS[optimizer](lr=10 ** log_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              callbacks=[
                  PolyaxonKeras(experiment=experiment),
                  ModelCheckpoint(get_outputs_path() + '/model')
              ])
    score, accuracy = model.evaluate(x_test, y_test, batch_size=batch_size)
    return score, accuracy
def _plx_get_output_path():
    """Return the Polyaxon run's outputs path.

    The import is deferred so polyaxon_client stays an optional dependency.
    """
    from polyaxon_client import tracking
    return tracking.get_outputs_path()
def main():
    """Train an MNIST classifier, resuming model weights and epoch counter
    from any checkpoint found in the Polyaxon outputs directory."""
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for training (default: 1000)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=15, metavar='N',
                        help='number of epochs to train (default: 9)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=42, metavar='S',
                        help='random seed (default: 42)')
    args = parser.parse_args()
    experiment = Experiment()
    logger = logging.getLogger('main')
    logger.setLevel(get_log_level())
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    logger.info('%s', device)
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('.', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('.', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)
    model = Net().to(device)
    # Checkpoint + epoch-state files live in the Polyaxon outputs dir so a
    # restarted run resumes where the previous one stopped.
    model_path = os.path.join(get_outputs_path(), 'model.p')
    state_path = os.path.join(get_outputs_path(), 'state.json')
    start = 1
    if os.path.isfile(model_path):
        model.load_state_dict(torch.load(model_path))
        logger.info('%s', 'Model Loaded')
    if os.path.isfile(state_path):
        with open(state_path, 'r') as f:
            data = json.load(f)
        start = data['epoch']
        logger.info('%s', 'State Loaded')
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    with SummaryWriter(log_dir=get_outputs_path()) as writer:
        for epoch in range(start, args.epochs + 1):
            train(epoch, writer, experiment, args, model, device,
                  train_loader, optimizer)
            test(epoch, writer, experiment, args, model, device, test_loader)
            # Persist weights and the last completed epoch after every epoch.
            torch.save(model.state_dict(), model_path)
            with open(state_path, 'w') as f:
                data = {'epoch': epoch}
                json.dump(data, f)
def __call__(self, parser, namespace, value, option_string=None):
    """argparse Action hook: when the chosen runtime environment is
    Polyaxon, point ``save_dir`` at the run's outputs path."""
    if value != RuntimeEnvironment.POLYAXON:
        return
    setattr(namespace, "save_dir", get_outputs_path())
# NOTE(review): incomplete fragment — the `else:` below belongs to a log-level
# check whose head is outside this chunk; kept byte-identical.
# Sets up TF logging, a Polyaxon experiment, SSD tfrecord/checkpoint paths
# (TRAIN_DIR = Polyaxon outputs path) and TF1-style flag definitions.
# presumably TensorFlow 1.x (tf.app.flags, tf.contrib) — confirm.
else: log_level = 'INFO' tf.logging.set_verbosity(log_level) set_logging(get_log_level()) experiment = Experiment() vm_paths = list(get_data_paths().values())[0] data_paths = "{}/SSD/tfrecords".format(vm_paths) checkpointpath = "{}/SSD.checkpoints/ssd_300_vgg.ckpt".format(vm_paths) TRAIN_DIR = get_outputs_path() slim = tf.contrib.slim DATA_FORMAT = 'NHWC' # =========================================================================== # # SSD Network flags. # =========================================================================== # tf.app.flags.DEFINE_float('loss_alpha', 1., 'Alpha parameter in the loss function.') tf.app.flags.DEFINE_float('negative_ratio', 3., 'Negative ratio in the loss function.') tf.app.flags.DEFINE_float('match_threshold', 0.5, 'Matching threshold in the loss function.')
# NOTE(review): incomplete fragment — enclosing scope (args, cluster,
# noisy_network) is defined outside this chunk; kept byte-identical.
# Resolves patient-data / model / tensorboard paths: Polyaxon data mounts and
# get_outputs_path() on-cluster, relative "./../Data" paths locally.
# NOTE(review): `datetime = datetime.now()` shadows the datetime class for
# the rest of the scope — flag for cleanup.
print('noisy_network: ' + str(noisy_network)) # ==================================================================================================# # PATHS SETUP # # ==================================================================================================# run_name = args.run_name if cluster: data_paths = get_data_paths() patient_path = data_paths[ 'data1'] + "/HHase_Robotic_RL/NAS_Sacrum_Scans/Patient_files/" patient_data_path = data_paths[ 'data1'] + "/HHase_Robotic_RL/NAS_Sacrum_Scans/" load_model_path = data_paths[ 'data1'] + "/HHase_Robotic_RL/Models/model_best.pth" output_path = get_outputs_path() tensorboard_path = get_outputs_path() experiment = Experiment() else: patient_path = "./../Data/Patient_files/" patient_data_path = "./../Data/" output_path = './' tensorboard_path = './runs/' load_model_path = "./../Data/pretrained_model/model_best.pth" model_save_path = output_path + "/models/{}.pt".format(run_name) #load_model_path = output_path + "/models/{}.pt".format(run_name) datetime = datetime.now() tensorboard_name = 'Nov' + datetime.strftime( "%d_%H-%M-%S") + '_Rachet-' + run_name
# NOTE(review): incomplete fragment — `arguments`, `activation`, `distributed`,
# `mnist` etc. are defined outside this chunk, and the final numpy_input_fn
# call is cut off mid-expression; kept byte-identical.
# Maps an activation name to a tf function, builds a tf.estimator.Estimator
# whose model_dir is the Polyaxon outputs path, then trains for
# num_iterations rounds of num_steps each before evaluation.
num_iterations = arguments.pop('num_iterations') if activation == 'relu': activation = tf.nn.relu elif activation == 'sigmoid': activation = tf.nn.sigmoid elif activation == 'linear': activation = None experiment = Experiment() if distributed: # Check if we need to export TF_CLUSTER experiment.get_tf_config() estimator = tf.estimator.Estimator(get_model_fn( learning_rate=learning_rate, dropout=dropout, activation=activation), model_dir=get_outputs_path()) # Train the Model input_fn = tf.estimator.inputs.numpy_input_fn( x={'images': mnist.train.images}, y=mnist.train.labels, batch_size=batch_size, num_epochs=num_epochs, shuffle=True) for i in range(num_iterations): estimator.train(input_fn, steps=num_steps) # Evaluate the Model input_fn = tf.estimator.inputs.numpy_input_fn( x={'images': mnist.test.images},
# NOTE(review): whitespace-mangled block kept byte-identical below (it spans
# several wrapped physical lines, including a triple-quoted banner whose
# original line breaks are unrecoverable) — reformat before further edits.
# train_net: full training loop for ExpandNet-style HDR reconstruction.
# - Resolves dataset/checkpoint dirs: Polyaxon data mounts + outputs path when
#   polyaxon=True, wk_dir-relative paths otherwise.
# - Splits ids into train/val (split_train_val), trains with Adagrad +
#   ExpandNetLoss, evaluates each epoch via eval_hdr_net, logs to
#   TensorBoard/Polyaxon when tb/polyaxon are set, and saves BestCP.pth when
#   val PSNR-HVS-M improves.
# - img_scale and the KFold object appear unused — TODO confirm.
# - Optionally pushes a Pushbullet note at the end (use_notifications=True).
def train_net(net, epochs=5, batch_size=1, lr=0.003, val_percent=0.20, loss_lambda=5, save_cp=True, gpu=False, img_scale=0.5, expositions_num=15, logg_freq=15, tb=False, w_decay=0.0005, use_notifications=False, polyaxon=False, outputs_path='checkpoints'): # === Localize training data =================================================== if polyaxon: data_paths = get_data_paths() dir_checkpoints = get_outputs_path() dataSets_dir = os.path.join(data_paths['data1'], 'eprado', 'USLDR-DataSet') #dataSets_dir = os.path.join(data_paths['data1'] , 'eprado', 'LDR_DataSet') else: dataSets_dir = os.path.join(wk_dir, "LDR_DataSet") dir_checkpoints = os.path.join(wk_dir, outputs_path) print('Dataset_dir', dataSets_dir) print('Outputs_path', dir_checkpoints) experiment_id = datetime.datetime.now().strftime('%d%m_%H%M_') experiment_name = 'ExpandnetL_psn_{}_bs{}_lr{}_exps{}'.format( experiment_id, batch_size, lr, expositions_num) dir_img = os.path.join(dataSets_dir, 'Org_images/') dir_compressions = os.path.join(dataSets_dir, 'c_images/') dir_mask = os.path.join(dataSets_dir, 'c_images/') #if tb: #dummy_input = torch.rand(1, 3, 128, 128) #writer.add_graph(net, (dummy_input,)) #writer.close() # === Load Training/Validation data ===================================================== ids = get_ids(dir_compressions) # Split into train test idsset = list(ids) kf = KFold(n_splits=5, shuffle=False) #print('Train splits: ',kf.get_n_splits(dataset)) best_psnr_m = 0 best_psnr_hvs = 0 #for train_index, test_index in kf.split(idsset): iddataset = split_train_val(idsset, expositions_num, val_percent) #test_set = [] #for im_id in test_index: # for e in range(expositions_num): # test_set.append(idsset[im_id]) N_train = len(iddataset['train']) N_val = len(iddataset['val']) N_test = 0 #len(test_set) #=====CHOOSE Loss Criterion============================================================= #criterion = nn.MSELoss(reduction='mean') criterion = ExpandNetLoss(loss_lambda=loss_lambda) optimizer = 
optim.Adagrad(net.parameters(), lr=lr, lr_decay=0.000001, weight_decay=w_decay) #optimizer = optim.SGD(net.parameters(), # lr=lr, # momentum=0.9, # weight_decay=0.0005) since = time.time() print(''' Training SETUP: Epochs: {0:} Batch size: {1:} Optimizer: Adagrad Learning rate: {2:} Weight decay: {3:} Training size: {4:} Validation size: {5:} Test size: {6:} Checkpoints: {7:} CUDA: {8:} '''.format(epochs, batch_size, lr, w_decay, N_train, N_val, N_test, str(save_cp), str(gpu))) train_dataset = HdrDataset(iddataset['train'], dir_compressions, dir_mask, expositions_num) val_dataset = HdrDataset(iddataset['val'], dir_compressions, dir_mask, expositions_num) #test_dataset = HdrDataset(test_set, dir_compressions, dir_mask,expositions_num) train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False) val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False) #test_data_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=True) best_hvsm = 0.0 global_psnr_m = [] global_psnr_hvs = [] for epoch in range(epochs): print('\n') print('{}{}{}'.format('+', '=' * 78, '+')) print('| Starting epoch {}/{}. 
{}'.format(epoch + 1, epochs, (' ' * 57) + '|')) print('{}{}{}'.format('|', '-' * 78, '|')) begin_of_epoch = time.time() tot_steps = math.trunc(N_train / batch_size) net.train() train_loss = 0 losses = [] val_loss = 0 step = 0 train_sample = [] train_acc = 0 val_hvsm = 0 val_hvs = 0 model_pnsr_m = 0 for i, b in enumerate(train_data_loader): step += 1 imgs, true_masks, imgs_ids = b['input'], b['target'], b['id'] #print(i, b['input'].size(), b['target'].size()) #input: [15, 3, 224, 224]), target: [15, 3, 224, 224] #print('>>>>>>> Input max: ' , torch.max(imgs[0])) #print('>>>>>>> mask max : ', torch.max(true_masks[0])) if gpu: imgs = imgs.cuda() true_masks = true_masks.cuda() else: print(' GPU not available') # Predicted mask images optimizer.zero_grad() prediction = net(imgs) #prediction shape: [B, 3, 224, 224] #cost, cost_input_output = Hdr_loss(imgs, true_masks, prediction, sep_loss=False, gpu=gpu, tb=tb) cost = criterion(prediction, true_masks) #loss is torch tensor losses.append(cost.item()) train_loss = np.mean(losses) cost.backward() optimizer.step() if step == 1 or step % logg_freq == 0: #print('| Step: {0:}, cost:{1:}, Train Loss:{2:.9f}, Train Acc:{3:.9f}'.format(step,cost, train_loss,train_acc/step)) print('| Step: {0:}, cost:{1:}, Train Loss:{2:.9f}'.format( step, cost, train_loss)) #Last Step of this Epoch if step == math.trunc(tot_steps): num_in_batch = random.randrange(imgs.size(0)) train_sample_name = imgs_ids[num_in_batch] train_sample = [ imgs[num_in_batch], true_masks[num_in_batch], prediction[num_in_batch] ] t_exp_name = 'Train_' + experiment_name saveTocheckpoint(dir_checkpoints, t_exp_name, train_sample_name, epoch, train_sample[0], train_sample[1], train_sample[2]) if tb: print( '| saving train step {0:} sample : input,target & pred' .format(step)) grid = torchvision.utils.make_grid(train_sample, nrow=3) writer.add_image('train_sample', grid, 0) #if epoch == 1 or epoch % 15 == 0 or epoch == epochs: val_loss, val_hvsm, val_hvs = 
eval_hdr_net(net, dir_checkpoints, experiment_name, val_data_loader, criterion, epoch, gpu, batch_size, expositions_num=15, tb=tb) if tb: writer.add_scalar('training_loss: ', train_loss, epoch) writer.add_scalar('validation_loss', val_loss, epoch) writer.add_scalar('val_hvsm', val_hvsm, epoch) writer.add_scalar('val_hvs', val_hvs, epoch) writer.add_scalars('losses', { 'training_loss': train_loss, 'val_loss': val_loss }, epoch) if polyaxon: experiment.log_metrics(step=epoch, training_loss=train_loss, validation_loss=val_loss, val_hvsm=val_hvsm, val_hvs=val_hvs) print('{}{}{}'.format('+', '=' * 78, '+')) print('| {0:} Epoch {1:} finished ! {2:}|'.format( ' ' * 28, (epoch + 1), ' ' * 29)) print('{}{}{}'.format('+', '-' * 78, '+')) print('| Summary: Train Loss: {0:0.07}, Val Loss:{1:}'.format( train_loss, val_loss)) print('| Avrg psnr-hvs_m :{0:0.04},Avrg psnr-hvs :{1:0.04}'. format(val_hvsm, val_hvs)) time_epoch = time.time() - begin_of_epoch print('| Epoch ETC: {:.0f}m {:.0f}s'.format(time_epoch // 60, time_epoch % 60)) print('{}{}{}'.format('+', '=' * 78, '+')) if save_cp and (val_hvsm > best_hvsm): best_hvsm = val_hvsm model_path = os.path.join(dir_checkpoints, 'BestCP.pth') torch.save(net.state_dict(), model_path) print('Checkpoint saved !') global_psnr_hvs.append(val_hvs) global_psnr_m.append(val_hvsm) ''' test_psnr_m, test_psnr_hvs = test_hdr_net(model_path,dir_checkpoints, experiment_name, test_data_loader, criterion,gpu,tb) if save_cp and (test_psnr_m > best_psnr_m): best_psnr_m = test_psnr_m best_model_path = os.path.join(dir_checkpoints, 'Best_CP.pth') torch.save(net.state_dict(),best_model_path) print('Best model saved !') ''' print('>' * 80) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Final Average psnr-hvs_m: {:.0f}, psnr-hvs: {:.0f}'.format( np.mean(global_psnr_m), np.mean(global_psnr_hvs))) if tb: writer.close() if use_notifications: end_msg = "train.py finished 
at: {}(".format( str(datetime.datetime.now())) push = pb.push_note("usHDR: Finish", end_msg)
# NOTE(review): incomplete fragment — the first tokens are the tail of an
# import statement whose head is outside this chunk; kept byte-identical.
# Module-level setup: tries torch.utils.tensorboard first, then falls back
# to tensorboardX; under Polyaxon the writer logs into get_outputs_path().
# NOTE(review): the inner `except ImportError` after get_outputs_path() will
# not catch failures from get_outputs_path()/Experiment() themselves unless
# they raise ImportError — verify the intended fallback.
tone_map, create_tmo_param_from_args, ) try: print('Loading Tensorboard') from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() print('Using Tensorboard in train.py') except ImportError: print('Counld not Import module Tensorboard') try: print('loading tensorboard X') from tensorboardX import SummaryWriter try: outputs_path = get_outputs_path() writer = SummaryWriter(outputs_path) experiment = Experiment() print('Using Tensorboard X') except ImportError: writer = SummaryWriter() print('Using Tensorboard X') except ImportError: print('Could not import TensorboardX') # Setup date/ time currentDT = datetime.datetime.now() # FLAGS # === Settings =================================================================
def run(config, logger):
    """Collect per-batch class-distribution statistics over the train and
    validation loaders, logging them to Polyaxon and TensorBoard.

    One ignite Engine pass per loader; per-iteration outputs (class
    distribution, cumulative class presence, class count) are streamed to a
    TensorboardLogger rooted at the Polyaxon outputs path.
    """
    plx_logger = PolyaxonLogger()
    set_seed(config.seed)
    plx_logger.log_params(**{
        "seed": config.seed,
        "batch_size": config.batch_size,
        "pytorch version": torch.__version__,
        "ignite version": ignite.__version__,
        "cuda version": torch.version.cuda
    })
    device = config.device
    non_blocking = config.non_blocking
    prepare_batch = config.prepare_batch

    def stats_collect_function(engine, batch):
        # Per-batch class statistics; class_presence accumulates +1 for
        # classes seen in the batch and -1 for classes absent from it.
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_ohe = to_onehot(y.reshape(-1), config.num_classes)
        class_distrib = y_ohe.mean(dim=0).cpu()
        class_presence = (class_distrib > 1e-3).cpu().float()
        num_classes = (class_distrib > 1e-3).sum().item()
        engine.state.class_presence += class_presence
        engine.state.class_presence -= (1 - class_presence)
        return {
            "class_distrib": class_distrib,
            "class_presence": engine.state.class_presence,
            "num_classes": num_classes
        }

    stats_collector = Engine(stats_collect_function)
    ProgressBar(persist=True).attach(stats_collector)

    @stats_collector.on(Events.STARTED)
    def init_vars(engine):
        # Running per-class presence counter, reset at engine start.
        engine.state.class_presence = torch.zeros(config.num_classes)

    # TensorBoard logs go to the Polyaxon outputs path, or ./output locally.
    log_dir = get_outputs_path()
    if log_dir is None:
        log_dir = "output"
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_handler = tb_output_handler(tag="training", output_transform=lambda x: x)
    tb_logger.attach(stats_collector,
                     log_handler=tb_handler,
                     event_name=Events.ITERATION_COMPLETED)
    stats_collector.run(config.train_loader, max_epochs=1)
    # Swap the "training" handler for a "validation" one and rerun.
    remove_handler(stats_collector, tb_handler, Events.ITERATION_COMPLETED)
    tb_logger.attach(stats_collector,
                     log_handler=tb_output_handler(tag="validation",
                                                   output_transform=lambda x: x),
                     event_name=Events.ITERATION_COMPLETED)
    stats_collector.run(config.val_loader, max_epochs=1)
def main(config):
    """Fine-tune a pose-estimation ResNet on heatmap targets.

    Sets up logging (locally or under Polyaxon), loads train/validation
    loaders, trains the deconv/final layers of a pretrained pose ResNet with
    MSE loss, logs to Tensorboard, and checkpoints the best model (by
    validation accuracy) plus periodic snapshots every 250 epochs.

    :param config: experiment configuration with TRAIN/DATASET/MODEL sections
    """
    logging.basicConfig(level=logging.INFO)
    logging.info("STARTING PROGRAM")
    if config.TRAIN.POLYAXON:
        from polyaxon_client.tracking import Experiment, get_data_paths, get_outputs_path
        data_dir = get_data_paths()
        config.DATASET.OUTPUT_PATH = get_outputs_path()
        config.DATASET.PATH = os.path.join(data_dir['data1'],
                                           config.DATASET.PATH_NAS)
        model_path = os.path.join(data_dir['data1'],
                                  config.MODEL.PRETRAINED_NAS)
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        logger.addHandler(
            logging.FileHandler(
                os.path.join(config.DATASET.OUTPUT_PATH,
                             'Heatmaps_from_human_joints.log')))
        # Polyaxon experiment handle; kept for its tracking side effects.
        experiment = Experiment()
    else:
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        logger.addHandler(
            logging.FileHandler(
                os.path.join(config.DATASET.OUTPUT_PATH,
                             'Heatmaps_Resnet101.log')))
        model_path = config.MODEL.PRETRAINED
    trainloader, valloader = utils.load_split_train_val(
        config.DATASET.PATH, "train", "validation", config)
    print('batch size', config.TRAIN.BATCH_SIZE)
    print('dataset', config.DATASET.PATH_NAS)
    print("weights", config.TRAIN.UPDATE_WEIGHTS)
    print("Model: ", model_path)
    print("LR: ", config.TRAIN.LR)
    model = utils.model_pose_resnet.get_pose_net(model_path, is_train=True)
    # NOTE(review): the model stays in eval() mode for the whole run, which
    # freezes batch-norm/dropout behavior while weights are updated — confirm
    # that this is intentional.
    model.eval()
    # Freeze the backbone unless UPDATE_WEIGHTS is set; always train the
    # deconv and final heatmap layers.
    for name, parameter in model.named_parameters():
        parameter.requires_grad = config.TRAIN.UPDATE_WEIGHTS
        if "deconv" in name or "final" in name:
            parameter.requires_grad = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = optim.Adam(model.parameters(), lr=config.TRAIN.LR)
    model.to(device)
    # Decay LR by a factor of 0.1 every 3 epochs
    # TODO(review): exp_lr_scheduler.step() is never called, so this schedule
    # currently has no effect — call it once per epoch if decay is wanted.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.01)
    writer = SummaryWriter(config.DATASET.OUTPUT_PATH)
    best_acc = 0
    # MSELoss is stateless: create it once instead of re-instantiating it
    # every epoch and again every batch (as the original code did).
    criterion = nn.MSELoss()
    for epoch in range(config.TRAIN.END_EPOCH):
        logger.info('Epoch {}/{}'.format(epoch, config.TRAIN.END_EPOCH - 1))
        logger.info('-' * 10)
        acc = utils.AverageMeter()
        batch_loss = utils.AverageMeter()
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)
            # print(summary(model, tuple(inputs.size())[1:]))
            logps = model(inputs)
            loss = criterion(logps, labels.float())
            batch_loss.update(loss.item(), inputs.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, avg_acc, cnt, pred, target, dists = utils.accuracy(
                logps.detach().cpu().numpy(),
                labels.detach().cpu().numpy(),
                thr=config.TRAIN.THRESHOLD)
            print("Current batch accuracy: ", avg_acc)
            acc.update(avg_acc, cnt)
            print("Batch {} train accuracy: {}, loss: {}".format(
                i, acc.avg, batch_loss.avg))
        writer.add_scalar('Loss/train', float(batch_loss.avg), epoch)
        val_acc = run_val(model, valloader, device, criterion, writer, epoch,
                          config)
        logger.info(
            'Train Loss: {:.4f} Train Acc: {:.4f} Val Acc: {:.4f}'.format(
                batch_loss.avg, acc.avg, val_acc))
        # Keep the checkpoint with the best validation accuracy seen so far.
        if val_acc > best_acc:
            best_acc = val_acc
            logging.info("best val at epoch: " + str(epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': batch_loss.avg,
                }, os.path.join(config.DATASET.OUTPUT_PATH, "best_model.pt"))
        # Periodic snapshot every 250 epochs (includes epoch 0).
        if epoch % 250 == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': batch_loss.avg,
                },
                os.path.join(config.DATASET.OUTPUT_PATH,
                             "model" + str(epoch) + ".pt"))
    logger.info('Best val Acc: {:4f}'.format(best_acc))
# Batch-norm stability constant.
parser.add_argument('--batch-norm-epsilon',
                    type=float,
                    default=1e-5,
                    help='Epsilon for batch norm.')
args = parser.parse_args()

# Validate mutually-dependent CLI arguments before launching training.
if args.num_gpus < 0:
    raise ValueError(
        'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.'
    )
if args.num_gpus == 0 and args.variable_strategy == 'GPU':
    # Fixed: the two adjacent literals previously concatenated to
    # "...Set--variable-strategy=CPU." (missing separating space).
    raise ValueError(
        'num-gpus=0, CPU must be used as parameter server. Set '
        '--variable-strategy=CPU.')
# ResNet depth must be 6n+2 for the residual-block layout.
if (args.num_layers - 2) % 6 != 0:
    raise ValueError('Invalid --num-layers parameter.')
if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
    raise ValueError('--train-batch-size must be multiple of --num-gpus.')
if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
    raise ValueError('--eval-batch-size must be multiple of --num-gpus.')

# Polyaxon
# Resolve the mounted data root; the CIFAR-10 records live beneath it.
data_dir = os.path.join(
    list(get_data_paths().values())[0], 'cifar-10-data')

# We create data for the project if it does not exists
if not os.path.exists(os.path.join(data_dir, 'train.tfrecords')):
    generate_data(data_dir)

# Polyaxon
train(job_dir=get_outputs_path(), data_dir=data_dir, **vars(args))
type=str, dest="atlas_dir", default='/data/PMSD_voxelmorph/atlas128/')
# Network variant: selects the vm1 or vm2 decoder widths.
parser.add_argument("--model",
                    type=str,
                    dest="model",
                    choices=['vm1', 'vm2'],
                    default='vm2',
                    help="voxelmorph 1 or 2")
# Checkpoint holding the trained weights to evaluate.
parser.add_argument("--init_model_file",
                    type=str,
                    dest="init_model_file",
                    default='/outputs/agrund/PMSD_voxelmorph/1000.ckpt',
                    help="model weight file")
# NOTE(review): get_outputs_path() is evaluated once, at parser-definition
# time; outside Polyaxon it may be None — confirm test() handles that.
parser.add_argument("--saveDir",
                    type=str,
                    dest="saveDir",
                    default=get_outputs_path())
# Number of validation volumes split off from the dataset.
parser.add_argument("--nr_val_data", type=int, dest="nr_val_data", default=4)

# parse_known_args tolerates unrecognized CLI flags instead of erroring.
args, unknown = parser.parse_known_args()
test(**vars(args))
def main(args):
    """ Runs dataLayer processing scripts to turn raw dataLayer from (../raw) into cleaned dataLayer ready
    to be analyzed (saved in ../processed).

    Builds the training configuration, resolves data/output paths (Polyaxon
    or local), trains the selected inpainting GAN, and evaluates the
    resulting model.
    """
    ## TODO: talk to Rune about how dataLayer is handled.
    config = TrainingConfig()
    config = update_config(args, config)
    ## For polyaxon
    if config.run_polyaxon:
        input_root_path = Path(get_data_paths()['data'])  #'data'
        output_root_path = Path(get_outputs_path())
        inpainting_data_path = input_root_path / 'inpainting'
        # Cache pretrained torch weights on the mounted data volume.
        os.environ['TORCH_HOME'] = str(input_root_path / 'pytorch_cache')
        config.data_path = inpainting_data_path
        config.output_path = output_root_path
        config.polyaxon_experiment = Experiment()
        # NOTE(review): joining a Path with an *absolute* right-hand operand
        # discards the left side entirely (pathlib semantics), so this is just
        # '/workspace/data_landset8/testImages' — confirm that is intended.
        pathToData = str(input_root_path / '/workspace/data_landset8/testImages')
    else:
        pathToData = Path(r"C:\Users\Morten From\PycharmProjects\testDAta")
    # Unused below; kept as a reference to a sample test scene.
    testPathData = Path(
        r'/workspace/data_landset8/unzipped/GrassCrops/BC/LC81820302014180LGN00'
    )
    #S1A_20201005_034656_DSC_109_RGBsar_cog.tif
    #S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B02_cog
    #S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B03_cog.tif
    #S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B04_cog.tif
    logger = logging.getLogger(__name__)
    logger.info('making final dataLayer set from raw dataLayer')
    logger.info(pathToData)
    ImageDict = get_dataset(pathToData, batch_size=config.batch_size)
    train = ImageDict['train_dataloader']
    test = ImageDict['test_dataloader']
    # Both model variants currently run through the WGAN training loop.
    # NOTE(review): generator/discriminator/criticWgan come from module
    # scope; if config.model_name matches neither branch, local_model_path
    # is never bound and run_eval below raises NameError — confirm.
    if config.model_name == 'PartialConvolutions':
        curtraingModel = trainInpaintingWgan(train, test, generator,
                                             discriminator, config)
        local_model_path = curtraingModel.trainGAN()
    elif config.model_name == 'PartialConvolutionsWgan':
        curtraingModel = trainInpaintingWgan(train, test, generator,
                                             criticWgan, config)
        local_model_path = curtraingModel.trainGAN()
    #local_model_path = Path(r"C:\Users\panda\PycharmProjects\Image_Inpainting_Sat\Master_Satelite_Image_Inpainting\OutputModels\PartialConvolutionsWgan_200.pt")
    # Resolve evaluation input/output folders for the current environment.
    if config.run_polyaxon:
        model_path = inpainting_data_path / 'models'
        modelOutputPath = Path.joinpath(model_path, 'OutputModels')
        stores_output_path = config.output_path / 'data' / 'storedData'
    else:
        localdir = Path().absolute().parent
        modelOutputPath = Path.joinpath(localdir, 'OutputModels')
        stores_output_path = localdir / 'data' / 'storedData'
    curevalModel = eval_model(config)
    curevalModel.run_eval(modelOutputPath,
                          stores_output_path,
                          model_path=local_model_path,
                          test_dataloader=test)
def train(gpu, data_dir, size, atlas_dir, lr, n_iter, data_loss, model,
          reg_param, batch_size, n_save_iter, model_dir, nr_val_data):
    """
    model training function
    :param gpu: integer specifying the gpu to use
    :param data_dir: folder with npz files for each subject.
    :param size: int desired size of the volumes: [size,size,size]
    :param atlas_dir: direction to atlas folder
    :param lr: learning rate
    :param n_iter: number of training iterations
    :param data_loss: data_loss: 'mse' or 'ncc
    :param model: either vm1 or vm2 (based on CVPR 2018 paper)
    :param reg_param: the smoothness/reconstruction tradeoff parameter (lambda in CVPR paper)
    :param batch_size: Optional, default of 1. can be larger, depends on GPU memory and volume size
    :param n_save_iter: Optional, default of 500. Determines how many epochs before saving model version.
    :param model_dir: the model directory to save to
    :param nr_val_data: number of validation examples that should be separated from the training data
    """
    # NOTE(review): this assignment requires gpu to be a *string* despite the
    # docstring saying integer — confirm callers pass e.g. "0".
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    device = "cuda"

    vol_size = np.array([size, size, size])

    # Get all the names of the training data
    vol_names = glob.glob(os.path.join(data_dir, '*.nii'))
    #random.shuffle(vol_names)
    #test_vol_names = vol_names[-nr_val_data:]
    # First nr_val_data volumes are held out as the validation split.
    test_vol_names = vol_names[:nr_val_data]
    #test_vol_names = [i for i in test_vol_names if "L2-L4" in i]
    print(
        'these volumes are separated from the data and serve as validation data : '
    )
    print(test_vol_names)
    #train_vol_names = vol_names[:-nr_val_data]
    train_vol_names = vol_names[nr_val_data:]
    #train_vol_names = [i for i in train_vol_names if "L2-L4" in i]
    random.shuffle(train_vol_names)

    # Tensorboard events go to the Polyaxon outputs folder.
    writer = SummaryWriter(get_outputs_path())

    # Prepare the vm1 or vm2 model and send to device
    nf_enc = [16, 32, 32, 32]
    if model == "vm1":
        nf_dec = [32, 32, 32, 32, 8, 8]
    elif model == "vm2":
        nf_dec = [32, 32, 32, 32, 32, 16, 16]
    else:
        raise ValueError("Not yet implemented!")

    # NOTE(review): `model` is rebound here from the variant string to the
    # network instance — the original argument is no longer accessible.
    model = cvpr2018_net(vol_size, nf_enc, nf_dec)
    model.to(device)

    # Set optimizer and losses
    opt = Adam(model.parameters(), lr=lr)

    # Similarity loss is selected by the data_loss flag; smoothness penalty
    # on the flow field is always the gradient loss.
    sim_loss_fn = losses.ncc_loss if data_loss == "ncc" else losses.mse_loss
    grad_loss_fn = losses.gradient_loss

    # data generator
    train_example_gen = datagenerators.example_gen(train_vol_names, atlas_dir,
                                                   size, batch_size)

    # Training loop.
    for i in range(n_iter):

        # Save model checkpoint and plot validation score
        if i % n_save_iter == 0:
            save_file_name = os.path.join(model_dir, '%d.ckpt' % i)
            torch.save(model.state_dict(), save_file_name)

            # load validation data (fresh generator each time, batch of 4)
            val_example_gen = datagenerators.example_gen(
                test_vol_names, atlas_dir, size, 4)
            val_data = next(val_example_gen)
            # permute channel-last numpy volumes to NCDHW tensors
            val_fixed = torch.from_numpy(val_data[1]).to(device).float()
            val_fixed = val_fixed.permute(0, 4, 1, 2, 3)
            val_moving = torch.from_numpy(val_data[0]).to(device).float()
            val_moving = val_moving.permute(0, 4, 1, 2, 3)

            #create validation data for the model
            val_warp, val_flow = model(val_moving, val_fixed)

            #calculte validation score
            val_recon_loss = sim_loss_fn(val_warp, val_fixed)
            val_grad_loss = grad_loss_fn(val_flow)
            val_loss = val_recon_loss + reg_param * val_grad_loss

            #tensorboard
            writer.add_scalar('Loss/Test', val_loss, i)

            #prints
            print('validation')
            print("%d,%f,%f,%f" % (i, val_loss.item(), val_recon_loss.item(),
                                   val_grad_loss.item()),
                  flush=True)

        # Generate the moving images and convert them to tensors.
        data_for_network = next(train_example_gen)
        input_fixed = torch.from_numpy(data_for_network[1]).to(device).float()
        input_fixed = input_fixed.permute(0, 4, 1, 2, 3)
        input_moving = torch.from_numpy(data_for_network[0]).to(device).float()
        input_moving = input_moving.permute(0, 4, 1, 2, 3)

        # Run the data through the model to produce warp and flow field
        warp, flow = model(input_moving, input_fixed)
        print("warp_and_flow_field")
        print(warp.size())
        print(flow.size())

        # Calculate loss
        recon_loss = sim_loss_fn(warp, input_fixed)
        grad_loss = grad_loss_fn(flow)
        loss = recon_loss + reg_param * grad_loss

        #tensorboard
        writer.add_scalar('Loss/Train', loss, i)

        print("%d,%f,%f,%f" % (i, loss.item(), recon_loss.item(),
                               grad_loss.item()),
              flush=True)

        # Backwards and optimize
        opt.zero_grad()
        loss.backward()
        opt.step()