def main():
    args, batch_size, current_dir, max_iterations, voxel_model, z_dim, error, validate = parse_args()

    for z in z_dim:
        DATASET_PATH, OUT_DIR, model_name = create_work_dirs(
            args, batch_size, current_dir, voxel_model, z)

        # Create torch device for GPU computing
        is_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if is_cuda else "cpu")

        # Dataset loading
        test_loader, train_loader, vis_train_loader = generate_datasets(
            DATASET_PATH, batch_size)

        # TensorBoard initialization
        logger = SummaryWriter(
            os.path.join(
                OUT_DIR,
                datetime.datetime.now().strftime('logs_%Y_%m_%d_%H_%M_%S'
                                                 + model_name[5:-3])))

        # Model and trainer
        checkpoint_io, epoch_it, it, trainer = load_trainer_from_model(
            OUT_DIR, device, model_name, z)

        train_loop(model_name, checkpoint_io, test_loader, train_loader,
                   trainer, vis_train_loader, logger,
                   error=error, max_iterations=max_iterations,
                   epoch_it=epoch_it, it=it, checkpoint_every=500,
                   eval_network=validate, pears=validate, vis=validate)
def main(argv=None):
    data.maybe_download_and_extract(FLAGS.data_dir)

    # If cluster configuration flags were provided, save them
    if FLAGS.ps_hosts != '' and FLAGS.worker_hosts != '':
        ps_hosts = FLAGS.ps_hosts.split(",")
        worker_hosts = FLAGS.worker_hosts.split(",")
        cluster_config = {"ps": ps_hosts, "worker": worker_hosts}

        # Save cluster configuration
        with open('cluster.json', 'w') as f:
            json.dump(cluster_config, f)
        print('Cluster configuration saved.')
    else:
        try:
            # Read cluster configuration
            with open('cluster.json', 'r') as f:
                cluster_config = json.load(f)
        except (OSError, IOError):
            print("No cluster configuration found: you need to provide "
                  "the two lists of ps and worker hosts at least once.")
            return

    if FLAGS.job_name == '':
        print('Pass this script a job name (ps or worker) to start a '
              'training session.')
        return

    # Create a cluster from the configuration
    cluster = tf.train.ClusterSpec(cluster_config)

    # Create and start a server for the local task
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        train.train_loop(cluster=cluster,
                         master=server.target,
                         task_index=FLAGS.task_index)
def main():
    t = time()
    conf_code = extract_config_code()
    check_flags()
    print(get_model_info_as_str())

    data = SiameseModelData(FLAGS.dataset_train)
    dist_sim_calculator = DistSimCalculator(FLAGS.dataset_train,
                                            FLAGS.ds_metric, FLAGS.ds_algo)
    model = create_model(FLAGS.model, data.input_dim(), data, dist_sim_calculator)

    os.environ["CUDA_VISIBLE_DEVICES"] = str(FLAGS.gpu)
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    saver = Saver(sess)
    sess.run(tf.compat.v1.global_variables_initializer())

    train_costs, train_times = train_loop(data, model, saver, sess)
    test(data, model, saver, sess)
    saver.save_conf_code(conf_code)

    overall_time = convert_long_time_to_str(time() - t)
    print(overall_time, saver.get_log_dir())
    saver.save_overall_time(overall_time)
    return train_costs, train_times
from utils import plot_samples
from train import train_loop
from test import test_loop
import torch.optim as optim
import torch.nn as nn

# model = Model7()
model = ResNet18()
show_model_summary(model.to(DEVICE), (3, 32, 32))

# Constants, put in config
epochs = 50
cuda_batch_size = 128
cpu_batch_size = 4
num_workers = 4

# ToDo: Create separate transforms for train and test...
# transforms = model7_transforms()
(train_loader, test_loader, classes) = load_cifar10(model9_resnet_train_transforms(),
                                                    model9_resnet_test_transforms(),
                                                    cuda_batch_size,
                                                    cpu_batch_size,
                                                    num_workers)
plot_samples(train_loader)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.009, momentum=0.9)

train_loop(epochs, train_loader, model, DEVICE, optimizer, criterion, None, False)
test_loop(test_loader, model, DEVICE, criterion)
    outpath = args.load_modelPath
    if torch.cuda.is_available():
        encoder.load_state_dict(torch.load(
            f'{outpath}/weights_encoder/epoch_{args.load_epoch}_encoder_weights.pth'))
        decoder.load_state_dict(torch.load(
            f'{outpath}/weights_decoder/epoch_{args.load_epoch}_decoder_weights.pth'))
    else:
        encoder.load_state_dict(torch.load(
            f'{outpath}/weights_encoder/epoch_{args.load_epoch}_encoder_weights.pth',
            map_location=torch.device('cpu')))
        decoder.load_state_dict(torch.load(
            f'{outpath}/weights_decoder/epoch_{args.load_epoch}_decoder_weights.pth',
            map_location=torch.device('cpu')))
# Create new model
else:
    outpath = f"{args.save_dir}/{gen_fname(args)}"
    if args.customSuffix is not None:
        outpath = f"{outpath}_{args.customSuffix}"
    make_dir(outpath)
    with open(f"{outpath}/args_cache.json", "w") as f:
        json.dump(vars(args), f)

# Training
optimizer_encoder = torch.optim.Adam(encoder.parameters(), args.lr)
optimizer_decoder = torch.optim.Adam(decoder.parameters(), args.lr)
train_avg_losses, valid_avg_losses, train_dts, valid_dts = train_loop(
    args, encoder, decoder, train_loader, valid_loader,
    optimizer_encoder, optimizer_decoder, outpath, device=device)

'''Plotting evaluation results'''
plot_eval_results(args, data=(train_avg_losses, valid_avg_losses),
                  data_name="Losses", outpath=outpath)
plot_eval_results(args, data=(train_dts, valid_dts),
                  data_name="Time durations", outpath=outpath)
plot_eval_results(args, data=[train_dts[i] + valid_dts[i] for i in range(len(train_dts))],
                  data_name="Total time durations", outpath=outpath)
print("Completed!")
                    help='Silence print statements during training')
parser.add_argument('--test', action='store_true',
                    help='Just render the env, no training')

if __name__ == '__main__':
    args = parser.parse_args()
    assert args.n % 2 == 0
    if args.small_net and args.env_name not in [
            'CartPole-v0', 'CartPole-v1', 'MountainCar-v0']:
        args.env_name = 'CartPole-v1'
        print('Switching env to CartPole')

    env = create_atari_env(args.env_name)
    chkpt_dir = 'checkpoints/%s/' % args.env_name
    if not os.path.exists(chkpt_dir):
        os.makedirs(chkpt_dir)

    synced_model = ES(env.observation_space.shape[0],
                      env.action_space, args.small_net)
    for param in synced_model.parameters():
        param.requires_grad = False

    if args.restore:
        state_dict = torch.load(args.restore)
        synced_model.load_state_dict(state_dict)

    if args.test:
        render_env(args, synced_model, env)
    else:
        train_loop(args, synced_model, env, chkpt_dir)
def run_all_train_loops(env_name, cur_models, num_env_steps, device):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Train loop params
    num_updates = int(num_env_steps) // num_steps // num_processes
    lr_decay_horizon = int(10e6) // num_steps // num_processes
    log_interval = 10
    eval_interval = 10
    time_limit = gym.make(
        env_name).spec.tags['wrapper_config.TimeLimit.max_episode_steps']

    # All methods to be tested
    ppo_variants = ['PPO', 'PPONormObs', 'PPONormObsRew']
    all_methods = list(cur_models.keys()) + ppo_variants
    n_launches = 5
    path = './'

    for name in all_methods:
        np.random.seed(42)
        torch.manual_seed(42)
        if not os.path.exists(os.path.join(path, env_name, name)):
            os.makedirs(os.path.join(path, env_name, name))

        for i in range(n_launches):
            if name == 'PPONormObs':
                envs = make_vec_envs(env_name, 1, num_processes, None,
                                     device, False, normalize_obs=True)
            elif name == 'PPONormObsRew':
                envs = make_vec_envs(env_name, 1, num_processes, gamma,
                                     device, False, normalize_obs=True)
            else:
                envs = make_vec_envs(env_name, 1, num_processes, gamma,
                                     device, False, normalize_obs=False)

            if name in ['ForwardDynLoss', 'InverseDynLoss', 'ICM', 'RND']:
                if name == 'RND':
                    cur_model = cur_models[name][0](
                        envs.observation_space.shape[0], num_processes)
                    cur_model.to(device)
                    cur_model.init_obs_norm(
                        random_observations(env_name, size=2000, device=device))
                else:
                    cur_model = cur_models[name][0](
                        envs.observation_space.shape[0], envs.action_space)
                    cur_model.to(device)
                curiosity_module = CuriosityModule(
                    cur_model, rew_coef=cur_models[name][1])
            else:
                curiosity_module = None

            print('Environment: {}, method: {}, {}'.format(env_name, name, i))

            actor_critic = Policy(envs.observation_space.shape,
                                  envs.action_space,
                                  base_kwargs={'recurrent': False}).to(device)
            agent = PPO(actor_critic, clip_param, ppo_epochs, num_mini_batch,
                        value_loss_coef, entropy_coef, lr, eps, max_grad_norm)

            stats = train_loop(agent, envs, env_name, num_updates, num_steps,
                               curiosity_module=curiosity_module,
                               save_interval=save_interval,
                               eval_interval=eval_interval,
                               log_interval=log_interval,
                               time_limit=time_limit,
                               curiosity_rew_after=0,
                               curiosity_rew_before=None,
                               use_linear_lr_decay=True,
                               lr_decay_horizon=lr_decay_horizon,
                               callbacks=None)

            with open(os.path.join(path, env_name, name, str(i)), 'wb') as f:
                pickle.dump(stats, f)
from transforms import model9_resnet_train_transforms, model9_resnet_test_transforms
from utils import plot_samples
from train import train_loop
from test import test_loop
import torch.optim as optim
import torch.nn as nn

# model = Model7()
model = ResNet18()
show_model_summary(model.to(DEVICE), (3, 32, 32))

# Constants, put in config
epochs = 50
cuda_batch_size = 128
cpu_batch_size = 4
num_workers = 4

# ToDo: Create separate transforms for train and test...
# transforms = model7_transforms()
(train_loader, test_loader, classes) = \
    dataloaders.load_cifar10(model9_resnet_train_transforms(),
                             model9_resnet_test_transforms(),
                             cuda_batch_size, cpu_batch_size, num_workers)
plot_samples(train_loader)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.009, momentum=0.9)

train_loop(epochs, train_loader, model, DEVICE, optimizer, criterion)
test_loop(test_loader, model, DEVICE, criterion)
# Similar to our train script, but we do this k times
for k, datasets in enumerate(iterate_folds(fold_sets)):
    train, val, test = datasets

    model = BeatNet(downbeats=args.downbeats)
    if cuda_device is not None:
        model.cuda(args.cuda_device)

    output_file = make_fold_output_name(args.output_file, k)

    train_loader, val_loader, test_loader = make_data_loaders(
        (train, val, test), batch_size=args.batch_size)

    train_loop(model, train_loader,
               val_loader=val_loader,
               num_epochs=args.num_epochs,
               cuda_device=cuda_device,
               output_file=output_file,
               davies_stopping_condition=args.davies_stopping_condition,
               fold=k)

    if args.output_file is not None:
        save_model(model, output_file)
    if args.dataset_output_file is not None:
        save_dir = make_fold_output_name(args.dataset_output_file, k)
        save_datasets((train, val, test), save_dir)

    test_model(model, test_loader, cuda_device=cuda_device)
# Instantiate model (and restore if needed)
synced_model = ES(env.observation_space, env.action_space,
                  use_a3c_net=args.a3c_net,
                  use_virtual_batch_norm=args.virtual_batch_norm)
for param in synced_model.parameters():
    param.requires_grad = False
if args.restore:
    state_dict = torch.load(args.restore)
    synced_model.load_state_dict(state_dict)

# Compute batch for virtual batch normalization
if args.virtual_batch_norm and not args.test:
    # print('Computing batch for virtual batch normalization')
    virtual_batch = gather_for_virtual_batch_norm(
        env, batch_size=args.virtual_batch_norm)
    virtual_batch = torchify(virtual_batch, unsqueeze=False)
else:
    virtual_batch = None

# Train or test as requested
if args.test:
    render_env(args, synced_model, env)
else:
    train_loop(args, synced_model, env, chkpt_dir,
               virtual_batch=virtual_batch)
        max_to_keep=MAX_CKPT_TO_SAVE)

    # If a checkpoint exists, restore the latest checkpoint
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored: {}!!'.format(
            ckpt_manager.latest_checkpoint))

    train_loop(train_x, train_y, test_x, test_y,
               generator_g, generator_f,
               discriminator_x, discriminator_y,
               generator_g_optimizer, generator_f_optimizer,
               discriminator_x_optimizer, discriminator_y_optimizer,
               ckpt_manager,
               batch_size=batch_size,
               epochs=EPOCHS,
               num_epochs_to_save=NUM_EPOCHS_TO_SAVE)

elif mode == 'predict':
    if checkpoint_path is None:
        exit('Error: Please specify checkpoint path')

    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path,
                                              max_to_keep=MAX_CKPT_TO_SAVE)
    if not ckpt_manager.latest_checkpoint: