def main():
    args, batch_size, current_dir, max_iterations, voxel_model, z_dim, error, validate = parse_args()

    for z in z_dim:
        DATASET_PATH, OUT_DIR, model_name = create_work_dirs(
            args, batch_size, current_dir, voxel_model, z)

        # Create torch device for GPU computing
        is_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if is_cuda else "cpu")

        # Dataset loading
        test_loader, train_loader, vis_train_loader = generate_datasets(
            DATASET_PATH, batch_size)

        # Initialize TensorBoard logging
        logger = SummaryWriter(
            os.path.join(
                OUT_DIR,
                datetime.datetime.now().strftime('logs_%Y_%m_%d_%H_%M_%S' +
                                                 model_name[5:-3])))

        # Model and trainer
        checkpoint_io, epoch_it, it, trainer = load_trainer_from_model(
            OUT_DIR, device, model_name, z)

        train_loop(model_name,
                   checkpoint_io,
                   test_loader,
                   train_loader,
                   trainer,
                   vis_train_loader,
                   logger,
                   error=error,
                   max_iterations=max_iterations,
                   epoch_it=epoch_it,
                   it=it,
                   checkpoint_every=500,
                   eval_network=validate,
                   pears=validate,
                   vis=validate)
Example #2
def main(argv=None):
    data.maybe_download_and_extract(FLAGS.data_dir)
    # If cluster configuration flags were provided, save them
    if FLAGS.ps_hosts != '' and FLAGS.worker_hosts != '':
        ps_hosts = FLAGS.ps_hosts.split(",")
        worker_hosts = FLAGS.worker_hosts.split(",")
        cluster_config = {"ps": ps_hosts, "worker": worker_hosts}
        # Save cluster configuration
        with open('cluster.json', 'w') as f:
            json.dump(cluster_config, f)
        print('Cluster configuration saved.')
    else:
        try:
            # Read cluster configuration
            with open('cluster.json', 'r') as f:
                cluster_config = json.load(f)
        except (OSError, IOError) as e:
            print("No cluster configuration found: you need to provide at " \
                  "least once the two lists of ps and worker hosts")
            return

    if FLAGS.job_name == '':
        print('Pass this script a job name (ps or worker) to start a ' \
              'training session.')
        return

    # Create a cluster
    cluster = tf.train.ClusterSpec(cluster_config)

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        train.train_loop(cluster=cluster,
                         master=server.target,
                         task_index=FLAGS.task_index)
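The FLAGS referenced above (data_dir, ps_hosts, worker_hosts, job_name, task_index) are defined elsewhere in the script. Below is a minimal sketch of how they could be declared with TF1-style tf.app.flags; the defaults and help strings are illustrative assumptions, not the original definitions.

# Hypothetical flag definitions matching the FLAGS used above (TF1-style).
import tensorflow as tf

tf.app.flags.DEFINE_string('data_dir', '/tmp/data', 'Where to download/extract the dataset')
tf.app.flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of ps host:port pairs')
tf.app.flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of worker host:port pairs')
tf.app.flags.DEFINE_string('job_name', '', "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of this task within its job')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    tf.app.run()  # parses flags, then calls main(argv)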
Example #3
def main():
    t = time()
    conf_code = extract_config_code()
    check_flags()
    print(get_model_info_as_str())
    
    data = SiameseModelData(FLAGS.dataset_train)
    dist_sim_calculator = DistSimCalculator(FLAGS.dataset_train, FLAGS.ds_metric, FLAGS.ds_algo)
    model = create_model(FLAGS.model, data.input_dim(), data, dist_sim_calculator)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(FLAGS.gpu)
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    saver = Saver(sess)
    sess.run(tf.compat.v1.global_variables_initializer())
    train_costs, train_times = train_loop(data, model, saver, sess)
    test(data, model, saver, sess)
    saver.save_conf_code(conf_code)
    overall_time = convert_long_time_to_str(time() - t)
    
    print(overall_time, saver.get_log_dir())    
    saver.save_overall_time(overall_time)
    
    return train_costs, train_times
Example #4
from transforms import model9_resnet_train_transforms, model9_resnet_test_transforms
from dataloaders import load_cifar10
from utils import plot_samples
from train import train_loop
from test import test_loop
import torch.optim as optim
import torch.nn as nn



#model = Model7()
model = ResNet18()
show_model_summary(model.to(DEVICE), (3, 32, 32))

# Constants, put in config
epochs = 50
cuda_batch_size = 128
cpu_batch_size = 4
num_workers = 4

# ToDo: Create separate transforms for train and test...
#transforms = model7_transforms()
(train_loader, test_loader, classes) = load_cifar10(model9_resnet_train_transforms(),
                                                     model9_resnet_test_transforms(),
                                                     cuda_batch_size, cpu_batch_size, num_workers)

plot_samples(train_loader)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.009, momentum=0.9)

train_loop(epochs, train_loader, model, DEVICE, optimizer, criterion, None, False)
test_loop(test_loader, model, DEVICE, criterion)
Example #5
        outpath = args.load_modelPath
        if torch.cuda.is_available():
            encoder.load_state_dict(torch.load(f'{outpath}/weights_encoder/epoch_{args.load_epoch}_encoder_weights.pth'))
            decoder.load_state_dict(torch.load(f'{outpath}/weights_decoder/epoch_{args.load_epoch}_decoder_weights.pth'))
        else:
            encoder.load_state_dict(torch.load(f'{outpath}/weights_encoder/epoch_{args.load_epoch}_encoder_weights.pth', map_location=torch.device('cpu')))
            decoder.load_state_dict(torch.load(f'{outpath}/weights_decoder/epoch_{args.load_epoch}_decoder_weights.pth', map_location=torch.device('cpu')))
    # Create new model
    else:
        outpath = f"{args.save_dir}/{gen_fname(args)}"

    if args.customSuffix is not None:
        outpath = f"{outpath}_{args.customSuffix}"

    make_dir(outpath)
    with open(f"{outpath}/args_cache.json", "w") as f:
        json.dump(vars(args), f)

    # Training
    optimizer_encoder = torch.optim.Adam(encoder.parameters(), args.lr)
    optimizer_decoder = torch.optim.Adam(decoder.parameters(), args.lr)
    train_avg_losses, valid_avg_losses, train_dts, valid_dts = train_loop(args, encoder, decoder, train_loader, valid_loader,
                                                                          optimizer_encoder, optimizer_decoder, outpath, device=device)

    # Plot evaluation results
    plot_eval_results(args, data=(train_avg_losses, valid_avg_losses), data_name="Losses", outpath=outpath)
    plot_eval_results(args, data=(train_dts, valid_dts), data_name="Time durations", outpath=outpath)
    plot_eval_results(args, data=[train_dts[i] + valid_dts[i] for i in range(len(train_dts))], data_name="Total time durations", outpath=outpath)

    print("Completed!")
Example #6
                    help='Silence print statements during training')
parser.add_argument('--test',
                    action='store_true',
                    help='Just render the env, no training')

if __name__ == '__main__':
    args = parser.parse_args()
    assert args.n % 2 == 0
    if args.small_net and args.env_name not in [
            'CartPole-v0', 'CartPole-v1', 'MountainCar-v0'
    ]:
        args.env_name = 'CartPole-v1'
        print('Switching env to CartPole')

    env = create_atari_env(args.env_name)
    chkpt_dir = 'checkpoints/%s/' % args.env_name
    if not os.path.exists(chkpt_dir):
        os.makedirs(chkpt_dir)
    synced_model = ES(env.observation_space.shape[0], env.action_space,
                      args.small_net)
    for param in synced_model.parameters():
        param.requires_grad = False
    if args.restore:
        state_dict = torch.load(args.restore)
        synced_model.load_state_dict(state_dict)

    if args.test:
        render_env(args, synced_model, env)
    else:
        train_loop(args, synced_model, env, chkpt_dir)
Example #7
def run_all_train_loops(env_name, cur_models, num_env_steps, device):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Train loop params
    num_updates = int(num_env_steps) // num_steps // num_processes
    lr_decay_horizon = int(10e6) // num_steps // num_processes
    log_interval = 10
    eval_interval = 10
    time_limit = gym.make(env_name).spec.tags[
        'wrapper_config.TimeLimit.max_episode_steps']

    # All methods to be tested
    ppo_variants = ['PPO', 'PPONormObs', 'PPONormObsRew']
    all_methods = list(cur_models.keys()) + ppo_variants

    n_launches = 5
    path = './'
    for name in all_methods:
        np.random.seed(42)
        torch.manual_seed(42)

        if not os.path.exists(os.path.join(path, env_name, name)):
            os.makedirs(os.path.join(path, env_name, name))

        for i in range(n_launches):
            if name == 'PPONormObs':
                envs = make_vec_envs(env_name,
                                     1,
                                     num_processes,
                                     None,
                                     device,
                                     False,
                                     normalize_obs=True)
            elif name == 'PPONormObsRew':
                envs = make_vec_envs(env_name,
                                     1,
                                     num_processes,
                                     gamma,
                                     device,
                                     False,
                                     normalize_obs=True)
            else:
                envs = make_vec_envs(env_name,
                                     1,
                                     num_processes,
                                     gamma,
                                     device,
                                     False,
                                     normalize_obs=False)

            if name in ['ForwardDynLoss', 'InverseDynLoss', 'ICM', 'RND']:
                if name == 'RND':
                    cur_model = cur_models[name][0](
                        envs.observation_space.shape[0], num_processes)
                    cur_model.to(device)
                    cur_model.init_obs_norm(
                        random_observations(env_name, size=2000,
                                            device=device))
                else:
                    cur_model = cur_models[name][0](
                        envs.observation_space.shape[0], envs.action_space)
                    cur_model.to(device)
                curiosity_module = CuriosityModule(
                    cur_model, rew_coef=cur_models[name][1])
            else:
                curiosity_module = None

            print('Environment: {}, method: {}, {}'.format(env_name, name, i))
            actor_critic = Policy(envs.observation_space.shape,
                                  envs.action_space,
                                  base_kwargs={
                                      'recurrent': False
                                  }).to(device)

            agent = PPO(actor_critic, clip_param, ppo_epochs, num_mini_batch,
                        value_loss_coef, entropy_coef, lr, eps, max_grad_norm)

            stats = train_loop(agent,
                               envs,
                               env_name,
                               num_updates,
                               num_steps,
                               curiosity_module=curiosity_module,
                               save_interval=save_interval,
                               eval_interval=eval_interval,
                               log_interval=log_interval,
                               time_limit=time_limit,
                               curiosity_rew_after=0,
                               curiosity_rew_before=None,
                               use_linear_lr_decay=True,
                               lr_decay_horizon=lr_decay_horizon,
                               callbacks=None)
            with open(os.path.join(path, env_name, name, str(i)), 'wb') as f:
                pickle.dump(stats, f)
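The per-launch statistics pickled at the end of the loop can later be reloaded for analysis, for example as in this sketch (path, env_name, name, and i are the same variables as above).

# Reload the training stats written by the loop above.
import os
import pickle

with open(os.path.join(path, env_name, name, str(i)), 'rb') as f:
    stats = pickle.load(f)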
Example #8
from transforms import model9_resnet_train_transforms, model9_resnet_test_transforms
from utils import plot_samples
from train import train_loop
from test import test_loop
import torch.optim as optim
import torch.nn as nn

#model = Model7()
model = ResNet18()
show_model_summary(model.to(DEVICE), (3, 32, 32))

# Constants, put in config
epochs = 50
cuda_batch_size = 128
cpu_batch_size = 4
num_workers = 4

# ToDo: Create separate transforms for train and test...
#transforms = model7_transforms()
(train_loader, test_loader, classes) = \
    dataloaders.load_cifar10(model9_resnet_train_transforms(), model9_resnet_test_transforms(),
                             cuda_batch_size, cpu_batch_size, num_workers)

plot_samples(train_loader)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.009, momentum=0.9)

train_loop(epochs, train_loader, model, DEVICE, optimizer, criterion)
test_loop(test_loader, model, DEVICE, criterion)
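The transform factories imported at the top of this example are not shown here. A plausible CIFAR-10 implementation using torchvision is sketched below; the function names mirror the imports, but the exact augmentations and normalization constants are assumptions.

# Hypothetical contents of the transform factories used above.
import torchvision.transforms as T

CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

def model9_resnet_train_transforms():
    return T.Compose([
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ])

def model9_resnet_test_transforms():
    return T.Compose([
        T.ToTensor(),
        T.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ])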
Example #9
    # Similar to our train script, but we do this k times
    for k, datasets in enumerate(iterate_folds(fold_sets)):
        train, val, test = datasets
        model = BeatNet(downbeats=args.downbeats)
        if cuda_device is not None:
            model.cuda(args.cuda_device)

        output_file = make_fold_output_name(args.output_file, k)

        train_loader, val_loader, test_loader = make_data_loaders(
            (train, val, test), batch_size=args.batch_size)

        train_loop(model,
                   train_loader,
                   val_loader=val_loader,
                   num_epochs=args.num_epochs,
                   cuda_device=cuda_device,
                   output_file=output_file,
                   davies_stopping_condition=args.davies_stopping_condition,
                   fold=k)

        if args.output_file is not None:
            save_model(model, output_file)

        if args.dataset_output_file is not None:
            save_dir = make_fold_output_name(args.dataset_output_file, k)
            save_datasets((train, val, test), save_dir)

        test_model(model, test_loader, cuda_device=cuda_device)
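iterate_folds is not shown in this snippet. One way it could yield (train, val, test) splits from a list of per-fold datasets is sketched below; the rotation scheme and the use of ConcatDataset are assumptions, not the original implementation.

# Hypothetical k-fold iterator matching the usage above: fold i is the test set,
# fold i+1 the validation set, and the remaining folds are concatenated for training.
from torch.utils.data import ConcatDataset

def iterate_folds(fold_sets):
    k = len(fold_sets)
    for i in range(k):
        val_idx = (i + 1) % k
        train = ConcatDataset(
            [fold_sets[j] for j in range(k) if j not in (i, val_idx)])
        yield train, fold_sets[val_idx], fold_sets[i]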
Example #10
    # instantiate model (and restore if needed)
    synced_model = ES(env.observation_space,
                      env.action_space,
                      use_a3c_net=args.a3c_net,
                      use_virtual_batch_norm=args.virtual_batch_norm)
    for param in synced_model.parameters():
        param.requires_grad = False
    if args.restore:
        state_dict = torch.load(args.restore)
        synced_model.load_state_dict(state_dict)

    # compute batch for virtual batch normalization
    if args.virtual_batch_norm and not args.test:
        # print('Computing batch for virtual batch normalization')
        virtual_batch = gather_for_virtual_batch_norm(
            env, batch_size=args.virtual_batch_norm)
        virtual_batch = torchify(virtual_batch, unsqueeze=False)
    else:
        virtual_batch = None

    # train or test as requested
    if args.test:
        render_env(args, synced_model, env)
    else:
        train_loop(args,
                   synced_model,
                   env,
                   chkpt_dir,
                   virtual_batch=virtual_batch)
Example #11
                                                  max_to_keep=MAX_CKPT_TO_SAVE)

        # if a checkpoint exists, restore the latest checkpoint.
        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint)
            print('Latest checkpoint restored: {}!!'.format(
                ckpt_manager.latest_checkpoint))

        train_loop(train_x,
                   train_y,
                   test_x,
                   test_y,
                   generator_g,
                   generator_f,
                   discriminator_x,
                   discriminator_y,
                   generator_g_optimizer,
                   generator_f_optimizer,
                   discriminator_x_optimizer,
                   discriminator_y_optimizer,
                   ckpt_manager,
                   batch_size=batch_size,
                   epochs=EPOCHS,
                   num_epochs_to_save=NUM_EPOCHS_TO_SAVE)

    elif mode == 'predict':
        if checkpoint_path is None:
            exit('Error: Please specify checkpoint path')
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=MAX_CKPT_TO_SAVE)
        if not ckpt_manager.latest_checkpoint: