Example #1
        algorithm.to(ptu.device)
    algorithm.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='experiment specification file')
    parser.add_argument('-g', '--gpu', help='gpu id', type=int, default=0)
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.safe_load(spec_string)

    # make all seeds the same.
    exp_specs['env_specs']['eval_env_seed'] = exp_specs['env_specs']['training_env_seed'] = exp_specs['seed']

    if exp_specs['num_gpu_per_worker'] > 0:
        print('\n\nUSING GPU\n\n')
        ptu.set_gpu_mode(True, args.gpu)
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    exp_prefix = exp_prefix + '--sigma-{}'.format(exp_specs['sigma'])
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs, seed=seed, snapshot_mode="all")

    experiment(exp_specs)
Example #2
def run_experiment_here(
        experiment_function,
        variant=None,
        exp_id=0,
        seed=None,
        use_gpu=True,
        # Logger params:
        exp_prefix="default",
        snapshot_mode='last',
        snapshot_gap=1,
        git_infos=None,
        script_name=None,
        base_log_dir=None,
        force_randomize_seed=False,
        log_dir=None,
        **setup_logger_kwargs
):
    """
    Run an experiment locally without any serialization.

    :param experiment_function: Function. `variant` will be passed in as its
    only argument.
    :param exp_prefix: Experiment prefix for the save file.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all
    experiments. Note that one experiment may correspond to multiple seeds.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. Defaults to True.
    :param script_name: Name of the running script
    :param log_dir: If set, set the log directory to this. Otherwise,
    the directory will be auto-generated based on the exp_prefix.
    :return:
    """
    if variant is None:
        variant = {}
    variant['exp_id'] = str(exp_id)

    if force_randomize_seed or seed is None:
        seed = random.randint(0, 100000)
        variant['seed'] = str(seed)
    reset_execution_environment()

    actual_log_dir = setup_logger(
        exp_prefix=exp_prefix,
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        base_log_dir=base_log_dir,
        log_dir=log_dir,
        git_infos=git_infos,
        script_name=script_name,
        **setup_logger_kwargs
    )

    set_seed(seed)
    set_gpu_mode(use_gpu)

    run_experiment_here_kwargs = dict(
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        use_gpu=use_gpu,
        exp_prefix=exp_prefix,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        git_infos=git_infos,
        script_name=script_name,
        base_log_dir=base_log_dir,
        **setup_logger_kwargs
    )
    save_experiment_data(
        dict(
            run_experiment_here_kwargs=run_experiment_here_kwargs
        ),
        actual_log_dir
    )
    return experiment_function(variant)
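A minimal usage sketch of run_experiment_here (an illustration based only on the signature and docstring above; my_experiment and its variant keys are made-up names):

def my_experiment(variant):
    # a real experiment would build envs/networks from `variant` and train them
    print("running with variant:", variant)

run_experiment_here(
    my_experiment,
    variant=dict(algorithm="sac", learning_rate=3e-4),
    exp_prefix="toy-example",
    seed=0,
    use_gpu=False,
    snapshot_mode="last",
)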
Example #3
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--experiment',
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.safe_load(spec_string)

    if exp_specs['use_gpu']:
        print('\n\nUSING GPU\n\n')
        ptu.set_gpu_mode(True)
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
Example #4
def simulate_policy(args):
    data = joblib.load(args.file)

    cont = False

    if 'policies' in data:
        policy = data['policies'][0]
    else:
        policy = data['policy']
    env = NormalizedBoxEnv(create_swingup())  #data['env']

    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
        data['qf1'].cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    diayn = 'df' in data
    rnd = 'rf' in data

    if diayn:
        skills = len(data['eval_policy'].skill_vec)
        disc = data['df']

        policy = OptionPolicy(policy, skills, cont)
        if args.gpu:
            disc.cuda()
        if isinstance(policy, PyTorchModule):
            disc.train(False)

    if rnd:
        data['rf'].cuda()
        data['pf'].cuda()
        data['qf1'].cuda()

    import cv2
    video = cv2.VideoWriter('video.avi', cv2.VideoWriter_fourcc(*"H264"), 30,
                            (640, 480))
    index = 0

    truth, pred = [], []

    if cont:
        eps = 1
    elif diayn:
        eps = skills * 2
    else:
        eps = 5

    Rs = []

    for ep in range(eps):
        if diayn and not cont:
            z_index = ep // 2
            policy.set_z(z_index)

        path = rollout(
            env,
            policy,
            max_path_length=args.H * skills if cont else args.H,
            animated=True,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        total_r = 0

        if diayn:
            predictions = F.log_softmax(
                disc(torch.FloatTensor(path['observations']).cuda()),
                1).cpu().detach().numpy()
            probs = predictions.max(1)
            labels = predictions.argmax(1)

            if cont:
                for k in range(skills):
                    truth.extend([k] * 100)
            else:
                truth.extend([z_index] * len(labels))
            pred.extend(labels.tolist())

        if rnd:
            random_feats = data['rf'](torch.FloatTensor(
                path['observations']).cuda())
            pred_feats = data['pf'](torch.FloatTensor(
                path['observations']).cuda())

            i_rewards = ((random_feats -
                          pred_feats)**2.0).sum(1).cpu().data.numpy()

        q_pred = data['qf1'](torch.FloatTensor(path['observations']).cuda(),
                             torch.FloatTensor(
                                 path['actions']).cuda()).cpu().data.numpy()

        for i, (img, r, s) in enumerate(
                zip(path['images'], path['rewards'], path['observations'])):
            #video.write(img[:,:,::-1].astype(np.uint8))
            total_r += r[0]
            img = img.copy()
            img = np.rot90(img, 3).copy()
            col = (255, 0, 255)
            cv2.putText(img, "step: %d" % (i + 1), (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)

            if diayn:
                if cont:
                    cv2.putText(img, "z: %s" % str(truth[i]), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255),
                                2, cv2.LINE_AA)
                else:
                    cv2.putText(img, "z: %s" % str(z_index), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255),
                                2, cv2.LINE_AA)

                cv2.putText(img,
                            "disc_pred: %s (%.3f)" % (labels[i], probs[i]),
                            (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "reward: %.3f" % r[0], (20, 160),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
                cv2.putText(img, "action: %s" % path['actions'][i], (20, 240),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2,
                            cv2.LINE_AA)
            else:
                cv2.putText(img, "reward: %.1f" % r[0], (20, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 120),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                y = 120

            if rnd:
                cv2.putText(img, "i reward (unscaled): %.3f" % i_rewards[i],
                            (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2,
                            cv2.LINE_AA)
                #cv2.rectangle(img, (20, 180), (20 + int(q_pred[i, 0]), 200), (255, 0, 255), -1)
                cv2.rectangle(img, (20, 200),
                              (20 + int(i_rewards[i] * 10), 220),
                              (255, 255, 0), -1)
                y = 220

            try:
                y += 40
                cv2.putText(img, "Q: %.3f" % q_pred[i], (20, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            except:
                y += 40
                cv2.putText(img, "Q:" + str([q for q in q_pred[i]]), (20, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            y += 40
            cv2.putText(img, str(["%.3f" % x
                                  for x in path['observations'][i]]), (20, y),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)

            try:
                cv2.imwrite("frames/%06d.png" % index, img[:, :, ::-1])
            except:
                cv2.imwrite("frames/%06d.png" % index, img[:, :])
            index += 1

        if diayn:
            print(z_index, ":", total_r)
        Rs.append(total_r)

    print("best", np.argmax(Rs))
    print("worst", np.argmin(Rs))

    video.release()
    print("wrote video")

    if diayn:
        import sklearn
        from sklearn.metrics import confusion_matrix
        import matplotlib as mpl
        import itertools
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        normalize = False
        classes = range(skills)
        cm = confusion_matrix(truth, pred)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.colorbar()
        tick_marks = np.arange(skills)
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        """
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        """

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig("confusion.png")
Example #5
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)
    log_dir = os.path.expanduser(variant["log_dir"])
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # missing - set torch seed and num threads=1
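    # (a minimal sketch of that missing step, assuming the standard torch API:)
    # torch.manual_seed(int(variant["seed"]))
    # torch.set_num_threads(1)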

    # expl_env = gym.make(variant["env_name"])
    expl_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        variant["gamma"],
        variant["log_dir"],  # probably change this?
        ptu.device,
        False,
        pytorch=False,
    )
    # eval_env = gym.make(variant["env_name"])
    eval_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        1,
        variant["log_dir"],
        ptu.device,
        False,
        pytorch=False,
    )
    obs_shape = expl_envs.observation_space.image.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:  # convert WxHxC into CxWxH
    #     expl_env = TransposeImage(expl_env, op=[2, 0, 1])
    #     eval_env = TransposeImage(eval_env, op=[2, 0, 1])
    # obs_shape = expl_env.observation_space.shape

    channels, obs_width, obs_height = obs_shape
    action_space = expl_envs.action_space

    base_kwargs = {"num_inputs": channels, "recurrent": variant["recurrent_policy"]}

    base = CNNBase(**base_kwargs)

    dist = create_output_distribution(action_space, base.output_size)

    eval_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=True,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )
    expl_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=False,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )

    # missing: at this stage, policy hasn't been sent to device, but happens later
    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"][
            "num_eval_steps_per_epoch"
        ],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

    trainer = A2CTrainer(actor_critic=expl_policy.learner, **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    # missing: device back in sync
    algorithm.train()
Example #6
def simulate_policy(args):
    # hyper-parameters
    fov, delta, num_ch = 33, 8, 3
    rad = fov // 2
    data = torch.load(args.file)
    color = [240, 128, 128]

    # load policy & env
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    policy.reset()  # does nothing

    # load image
    img_volume = skio.imread('data/brainbow/training_sample_1.tif',
                             plugin='tifffile')
    img_volume_copy = np.copy(img_volume)
    img_volume = img_volume.astype(np.float32)
    img_volume = (img_volume - 128) / 33

    # select specific z slice
    s_z = 54
    img_plane = img_volume[s_z].astype(np.float32)
    img_plane_copy = img_volume_copy[s_z]
    img_plane_shape = img_plane.shape
    '''
    # gather random starting point for each color
    # random starting point should be colored and also not FoV boundary
    s_point_list = []  # list of starting point
    unique_colors = np.unique(np.reshape(img_plane, (-1, 3)), axis=0)
    for color in unique_colors:
        if np.all(color == (-128. / 33)):
            continue
        color_index_list_tmp = np.argwhere(np.all(img_plane == color, axis=2))
        #print(color_index_list_tmp)
        color_index_list = []
        for index in color_index_list_tmp:
            if is_fov_boundary(img_plane_shape, rad, index):
                color_index_list.append(index)
        #print(color_index_list)
        #print(type(color_index_list))
        len_color_index_list = len(color_index_list)
        if len_color_index_list > 0:
            random_index = np.random.choice(len_color_index_list, 1)
            random_start_point = color_index_list[random_index[0]]
            #print(random_start_point)
            s_point_list.append(random_start_point)
    # print(s_point_list)
    '''

    coord = np.argwhere(np.any(img_plane_copy > 100, axis=2))
    coord = coord[np.all(coord >= rad, axis=1) &
                  (coord[:, 0] < img_plane_shape[0] - rad) &
                  (coord[:, 1] < img_plane_shape[1] - rad)]  # remove border points
    np.random.shuffle(coord)
    s_point_list = coord

    # start skeletonization
    for i, s_point in enumerate(s_point_list):
        print('Skeletonizing', i + 1, 'th point:', s_point)
        # initialization
        img_plane_tmp = np.copy(img_plane_copy)
        s_y, s_x = s_point
        Q = deque([[s_y, s_x]])
        V = [[s_y, s_x]]

        # start skeletonization for some starting point
        while len(Q) > 0:
            c_y, c_x = Q.popleft()  # current y, x

            cur_p_t = img_plane[c_y - rad:c_y + rad + 1,
                                c_x - rad:c_x + rad + 1]  # current patch top
            cur_p_l = cv2.rotate(cur_p_t,
                                 cv2.ROTATE_90_CLOCKWISE)  # current patch left
            cur_p_r = cv2.rotate(
                cur_p_t, cv2.ROTATE_90_COUNTERCLOCKWISE)  # current patch right
            cur_p_b = cv2.rotate(cur_p_t,
                                 cv2.ROTATE_180)  # current patch bottom

            a_t, _ = policy.get_action(np.moveaxis(cur_p_t, -1,
                                                   0).flatten())  # move top
            a_l, _ = policy.get_action(np.moveaxis(cur_p_l, -1,
                                                   0).flatten())  # move left
            a_r, _ = policy.get_action(np.moveaxis(cur_p_r, -1,
                                                   0).flatten())  # move right
            a_b, _ = policy.get_action(np.moveaxis(cur_p_b, -1,
                                                   0).flatten())  # move bottom

            top = [c_y - delta, c_x]
            left = [c_y, c_x - delta]
            right = [c_y, c_x + delta]
            bottom = [c_y + delta, c_x]

            if a_t == 1:
                if top not in V and is_fov_boundary(img_plane_shape, rad, top):
                    img_plane_tmp[c_y - delta:c_y + 1, c_x] = color
                    Q.append(top)
                    V.append(top)
            if a_l == 1:
                if left not in V and is_fov_boundary(img_plane_shape, rad,
                                                     left):
                    img_plane_tmp[c_y, c_x - delta:c_x + 1] = color
                    Q.append(left)
                    V.append(left)
            if a_r == 1:
                if right not in V and is_fov_boundary(img_plane_shape, rad,
                                                      right):
                    img_plane_tmp[c_y, c_x:c_x + delta + 1] = color
                    Q.append(right)
                    V.append(right)
            if a_b == 1:
                if bottom not in V and is_fov_boundary(img_plane_shape, rad,
                                                       bottom):
                    img_plane_tmp[c_y:c_y + delta + 1, c_x] = color
                    Q.append(bottom)
                    V.append(bottom)

        # plot final result
        img_plane_tmp[s_y - 1:s_y + 2,
                      s_x - 1:s_x + 2] = [252, 255, 51]  # color starting point
        fig = plt.figure(figsize=(10, 10))
        plt.imshow(img_plane_tmp)
        plt.show()
        plt.close()
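The is_fov_boundary helper used throughout this example is not shown. A plausible minimal version, inferred only from how it is called (it should return True when a field of view of radius rad centered at the point fits entirely inside the image), could be:

def is_fov_boundary(img_shape, rad, point):
    # illustrative sketch only: True if the (2*rad + 1)-sized patch around
    # `point` stays entirely within the image bounds
    y, x = point
    return (rad <= y < img_shape[0] - rad) and (rad <= x < img_shape[1] - rad)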
Example #7
def main(
    env_name,
    exp_dir,
    seed,
    resume,
    mode,
    archi,
    epochs,
    reward_scale,
    hidden_dim,
    batch_size,
    learning_rate,
    n_layers,
    soft_target_tau,
    auto_alpha,
    alpha,
    frac_goal_replay,
    horizon,
    replay_buffer_size,
    snapshot_mode,
    snapshot_gap,
    cpu,
):
    valid_modes = ["vanilla", "her"]
    valid_archi = [
        "mlp",
        "cnn",
        "pointnet",
    ]
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode: {mode}")
    if archi not in valid_archi:
        raise ValueError(f"Unknown network archi: {archi}")

    machine_log_dir = settings.log_dir()
    exp_dir = os.path.join(machine_log_dir, exp_dir, f"seed{seed}")
    # multi-gpu and batch size scaling
    replay_buffer_size = replay_buffer_size
    num_expl_steps_per_train_loop = 1000
    num_eval_steps_per_epoch = 1000
    min_num_steps_before_training = 1000
    num_trains_per_train_loop = 1000
    # learning rate and soft update linear scaling
    policy_lr = learning_rate
    qf_lr = learning_rate
    variant = dict(
        env_name=env_name,
        algorithm="sac",
        version="normal",
        seed=seed,
        resume=resume,
        mode=mode,
        archi=archi,
        replay_buffer_kwargs=dict(max_replay_buffer_size=replay_buffer_size,),
        algorithm_kwargs=dict(
            batch_size=batch_size,
            num_epochs=epochs,
            num_eval_steps_per_epoch=num_eval_steps_per_epoch,
            num_expl_steps_per_train_loop=num_expl_steps_per_train_loop,
            num_trains_per_train_loop=num_trains_per_train_loop,
            min_num_steps_before_training=min_num_steps_before_training,
            max_path_length=horizon,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=soft_target_tau,
            target_update_period=1,
            policy_lr=policy_lr,
            qf_lr=qf_lr,
            reward_scale=reward_scale,
            use_automatic_entropy_tuning=auto_alpha,
            alpha=alpha,
        ),
        qf_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        policy_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        log_dir=exp_dir,
    )
    if mode == "her":
        variant["replay_buffer_kwargs"].update(
            dict(
                fraction_goals_rollout_goals=1
                - frac_goal_replay,  # equal to k = 4 in HER paper
                fraction_goals_env_goals=0,
            )
        )
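        # Worked example (illustration, not from the original): with
        # frac_goal_replay = 0.8 this sets fraction_goals_rollout_goals = 0.2,
        # i.e. one original rollout goal for every four relabeled goals,
        # which matches k = 4 in the HER paper.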
    set_seed(seed)

    setup_logger_kwargs = {
        "exp_prefix": exp_dir,
        "variant": variant,
        "log_dir": exp_dir,
        "snapshot_mode": snapshot_mode,
        "snapshot_gap": snapshot_gap,
    }
    setup_logger(**setup_logger_kwargs)
    ptu.set_gpu_mode(not cpu, distributed_mode=False)
    print(f"Start training...")
    sac(variant)
Example #8
            exp_dir = '{}_kl'.format(exp_dir)
            variant["KL"] = True

        else:
            # use bonus as KL: -\beta * b
            exp_dir = '{0}_{1:.2g}'.format(exp_dir, args.beta)

    else:
        exp_dir = '{}/offline/{}_{}'.format(args.env, timestamp, args.seed)

    # setup the logger
    print('experiment dir:logs/{}'.format(exp_dir))
    setup_logger(variant=variant, log_dir='logs/{}'.format(exp_dir))

    # cuda setup
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    if use_cuda:
        # optionally set the GPU (default=False)
        ptu.set_gpu_mode(True, gpu_id=args.device_id)
        print('using gpu:{}'.format(args.device_id))

        def map_location(storage, loc):
            return storage.cuda()

    else:
        map_location = 'cpu'
        ptu.set_gpu_mode(False)  # optionally set the GPU (default=False)

    experiment(variant)
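The map_location defined above is presumably consumed when loading a checkpoint inside experiment(); the usual pattern with the standard PyTorch API, shown only as an assumption with a placeholder path, is:

# illustrative only: load a checkpoint onto GPU or CPU depending on map_location
state_dict = torch.load("path/to/checkpoint.pt", map_location=map_location)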
Example #9
def simulate_policy(args):
    if args.pause:
        import ipdb; ipdb.set_trace()
    data = pickle.load(open(args.file, "rb")) # joblib.load(args.file)
    if 'policy' in data:
        policy = data['policy']
    elif 'evaluation/policy' in data:
        policy = data['evaluation/policy']
    else:
        policy = data['evaluation/hard_init/policy']

    if 'env' in data:
        env = data['env']
    elif 'evaluation/env' in data:
        env = data['evaluation/env']
    else:
        env = data['evaluation/hard_init/env']

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    else:
        ptu.set_gpu_mode(False)
        policy.to(ptu.device)
    if isinstance(env, VAEWrappedEnv):
        env.mode(args.mode)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.multitaskpause:
        env.pause_on_goal = True
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    import torch
    def check(net):
        for name, param in net.named_parameters():
            if torch.isnan(param).any():
                print(name)
    qf = data['trainer/qf1']
    # import ipdb; ipdb.set_trace()
    observation_key = data.get('evaluation/observation_key', 'observation')
    context_keys = data.get('evaluation/context_keys_for_policy', ['context'])
    context_keys = data.get('evaluation/hard_init/context_keys_for_policy')

    while True:
        paths.append(contextual_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=not args.hide,
            observation_key=observation_key,
            context_keys_for_policy=context_keys,
            # context_keys_for_policy=['state_desired_goal'],
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()
Example #10
def run_experiment():
    # Define agent-specific arguments
    trainer_kwargs = None
    if args.agent == "SAC":
        trainer_kwargs = dict(
            discount=args.gamma,
            soft_target_tau=args.soft_target_tau,
            target_update_period=args.target_update_period,
            policy_lr=args.policy_lr,
            qf_lr=args.qf_lr,
            reward_scale=args.reward_scale,
            use_automatic_entropy_tuning=(not args.no_auto_entropy_tuning),
        )
    elif args.agent == "TD3":
        trainer_kwargs = dict(
            target_policy_noise=args.target_policy_noise,
            discount=0.99,
            reward_scale=args.reward_scale,
            policy_learning_rate=args.policy_lr,
            qf_learning_rate=args.qf_lr,
            policy_and_target_update_period=args.policy_and_target_update_period,
            tau=args.tau,
        )
    else:
        pass

    # Set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Directory to place data
    THIS_DIR = os.path.dirname(
        args.variant)  # os.path.dirname(os.path.abspath(__file__))

    # Construct variant to train
    if args.variant is None:
        variant = dict(
            algorithm=args.agent,
            seed=args.seed,
            version="normal",
            replay_buffer_size=int(1E6),
            qf_kwargs=dict(hidden_sizes=args.qf_hidden_sizes, ),
            policy_kwargs=dict(hidden_sizes=args.policy_hidden_sizes, ),
            algorithm_kwargs=dict(
                num_epochs=args.n_epochs,
                num_eval_steps_per_epoch=args.eval_horizon * args.num_eval,
                num_trains_per_train_loop=args.trains_per_train_loop,
                num_expl_steps_per_train_loop=args.expl_horizon *
                args.expl_ep_per_train_loop,
                min_num_steps_before_training=args.steps_before_training,
                expl_max_path_length=args.expl_horizon,
                eval_max_path_length=args.eval_horizon,
                batch_size=args.batch_size,
            ),
            trainer_kwargs=trainer_kwargs,
            expl_environment_kwargs=get_expl_env_kwargs(args),
            eval_environment_kwargs=get_eval_env_kwargs(args),
        )
        # Set logging
        tmp_file_prefix = "{}_{}_{}_SEED{}".format(args.env,
                                                   "".join(args.robots),
                                                   args.controller, args.seed)
    else:
        # This is a variant we want to load
        # Attempt to load the json file
        try:
            with open(args.variant) as f:
                variant = json.load(f)
        except FileNotFoundError:
            print("Error opening specified variant json at: {}. "
                  "Please check filepath and try again.".format(variant))

        # Set logging
        tmp_file_prefix = "{}_{}_{}_SEED{}".format(
            variant["expl_environment_kwargs"]["env_name"],
            "".join(variant["expl_environment_kwargs"]["robots"]),
            variant["expl_environment_kwargs"]["controller"], args.seed)
        # Set agent
        args.agent = variant["algorithm"]

    # Setup logger
    abs_root_dir = os.path.join(THIS_DIR, args.log_dir)
    tmp_dir = setup_logger(tmp_file_prefix,
                           variant=variant,
                           base_log_dir=abs_root_dir)
    ptu.set_gpu_mode(
        torch.cuda.is_available())  # optionally set the GPU (default=False)

    # Run experiment
    experiment(variant, agent=args.agent)
Example #11
def run_policy(file,
               eval_env,
               goal_env=False,
               use_color=True,
               cherrypick=False,
               fixed_length=False,
               verbose=False,
               render_kwargs=dict(height=128, width=128, camera_id=0)):

    ptu.set_gpu_mode(True, 0)

    with open(file, 'rb') as f:
        params = pickle.load(f)

    if goal_env:
        obs_dim = eval_env.observation_space.spaces['observation'].low.size
        action_dim = eval_env.action_space.low.size
        goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    else:
        obs_dim = eval_env.observation_space.low.size
        action_dim = eval_env.action_space.low.size

    policy = params['exploration/policy']  # .to(ptu.device)
    policy = policy.eval()
    policy = MakeDeterministic(policy)
    if goal_env:
        r = [-1]
        step = 0
        while 0 not in r or sum(r) == 0:
            step += 1
            start = time.time()
            if goal_env:
                path = multitask_rollout_visualizer(
                    eval_env,
                    agent=policy,
                    max_path_length=eval_env.max_steps,
                    render=True,
                    render_kwargs=render_kwargs,
                    observation_key='observation',
                    desired_goal_key='desired_goal',
                    get_action_kwargs=None,
                    return_dict_obs=True,
                    use_color=use_color,
                    fixed_length=fixed_length)

                r = path["rewards"]

            else:
                path = rollout_visualizer(eval_env,
                                          agent=policy,
                                          max_path_length=eval_env.max_steps,
                                          render=True,
                                          render_kwargs=render_kwargs,
                                          use_color=use_color)

                r = path["rewards"]
            if verbose:
                print(step, len(r), sum(r), end='\r')
            if not cherrypick:
                break

    return path, eval_env
Example #12
def simulate_policy(args):
    # import torch
    # torch.manual_seed(6199)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))
    policy = data['algorithm'].policy

    num_blocks = 6
    stack_only = True

    # env = data['env']
    env = gym.make(
        F"FetchBlockConstruction_{num_blocks}Blocks_IncrementalReward_DictstateObs_42Rendersize_{stack_only}Stackonly_AllCase-v1"
    )

    env = Monitor(env,
                  force=True,
                  directory="videos/",
                  video_callable=lambda x: x)

    print("Policy and environment loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    policy.train(False)
    failures = []
    successes = []
    for path_idx in range(100):
        path = multitask_rollout(
            env,
            policy,
            max_path_length=num_blocks * 50,
            animated=not args.hide,
            observation_key='observation',
            desired_goal_key='desired_goal',
            get_action_kwargs=dict(mask=np.ones((1, num_blocks)),
                                   deterministic=True),
        )

        if not is_solved(path, num_blocks):
            failures.append(path)
            print(F"Failed {path_idx}")
        else:
            print(F"Succeeded {path_idx}")
            successes.append(path)
        # if hasattr(env, "log_diagnostics"):
        #     env.log_diagnostics(paths)
        # if hasattr(env, "get_diagnostics"):
        #     for k, v in env.get_diagnostics(paths).items():
        #         logger.record_tabular(k, v)
        # logger.dump_tabular()
    print(f"Success rate {len(successes)/(len(successes) + len(failures))}")
    from rlkit.core.eval_util import get_generic_path_information
    path_info = get_generic_path_information(successes + failures,
                                             num_blocks=num_blocks)
    print(path_info)
Example #13
import numpy as np
from gym.envs.mujoco import HalfCheetahEnv

import rlkit.torch.pytorch_util as ptu
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.launchers.launcher_util import setup_logger
from rlkit.torch.sac.policies import TanhGaussianPolicy
from rlkit.torch.sac.sac import SoftActorCritic
from rlkit.torch.networks import FlattenMlp
import rlkit.torch.pytorch_util as U

from rlkit.envs.mujoco_manip_env import MujocoManipEnv

# Sets the GPU mode.
USE_GPU = True
U.set_gpu_mode(USE_GPU)

EXPERIMENT_NAME = "cans-50-50-reward-scale-1"
#EXPERIMENT_NAME = "pegs-50-50-reward-scale-0.1"
#EXPERIMENT_NAME = "lift-lr-1e-4"
HORIZON = 250
UPDATES_PER_STEP = 1
REWARD_SCALE = 1

# DEMO_PATH = None
DEMO_PATH = "/home/robot/Downloads/test_extraction/bins-Can0-sars.pkl"
MIX_DEMO = True

ACTION_SKIP = 1
LR = 3E-4
Example #14
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)
    img_save_path = 'junk_vis/debug_more_proper'

    # Prep the data -----------------------------------------------------------
    data_path = 'junk_vis/multi_mnist_data'
    canvas_size = 36
    (X_train, _), (X_test, _) = multi_mnist(data_path,
                                            max_digits=1,
                                            canvas_size=canvas_size,
                                            seed=42,
                                            use_max=True)
    X_train = X_train[:, None, ...]
    X_test = X_test[:, None, ...]
    X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor(
        X_test) / 255.0

    # np_imgs = np.load('/u/kamyar/dsprites-dataset/dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')['imgs']

    # np_imgs = None

    X_train = torch.clamp(X_train, 0.05, 0.95)
    X_test = torch.clamp(X_test, 0.05, 0.95)
    train_ds = TensorDataset(X_train)
    val_ds = TensorDataset(X_test)

    # Model Definition --------------------------------------------------------
    if exp_specs['masked']:
        model = MaskedVAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    else:
        model = VAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    global_iter = 0
    for epoch in range(exp_specs['epochs']):
        train_loader = DataLoader(train_ds,
                                  batch_size=exp_specs['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  drop_last=True)
        for iter_num, img_batch in enumerate(train_loader):
            img_batch = img_batch[0]
            if ptu.gpu_enabled(): img_batch = img_batch.cuda()

            z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(
                img_batch)
            elbo, KL = model.compute_ELBO(z_mean,
                                          z_log_cov,
                                          recon_mean,
                                          recon_log_cov,
                                          img_batch,
                                          average_over_batch=True)
            loss = -1. * elbo
            model_optim.zero_grad()  # clear gradients accumulated in the previous step
            loss.backward()
            model_optim.step()

            if global_iter % 1000 == 0:
                mse = ((recon_mean - img_batch)**2).mean()
                print('\nTraining Iter %d...' % global_iter)
                print('ELBO:\t%.4f' % elbo)
                print('MSE:\t%.4f' % mse)
                print('KL:\t%.4f' % KL)
                save_pytorch_tensor_as_img(
                    img_batch[0].data.cpu(),
                    os.path.join(img_save_path,
                                 '%d_train_img.png' % (global_iter)))
                save_pytorch_tensor_as_img(
                    recon_mean[0].data.cpu(),
                    os.path.join(img_save_path,
                                 '%d_train_recon.png' % (global_iter)))
                if exp_specs['masked']:
                    save_pytorch_tensor_as_img(
                        enc_mask[0].data.cpu(),
                        os.path.join(img_save_path,
                                     '%d_train_enc_mask.png' % (global_iter)))
                    # save_pytorch_tensor_as_img(dec_mask[0].data.cpu(), os.path.join(img_save_path, '%d_train_dec_mask.png'%(global_iter)))

            if global_iter % exp_specs['freq_val'] == 0:
                with torch.no_grad():
                    print('Validating Iter %d...' % global_iter)
                    model.eval()

                    idxs = np.random.choice(int(X_test.size(0)),
                                            size=exp_specs['batch_size'],
                                            replace=False)
                    img_batch = X_test[idxs]
                    if ptu.gpu_enabled(): img_batch = img_batch.cuda()

                    z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(
                        img_batch)
                    elbo, KL = model.compute_ELBO(z_mean,
                                                  z_log_cov,
                                                  recon_mean,
                                                  recon_log_cov,
                                                  img_batch,
                                                  average_over_batch=True)
                    mse = ((recon_mean - img_batch)**2).mean()

                    print('ELBO:\t%.4f' % elbo)
                    print('MSE:\t%.4f' % mse)
                    print('KL:\t%.4f' % KL)

                    for i in range(1):
                        save_pytorch_tensor_as_img(
                            img_batch[i].data.cpu(),
                            os.path.join(img_save_path,
                                         '%d_%d_img.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            recon_mean[i].data.cpu(),
                            os.path.join(img_save_path,
                                         '%d_%d_recon.png' % (global_iter, i)))
                        if exp_specs['masked']:
                            save_pytorch_tensor_as_img(
                                enc_mask[i].data.cpu(),
                                os.path.join(
                                    img_save_path,
                                    '%d_%d_enc_mask.png' % (global_iter, i)))
                            # save_pytorch_tensor_as_img(dec_mask[i].data.cpu(), os.path.join(img_save_path, '%d_%d_dec_mask.png'%(global_iter, i)))

                    model.train()

            global_iter += 1
Example #15
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3E-4,
            qf_lr=3E-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
        replay_buffer_kwargs=dict(
            max_size=int(1E6),
            fraction_goals_rollout_goals=.2,
            fraction_goals_env_goals=0,
        ),
        qf_kwargs=dict(
            hidden_sizes=[400, 300],
        ),
        policy_kwargs=dict(
            hidden_sizes=[400, 300],
        ),
    )

    def get_name(v):
        name = '_'.join([v['env_name'], v['algorithm'], v['title']])
        return name
    if variant['save']:
        name = get_name(variant)
        setup_logger(name, variant=variant)
    # optionally set the GPU (default=False)
    ptu.set_gpu_mode(True, gpu_id=0)
    experiment(variant)
Example #16
def offpolicy_inference(seed,
                        env_name,
                        det,
                        load_name,
                        evaluation,
                        render,
                        knob_noisy,
                        visionnet_input,
                        env_kwargs,
                        actor_critic=None,
                        verbose=True,
                        pos_control=True,
                        step_skip=4):

    import time
    from gym import wrappers

    print("evaluatin started!")

    filename = str(uuid.uuid4())

    gpu = True

    env, _, _ = prepare_env(env_name, **env_kwargs)

    if not actor_critic:
        snapshot = torch.load(load_name)
        policy = snapshot['evaluation/policy']
    else:
        policy = actor_critic
    if env_name.find('doorenv') > -1:
        policy.knob_noisy = knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']

    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100

    start_time = int(time.mktime(time.localtime()))

    if gpu:
        set_gpu_mode(True)
    while True:
        # print("new env")
        if env_name.find('doorenv') > -1:
            if evaluation:
                path, door_opened, opening_time = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
                # if evaluation:
                # print("1")
                env, _, _ = prepare_env(env_name, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    if verbose:
                        print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(
                            epi_counter))
                        eval_print(dooropen_counter, epi_counter, start_time,
                                   total_time)
            else:
                path = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()

        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

        if evaluation:
            if verbose:
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
            epi_counter += 1

            if env_name.find('door') > -1 and epi_counter > test_num:
                if verbose:
                    print("dooropening counter:", dooropen_counter,
                          " epi counter:", epi_counter)
                    eval_print(dooropen_counter, epi_counter, start_time,
                               total_time)
                break

    opening_rate, opening_timeavg = eval_print(dooropen_counter,
                                               epi_counter - 1, start_time,
                                               total_time)
    return opening_rate, opening_timeavg
Example #17
def simulate_policy(args):
    data = joblib.load(args.file)
    if 'eval_policy' in data:
        policy = data['eval_policy']
    elif 'policy' in data:
        policy = data['policy']
    elif 'exploration_policy' in data:
        policy = data['exploration_policy']
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))
    max_tau = get_max_tau(args)

    env = data['env']

    env.mode("video_env")
    env.decode_goals = True

    if hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()

    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
        if hasattr(env, "vae"):
            env.vae.to(ptu.device)
    else:
        # make sure everything is on the CPU
        set_gpu_mode(False)
        policy.cpu()
        if hasattr(env, "vae"):
            env.vae.cpu()

    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    ROWS = 3
    COLUMNS = 6
    dirname = osp.dirname(args.file)
    input_file_name = os.path.splitext(os.path.basename(args.file))[0]
    filename = osp.join(dirname, "video_{}.mp4".format(input_file_name))
    rollout_function = create_rollout_function(
        tdm_rollout,
        init_tau=max_tau,
        observation_key='observation',
        desired_goal_key='desired_goal',
    )
    paths = dump_video(
        env,
        policy,
        filename,
        rollout_function,
        ROWS=ROWS,
        COLUMNS=COLUMNS,
        horizon=args.H,
        dirname_to_save_images=dirname,
        subdirname="rollouts_" + input_file_name,
    )

    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics(paths)
    logger.dump_tabular()
Example #18
        ),
        qf_kwargs=dict(
            hidden_sizes=[400, 300],
        ),
        policy_kwargs=dict(
            hidden_sizes=[400, 300],
        ),

        save_video=True,
        dump_video_kwargs=dict(
            save_period=1,
            # imsize=(3, 500, 300),
        )
    )

    ptu.set_gpu_mode("gpu")

    representation_size = 128
    output_classes = 20

    model_class = variant.get('model_class', TimestepPredictionModel)
    model = model_class(
        representation_size,
        # decoder_output_activation=decoder_activation,
        output_classes=output_classes,
        **variant['model_kwargs'],
    )
    # model = torch.nn.DataParallel(model)

    imagenets = [True, False]
    reg_types = ["regression_distance", "latent_distance"]
Example #19
def experiment(variant):

    args.grayscale = variant['grayscale']

    def make_my_env(args, rank):
        def thunk():
            _env = grounding_env.GroundingEnv(args,
                                              args.seed + rank,
                                              img_encoder=None,
                                              fixed=False,
                                              manual_set_task=True,
                                              n_stack=variant['n_stack'])
            _env.game_init()
            _env.tasks = _env.sample_tasks(variant['task_params']['n_tasks'],
                                           variants=variant['all_tasks'])
            return _env

        return thunk

    task_params = variant['task_params']
    # env = NormalizedBoxEnv(AntGoalEnv(n_tasks=task_params['n_tasks'], use_low_gear_ratio=task_params['low_gear']))
    env = make_my_env(args, 0)()
    # import time

    # def make_envs():
    #     t0 = time.time()
    #     envs = SubprocVecEnv([make_my_env(args, i) for i in range(10)])
    #     print('TIMING', time.time() - t0)

    # import pdb; pdb.set_trace()

    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    pix_dim = int(np.prod(env.observation_space.shape))
    obs_dim = variant['algo_params']['obs_emb_dim']
    action_dim = env.action_space.n  # int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    nchan = 1 if variant['grayscale'] else 3
    n_layers = variant['n_layers']

    cnn_enc = CNNEncoder(
        64,
        64,
        nchan * variant['n_stack'],
        obs_dim,
        [8, 4, 3, 3],  #kernels
        [256, 64, 64, 64],  #channels
        [2, 2, 2, 2],  # strides
        [1, 1, 1, 1],  # padding
        # hidden_sizes=[256],
        added_fc_input_size=0,
        batch_norm_conv=False,
        batch_norm_fc=False,
        init_w=1e-4,
        # hidden_init=nn.init.xavier_uniform_,
        # hidden_activation=nn.ReLU(),
        # output_activation=identity,
    )

    task_enc = encoder_model(
        hidden_sizes=[200] *
        n_layers,  # deeper net + higher dim space generalize better
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size] * n_layers,
        input_size=obs_dim + latent_dim,
        output_size=action_dim,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size] * n_layers,
        input_size=obs_dim + latent_dim,
        output_size=action_dim,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size] * n_layers,  #, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size] * n_layers,  # net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )

    agent = ProtoAgent(latent_dim, [task_enc, cnn_enc, policy, qf1, qf2, vf],
                       **variant['algo_params'])

    n_eval_tasks = int(variant['task_params']['n_tasks'] * 0.3)

    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-n_eval_tasks]),
        eval_tasks=list(tasks[-n_eval_tasks:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
Example #20
def experiment(variant):

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    #low Qs first and then high Qs
    q_list = [[
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=2 * obs_dim + action_dim,
            output_size=1,
        ),
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=2 * obs_dim + action_dim,
            output_size=1,
        )
    ],
              [
                  FlattenMlp(
                      hidden_sizes=[net_size, net_size, net_size],
                      input_size=obs_dim + action_dim + latent_dim,
                      output_size=1,
                  ),
                  FlattenMlp(
                      hidden_sizes=[net_size, net_size, net_size],
                      input_size=obs_dim + action_dim + latent_dim,
                      output_size=1,
                  )
              ]]
    #low vf first and then high vf
    vf_list = [
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=2 * obs_dim,
            output_size=1,
        ),
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=obs_dim + latent_dim,
            output_size=1,
        )
    ]

    #NOTE: Reduced number of hidden layers in h_policy from 3 to 2 (idea being it's not doing as much as the whole policy in PEARL)
    h_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=obs_dim,
    )
    #NOTE: Kept the 3 layers because f**k it it'll get tons of data
    l_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size, net_size],
        obs_dim=2 * obs_dim,
        latent_dim=0,
        action_dim=action_dim,
    )
    #TODO Implement BernAgent
    agent = BURNAgent(latent_dim,
                      context_encoder,
                      h_policy,
                      l_policy,
                      c=2,
                      **variant['algo_params'])
    algorithm = BURNSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, q_list, vf_list],
        latent_dim=latent_dim,
        **variant['algo_params'])

    # optionally load pre-trained weights
    #TODO Make sure weights are properly saved
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        q_list[0][0].load_state_dict(
            torch.load(os.path.join(path, 'l_qf1.pth')))
        q_list[0][1].load_state_dict(
            torch.load(os.path.join(path, 'l_qf2.pth')))
        q_list[1][0].load_state_dict(
            torch.load(os.path.join(path, 'h_qf1.pth')))
        q_list[1][1].load_state_dict(
            torch.load(os.path.join(path, 'h_qf2.pth')))
        vf_list[0].load_state_dict(torch.load(os.path.join(path, 'l_vf.pth')))
        vf_list[1].load_state_dict(torch.load(os.path.join(path, 'h_vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        h_policy.load_state_dict(torch.load(os.path.join(path,
                                                         'h_policy.pth')))
        l_policy.load_state_dict(torch.load(os.path.join(path,
                                                         'l_policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()
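        # with no argument, .to() presumably defaults to the device selected by set_gpu_mode above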

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
Example #21
0
def visualize_policy(args):
    variant_overwrite = dict(
        params_pkl=args.params_pkl,
        num_historical_policies=args.num_historical_policies,
        env_kwargs=dict(
            reward_type='indicator',
            sample_goal=False,
            shape_rewards=False,
            distance_threshold=0.1,
            terminate_upon_success=False,
            terminate_upon_failure=False,
        ))
    if args.logdir == '':
        variant = variant_overwrite
        env = NormalizedBoxEnv(
            ManipulationEnv(**variant_overwrite['env_kwargs']))
        eval_policy = RandomPolicy(env.action_space)
    else:
        env, _, data, variant = load_experiment(args.logdir, variant_overwrite)
        eval_policy = data[
            'eval_policy'] if args.use_deterministic_policy else data['policy']
        if not args.cpu:
            set_gpu_mode(True)
            eval_policy.cuda()
        print("Loaded policy:", eval_policy)

        if 'smm_kwargs' in variant:
            # Iterate through each latent-conditioned policy.
            num_skills = variant['smm_kwargs']['num_skills']
            print('Running SMM policy with {} skills.'.format(num_skills))
            import rlkit.torch.smm.utils as utils

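            # wrapper that conditions the policy on a skill id appended to each
            # observation; the skill advances cyclically on every reset()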
            class PartialPolicy:
                def __init__(polself, policy):
                    polself._policy = policy
                    polself._num_skills = num_skills
                    polself._z = -1
                    polself.reset()

                def get_action(polself, ob):
                    aug_ob = utils.concat_ob_z(ob, polself._z,
                                               polself._num_skills)
                    return polself._policy.get_action(aug_ob)

                def sample_skill(polself):
                    z = np.random.choice(polself._num_skills)
                    return z

                def reset(polself):
                    polself._z = (polself._z + 1) % polself._num_skills
                    print("Using skill z:", polself._z)
                    return polself._policy.reset()

            eval_policy = PartialPolicy(eval_policy)

    paths = []
    for _ in range(args.num_episodes):
        eval_policy.reset()
        path = rollout(
            env,
            eval_policy,
            max_path_length=args.max_path_length,
            animated=(not args.norender),
        )
        paths.append(path)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            diagnostics = env.get_diagnostics(paths)
            for key, val in diagnostics.items():
                logger.record_tabular(key, val)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
    if hasattr(env, "draw"):
        env.draw(paths, save_dir="")
Example #22
0
def experiment(variant, seed=None):

    # create multi-task environment and sample tasks, normalize obs if provided with 'normalizer.npz'
    if 'normalizer.npz' in os.listdir(variant['algo_params']['data_dir']):
        obs_absmax = np.load(os.path.join(variant['algo_params']['data_dir'], 'normalizer.npz'))['abs_max']
        env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']), obs_absmax=obs_absmax)
    else:
        env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    
    if seed is not None:
        global_seed(seed)
        env.seed(seed)

    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant['algo_params']['use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

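    # unlike the other examples here, this encoder bounds its output with a tanh activation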
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
        output_activation=torch.tanh,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )

    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )

    agent = PEARLAgent(
        latent_dim,
        context_encoder,
        policy,
        **variant['algo_params']
    )
    if variant['algo_type'] == 'FOCAL':
        # critic network for divergence in dual form (see BRAC paper https://arxiv.org/abs/1911.11361)
        c = FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1
        )
        if 'randomize_tasks' in variant.keys() and variant['randomize_tasks']:
            rng = default_rng()
            train_tasks = rng.choice(len(tasks), size=variant['n_train_tasks'], replace=False)
            eval_tasks = set(range(len(tasks))).difference(train_tasks)
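            # evaluation tasks are the complement of the randomly sampled training tasks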
            if 'goal_radius' in variant['env_params']:
                algorithm = FOCALSoftActorCritic(
                    env=env,
                    train_tasks=train_tasks,
                    eval_tasks=eval_tasks,
                    nets=[agent, qf1, qf2, vf, c],
                    latent_dim=latent_dim,
                    goal_radius=variant['env_params']['goal_radius'],
                    **variant['algo_params']
                )
            else:
                algorithm = FOCALSoftActorCritic(
                    env=env,
                    train_tasks=list(tasks[:variant['n_train_tasks']]),
                    eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
                    nets=[agent, qf1, qf2, vf, c],
                    latent_dim=latent_dim,
                    **variant['algo_params']
                )
        else:
            if 'goal_radius' in variant['env_params']:
                algorithm = FOCALSoftActorCritic(
                    env=env,
                    train_tasks=list(tasks[:variant['n_train_tasks']]),
                    eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
                    nets=[agent, qf1, qf2, vf, c],
                    latent_dim=latent_dim,
                    goal_radius=variant['env_params']['goal_radius'],
                    **variant['algo_params']
                )
            else:
                algorithm = FOCALSoftActorCritic(
                    env=env,
                    train_tasks=list(tasks[:variant['n_train_tasks']]),
                    eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
                    nets=[agent, qf1, qf2, vf, c],
                    latent_dim=latent_dim,
                    **variant['algo_params']
                )
    else:
        raise NotImplementedError

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'],
        seed=seed,
        snapshot_mode="all"
    )

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
Example #23
0
        version="normal",
        env_name=args.env,
        layer_size=256,
        replay_buffer_size=int(1E6),
        algorithm_kwargs=dict(
            num_epochs=3000,
            num_eval_steps_per_epoch=5000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=1000,
            batch_size=256,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3E-4,
            qf_lr=3E-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
    )
    exp_dir = '{}'.format(args.env)
    print('experiment dir:logs/{}'.format(exp_dir))
    setup_logger(variant=variant, log_dir='logs/{}'.format(exp_dir))
    ptu.set_gpu_mode(True, gpu_id=args.device_id)
    print('using gpu:{}'.format(args.device_id))
    # ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    experiment(variant)
Example #24
0
def setup_and_run(variant):

    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['seed'] % variant['util_params']['num_gpus'])
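    # each seed is mapped to a GPU round-robin via seed % num_gpus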
    #setup env
    env_name = variant['env_name']
    env_params = variant['env_params']
    env_params['n_tasks'] = variant["n_train_tasks"] + variant["n_eval_tasks"]
    env = NormalizedBoxEnv(ENVS[env_name](**env_params))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = variant['latent_size']
    reward_dim = 1

    #setup encoder
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    #setup actor, critic
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])

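    # task indices are contiguous: the first n_train_tasks for training, the next n_eval_tasks for evaluation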
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(np.arange(variant['n_train_tasks'])),
        eval_tasks=list(
            np.arange(variant['n_train_tasks'],
                      variant['n_train_tasks'] + variant['n_eval_tasks'])),
        nets=[agent, qf1, qf2, target_qf1, target_qf2],
        latent_dim=latent_dim,
        **variant['algo_params'])
    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))

        target_qf1.load_state_dict(
            torch.load(os.path.join(path, 'target_qf1.pth')))
        target_qf2.load_state_dict(
            torch.load(os.path.join(path, 'target_qf2.pth')))

        # TODO hacky, revisit after model refactor
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    if ptu.gpu_enabled():
        algorithm.to()

    os.environ['DEBUG'] = str(int(variant['util_params']['debug']))

    #setup logger
    run_mode = variant['run_mode']
    exp_log_name = os.path.join(
        variant['env_name'], run_mode,
        variant['log_annotation'] + variant['variant_name'],
        'seed-' + str(variant['seed']))

    setup_logger(exp_log_name,
                 variant=variant,
                 exp_id=None,
                 base_log_dir=os.environ.get('PEARL_DATA_PATH'),
                 snapshot_mode='gap',
                 snapshot_gap=10)

    # run the algorithm
    if run_mode == 'TRAIN':
        algorithm.train()
    elif run_mode == 'EVAL':
        assert variant['algo_params']['dump_eval_paths'] == True
        algorithm._try_to_eval()
    else:
        algorithm.eval_with_loaded_latent()
Example #25
0
def experiment(variant):

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
Example #26
0
def experiment(variant):
    task_params = variant['task_params']
    env = NormalizedBoxEnv(
        AntGoalEnv(n_tasks=task_params['n_tasks'],
                   use_low_gear_ratio=task_params['low_gear']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
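    # latent dimension is hard-coded here rather than read from the variant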
    task_enc_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    task_enc = encoder_model(
        hidden_sizes=[200, 200,
                      200],  # deeper net + higher dim space generalize better
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )

    agent = ProtoAgent(latent_dim, [task_enc, policy, qf1, qf2, vf],
                       **variant['algo_params'])

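    # the last 30 task indices are held out for evaluation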
    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-30]),
        eval_tasks=list(tasks[-30:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
Example #27
0
            disc_kwargs=dict(
                batch_size=256,
                num_batches_per_fit=1,
                num_skills=args.num_skills,
                sampling_strategy=sampling_strategy,
                sampling_window=10,
            ),
            env_kwargs=dict(
                reward_params=dict(type=algo),
                unsupervised_reward_weight=args.unsupervised_reward_weight,
                reward_weight=args.environment_reward_weight),
            net_size=300,
            experiment=args.algo,
        )

    ptu.set_gpu_mode(True, 0)  # optionally set the GPU (default=False)
    if algo == 'wrapped_env':
        setup_logger(
            'CAMERA_READY_EXPERIMENTS/{}/env_weight_{}/seed{}/replay_buffer_size_{}/num_skills_{}/target_entropy_multiplier_{}/action_noise_{}'
            .format(args.env, args.environment_reward_weight, args.seed,
                    args.replay_buffer_size, args.num_skills,
                    args.target_entropy_multiplier, args.noise_scale),
            variant=variant)
    elif algo == 'diayn':
        setup_logger(
            'CAMERA_READY_EXPERIMENTS/{}/unsupervised_weight_{}/seed{}/replay_buffer_size_{}/num_skills_{}/target_entropy_multiplier_{}/action_noise_{}'
            .format(args.env, args.unsupervised_reward_weight, args.seed,
                    args.replay_buffer_size, args.num_skills,
                    args.target_entropy_multiplier, args.noise_scale),
            variant=variant)
    else:
Example #28
0
                             labels[num_context_points:]).type(
                                 torch.FloatTensor).mean()

            print('Meta-Test Loss: %.4f' % loss)
            print('Meta-Test Acc Ctxt: %.4f' % context_accuracy)
            print('Meta-Test Acc Test: %.4f' % test_accuracy)
            model.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--experiment',
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.load(spec_string)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)
    if exp_specs['use_gpu']: ptu.set_gpu_mode(True)

    experiment(exp_specs)
Example #29
0
        ),
        policy_kwargs=dict(
            hidden_dim=args.hidden,
            num_layer=args.layer,
        ),
        replay_buffer_size=int(1E6),
    )
    import os
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    with open(osp.join(log_dir, 'variant.json'), 'w') as out_json:
        import json
        json.dump(variant, out_json, indent=2)
    import sys
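    # record the exact launch command for reproducibility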
    cmd_input = 'python ' + ' '.join(sys.argv) + '\n'
    with open(osp.join(log_dir, 'cmd_input.txt'), 'a') as f:
        f.write(cmd_input)
    setup_logger(args.exp_name + '/' + main_dir,
                 variant=variant,
                 snapshot_mode=args.snapshot_mode,
                 snapshot_gap=args.snapshot_gap,
                 log_dir=log_dir)
    import numpy as np
    import torch
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if isinstance(args.gpu, int):
        print('using gpu ', args.gpu)
        ptu.set_gpu_mode(True, gpu_id=args.gpu)
    experiment(variant)
Example #30
0
from agent.agent import Agent  # TODO better naming here
from rlkit.torch.sac.sac import SACTrainer
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.sac.policies import TanhGaussianPolicy
import torch
import rlkit.torch.pytorch_util as torch_util
from agent.mem import Mem
import os
import pickle
import config as run_config

if torch.cuda.is_available():
    torch_util.set_gpu_mode(True)

log = run_config.log()


class SAC(Agent):
    def __init__(self, env, eval_env, mem, nets, train_step_params):
        super().__init__(env, eval_env, mem, nets, train_step_params)
        self._mem = mem

        self._env = env
        self._eval_env = eval_env

        self._policy_net = nets['policy_net']
        self._q1_net = nets['q1_net']
        self._q2_net = nets['q2_net']
        self._target_q1_net = nets['target_q1_net']
        self._target_q2_net = nets['target_q2_net']

        self._train_step_params = train_step_params