Example #1
def run_task(*_):
    env = normalize(
        GymEnv("DartHopper-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 128 and 64 hidden units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    algo = PPO_Clip_Sym(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        whole_paths=False,
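        # sym_loss_weight=0.0 effectively disables the mirror-symmetry loss term,
        # so the permutation arrays below should have no influence on training here.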
        observation_permutation=np.array(
            [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        action_permutation=np.array([0.0001, 1, 2]),
        sym_loss_weight=0.0,
    )
    algo.train()
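
These run_task(*_) snippets only build the experiment; in rllab such callbacks are typically launched through run_experiment_lite. A minimal launch sketch, assuming the standard rllab entry point (the n_parallel, seed, snapshot_mode and exp_prefix values below are illustrative, not taken from the original script):

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,                # the experiment callback defined above
    n_parallel=4,            # number of sampling workers (illustrative)
    seed=1,
    snapshot_mode="last",    # keep only the latest snapshot
    exp_prefix="darthopper_ppo_sym",  # hypothetical experiment name
)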
Example #2
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000, max_path_length=100,
        step_size=0.001
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Example #3
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(1,))
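    # Force the policy to be practically deterministic by setting its log std to log(1e-8);
    # the test checks that TRPO still trains without producing NaN parameters.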
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000, max_path_length=100,
        step_size=0.01
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Example #4
def run_task(*_):
    env = normalize(
        GymEnv("DartWalker3d-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 128 and 64 hidden units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=60000,

        max_path_length=env.horizon,
        n_itr=500,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
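        # The signed index arrays below define how to mirror observations and actions for the
        # symmetry loss: |index| selects the mirrored component and a negative sign flips its value;
        # 0.0001 presumably stands in for index 0 so that its sign can still encode a flip.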
        observation_permutation=np.array([0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19, 8,-9,-10, 11, 12,-13,\
                                          20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40, 29,-30,-31, 32, 33,-34, 42, 41]),
        #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]),
        #action_permutation=np.array([3,4,5, 0.00001,1,2]),
        action_permutation=np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]),

        sym_loss_weight=2.0,
        whole_paths=False,
    )
    algo.train()
Example #5
def run_task(*_):
    env = normalize(
        GymEnv("DartHumanWalker-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 128 and 64 hidden units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=50000,

        max_path_length=env.horizon,
        n_itr=1000,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        observation_permutation=np.array([0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10, -17,18, -19, -24,25,-26,27, -20,21,-22,23,\
                                          28,29,-30,31,-32,-33, -40,41,-42,43,44,45, -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56, -49,50,-51,52, 58,57]),
        action_permutation=np.array([-6,7,-8, 9, 10,11,  -0.001,1,-2, 3, 4,5, -12,13, -14, -19,20,-21,22, -15,16,-17,18]),

        sym_loss_weight=1.0,
        action_reg_weight=0.0,
        whole_paths=False,
    )
    algo.train()
Example #6
    outputs = None,
    updates = sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4], params, learning_rate=learning_rate)
)

f_baseline_g = theano.function(
    inputs = [observations_var, actions_var],
    outputs = all_der
)
alla = []
for est in range(10):
    if (load_policy):
        policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)        
    avg_return = np.zeros(n_itr)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j_int in range(n_itr):
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(),N,T,show_bar=False)
        #baseline.fit(paths)
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z=list()
            t=1
            for y in x:
                z.append(y*t)
                t*=discount
            temp.append(np.array(z))
        d_rewards=temp
        minT=T
        cum_num = []
def perform_evaluation(num_parallel,
                       hidden_size,
                       batch_size,
                       pathlength,
                       random_split,
                       prioritized_split,
                       adaptive_sample,
                       initialize_epochs,
                       grad_epochs,
                       test_epochs,
                       append,
                       task_size,
                       load_init_policy,
                       load_split_data,
                       alternate_update,
                       accumulate_gradient,
                       imbalance_sample,
                       sample_ratio,
                       split_percentages,
                       env_name,
                       seed,
                       test_num=1,
                       param_update_start=50,
                       param_update_frequency=50,
                       param_update_end=200,
                       use_param_variance=0,
                       param_variance_batch=10000,
                       param_variance_sample=100,
                       reverse_metric=False):
    reps = 1

    learning_curves = []
    kl_divergences = []
    for i in range(len(split_percentages)):
        learning_curves.append([])
        kl_divergences.append([])

    performances = []

    diretory = 'data/trained/gradient_temp/rl_split_' + append

    if not os.path.exists(diretory):
        os.makedirs(diretory)
        os.makedirs(diretory + '/policies')

    for testit in range(test_num):
        print('======== Start Test ', testit, ' ========')
        env = normalize(GymEnv(env_name, record_log=False, record_video=False))
        dartenv = env._wrapped_env.env.env
        if env._wrapped_env.monitoring:
            dartenv = dartenv.env

        np.random.seed(testit * 3 + seed)
        random.seed(testit * 3 + seed)

        pre_training_learning_curve = []

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The policy's hidden layer sizes are given by the hidden_size argument.
            hidden_sizes=hidden_size,
            # append_dim=2,
            net_mode=0,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

        if load_init_policy:
            policy = joblib.load(diretory + '/init_policy.pkl')

        if adaptive_sample:
            new_batch_size = int(batch_size / task_size)
        else:
            new_batch_size = batch_size

        algo = TRPO(  # _MultiTask(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=new_batch_size,
            max_path_length=pathlength,
            n_itr=5,
            discount=0.995,
            step_size=0.02,
            gae_lambda=0.97,
            whole_paths=False,
            # task_num=task_size,
        )
        algo.init_opt()

        from rllab.sampler import parallel_sampler

        parallel_sampler.initialize(n_parallel=num_parallel)
        parallel_sampler.set_seed(0)

        algo.start_worker()

        if not load_init_policy:
            for i in range(initialize_epochs):
                print('------ Iter ', i, ' in Init Training --------')
                if adaptive_sample:
                    paths = []
                    reward_paths = []
                    for t in range(task_size):
                        paths += algo.sampler.obtain_samples(0, t)
                        #reward_paths += algo.sampler.obtain_samples(0)
                elif imbalance_sample:
                    paths = []
                    reward_paths = []
                    for t in range(task_size):
                        algo.batch_size = batch_size * sample_ratio[t]
                        task_path = algo.sampler.obtain_samples(0, t)
                        paths += task_path
                        if t == 0:
                            reward_paths += task_path
                else:
                    paths = algo.sampler.obtain_samples(0)
                samples_data = algo.sampler.process_samples(0, paths)
                opt_data = algo.optimize_policy(0, samples_data)
                pol_aft = (policy.get_param_values())
                print(algo.mean_kl(samples_data))

                print(dict(logger._tabular)['AverageReturn'])
                pre_training_learning_curve.append(
                    dict(logger._tabular)['AverageReturn'])
            joblib.dump(policy, diretory + '/init_policy.pkl', compress=True)

        print('------- initial training complete ---------------')
        if not load_split_data:
            split_data = []
            net_weights = []
            net_weight_values = []
            for i in range(grad_epochs):
                cur_param_val = np.copy(policy.get_param_values())
                cur_param = copy.deepcopy(policy.get_params())

                cp = []
                for param in policy._mean_network.get_params():
                    cp.append(np.copy(param.get_value()))
                net_weights.append(cp)
                net_weight_values.append(np.copy(policy.get_param_values()))

                if adaptive_sample:
                    paths = []
                    reward_paths = []
                    for t in range(task_size):
                        paths += algo.sampler.obtain_samples(0, t)
                        #reward_paths += algo.sampler.obtain_samples(0)
                elif imbalance_sample:
                    paths = []
                    reward_paths = []
                    for t in range(task_size):
                        algo.batch_size = batch_size * sample_ratio[t]
                        task_path = algo.sampler.obtain_samples(0, t)
                        paths += task_path
                        if t == 0:
                            reward_paths += task_path
                else:
                    paths = algo.sampler.obtain_samples(0)
                split_data.append(paths)

                samples_data = algo.sampler.process_samples(0, paths)
                opt_data = algo.optimize_policy(0, samples_data)
                pre_training_learning_curve.append(
                    dict(logger._tabular)['AverageReturn'])
            joblib.dump(split_data,
                        diretory + '/split_data.pkl',
                        compress=True)
            joblib.dump(net_weights,
                        diretory + '/net_weights.pkl',
                        compress=True)
            joblib.dump(net_weight_values,
                        diretory + '/net_weight_values.pkl',
                        compress=True)
            joblib.dump(pre_training_learning_curve,
                        diretory + '/pretrain_learningcurve_' + str(testit) +
                        '.pkl',
                        compress=True)
        else:
            split_data = joblib.load(diretory + '/split_data.pkl')
            net_weights = joblib.load(diretory + '/net_weights.pkl')
            net_weight_values = joblib.load(diretory +
                                            '/net_weight_values.pkl')
            pre_training_learning_curve = joblib.load(
                diretory + '/pretrain_learningcurve_' + str(testit) + '.pkl')

        task_grads = []
        variance_grads = []
        for i in range(task_size):
            task_grads.append([])
        for i in range(grad_epochs):
            policy.set_param_values(net_weight_values[i])
            task_paths = []
            for j in range(task_size):
                task_paths.append([])
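            # Group the collected paths by the task id recorded in env_infos['state_index'].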
            for path in split_data[i]:
                taskid = path['env_infos']['state_index'][-1]
                task_paths[taskid].append(path)

            for j in range(task_size):
                samples_data = algo.sampler.process_samples(
                    0, task_paths[j], False)
                grad = get_gradient(algo, samples_data, False)
                task_grads[j].append(grad)
            if use_param_variance == 1 and i == grad_epochs - 1:
                for j in range(param_variance_sample):
                    samples_data_ori = algo.sampler.process_samples(
                        0, split_data[i], False)
                    samples_data = {}
                    indices = np.arange(len(samples_data_ori['observations']))
                    np.random.shuffle(indices)
                    samples_data["observations"] = samples_data_ori[
                        "observations"][indices[0:param_variance_batch]]
                    samples_data["actions"] = samples_data_ori["actions"][
                        indices[0:param_variance_batch]]
                    samples_data["rewards"] = samples_data_ori["rewards"][
                        indices[0:param_variance_batch]]
                    samples_data["advantages"] = samples_data_ori[
                        "advantages"][indices[0:param_variance_batch]]
                    samples_data["agent_infos"] = {}
                    samples_data["agent_infos"]["log_std"] = samples_data_ori[
                        "agent_infos"]["log_std"][
                            indices[0:param_variance_batch]]
                    samples_data["agent_infos"]["mean"] = samples_data_ori[
                        "agent_infos"]["mean"][indices[0:param_variance_batch]]
                    grad = get_gradient(algo, samples_data, False)
                    variance_grads.append(grad)
            algo.sampler.process_samples(0, split_data[i])

        weight_variances = []
        for i in range(len(task_grads[0][0]) - 1):
            weight_variances.append(np.zeros(task_grads[0][0][i].shape))
        if use_param_variance == 1:
            for k in range(len(task_grads[0][0]) - 1):
                one_grad = []
                for g in range(len(variance_grads)):
                    one_grad.append(np.asarray(variance_grads[g][k]))
                weight_variances[k] += np.var(one_grad, axis=0)

        print('------- collected gradient info -------------')

        split_counts = []
        for i in range(len(task_grads[0][0]) - 1):
            split_counts.append(np.zeros(task_grads[0][0][i].shape))

        for i in range(len(task_grads[0])):
            for k in range(len(task_grads[0][i]) - 1):
                region_gradients = []
                for region in range(len(task_grads)):
                    region_gradients.append(task_grads[region][i][k])
                region_gradients = np.array(region_gradients)
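                # Splitting score per parameter: variance of its gradient across tasks;
                # when random_split is set, random scores are used as a control instead.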
                if not random_split:
                    split_counts[k] += np.var(
                        region_gradients, axis=0
                    )  # * np.abs(net_weights[i][k])# + 100 * (len(task_grads[0][i])-k)
                elif prioritized_split:
                    split_counts[k] += np.random.random(
                        split_counts[k].shape) * (len(task_grads[0][i]) - k)
                else:
                    split_counts[k] += np.random.random(split_counts[k].shape)

        for j in range(len(split_counts)):
            plt.figure()
            plt.title(policy._mean_network.get_params()[j].name)
            if len(split_counts[j].shape) == 2:
                plt.imshow(split_counts[j])
                plt.colorbar()
            elif len(split_counts[j].shape) == 1:
                plt.plot(split_counts[j])

            plt.savefig(diretory + '/' +
                        policy._mean_network.get_params()[j].name + '.png')

            if use_param_variance:
                plt.figure()
                plt.title(policy._mean_network.get_params()[j].name)
                if len(weight_variances[j].shape) == 2:
                    plt.imshow(weight_variances[j])
                    plt.colorbar()
                elif len(weight_variances[j].shape) == 1:
                    plt.plot(weight_variances[j])

                plt.savefig(diretory + '/' +
                            policy._mean_network.get_params()[j].name +
                            '_variances.png')

        algo.shutdown_worker()

        # organize the metric into per-edge entries and sort them
        split_metrics = []
        metrics_list = []
        variance_list = []
        for k in range(len(task_grads[0][0]) - 1):
            for index, value in np.ndenumerate(split_counts[k]):
                split_metrics.append(
                    [k, index, value, weight_variances[k][index]])
                metrics_list.append(value)
                variance_list.append(weight_variances[k][index])
        if use_param_variance == 0:
            split_metrics.sort(key=lambda x: x[2], reverse=True)
        else:
            split_metrics.sort(key=lambda x: x[3], reverse=True)

        # test the effect of splitting
        total_param_size = len(policy._mean_network.get_param_values())

        pred_list = []
        # use the optimized network
        init_param_value = np.copy(policy.get_param_values())

        for split_id, split_percentage in enumerate(split_percentages):
            split_param_size = split_percentage * total_param_size
            masks = []
            for k in range(len(task_grads[0][0]) - 1):
                masks.append(np.zeros(split_counts[k].shape))
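            # Build masks marking which parameters to split: the top split_param_size entries by
            # score, or (when split_percentage > 1) every entry above mean + std of the metric.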

            if split_percentage <= 1.0:
                for i in range(int(split_param_size)):
                    masks[split_metrics[i][0]][split_metrics[i][1]] = 1
            else:
                threshold = np.mean(metrics_list) + np.std(metrics_list)
                print('threshold,', threshold)
                for i in range(len(split_metrics)):
                    if split_metrics[i][2] < threshold:
                        break
                    else:
                        masks[split_metrics[i][0]][split_metrics[i][1]] = 1

            mask_split_flat = np.array([])
            for k in range(int((len(task_grads[0][0]) - 1) / 2)):
                for j in range(task_size):
                    mask_split_flat = np.concatenate([
                        mask_split_flat,
                        np.array(masks[k * 2]).flatten(),
                        np.array(masks[k * 2 + 1]).flatten()
                    ])
            mask_share_flat = np.ones(len(mask_split_flat))
            mask_share_flat -= mask_split_flat
            if np.abs(split_percentage - 1.0) < 0.0001:
                mask_split_flat = np.concatenate(
                    [mask_split_flat,
                     np.ones(dartenv.act_dim * task_size)])
                mask_share_flat = np.concatenate(
                    [mask_share_flat,
                     np.zeros(dartenv.act_dim * task_size)])
            else:
                mask_split_flat = np.concatenate(
                    [mask_split_flat,
                     np.zeros(dartenv.act_dim)])
                mask_share_flat = np.concatenate(
                    [mask_share_flat,
                     np.ones(dartenv.act_dim)])

            policy.set_param_values(init_param_value)
            if split_param_size != 0:
                if dartenv.avg_div != task_size:
                    dartenv.avg_div = task_size
                    dartenv.obs_dim += dartenv.avg_div
                    high = np.inf * np.ones(dartenv.obs_dim)
                    low = -high
                    dartenv.observation_space = spaces.Box(low, high)
                    env._wrapped_env._observation_space = rllab.envs.gym_env.convert_gym_space(
                        dartenv.observation_space)
                    env.spec = rllab.envs.env_spec.EnvSpec(
                        observation_space=env.observation_space,
                        action_space=env.action_space,
                    )

                split_policy = GaussianMLPPolicy(
                    env_spec=env.spec,
                    # The split policy reuses the same hidden layer sizes (hidden_size) as the shared policy.
                    hidden_sizes=hidden_size,
                    # append_dim=2,
                    net_mode=8,
                    split_num=task_size,
                    split_masks=masks,
                    split_init_net=policy,
                    split_std=np.abs(split_percentage - 1.0) < 0.0001,
                )
            else:
                split_policy = copy.deepcopy(policy)

            if split_param_size == 0:
                baseline_add = 0
            else:
                baseline_add = task_size  # use 0 for now, though task_size should in theory improve performance more
            split_baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                   additional_dim=baseline_add)

            new_batch_size = batch_size
            if (split_param_size != 0 and alternate_update) or adaptive_sample:
                new_batch_size = int(batch_size / task_size)
            split_algo = TRPO(  # _MultiTask(
                env=env,
                policy=split_policy,
                baseline=split_baseline,
                batch_size=new_batch_size,
                max_path_length=pathlength,
                n_itr=5,
                discount=0.995,
                step_size=0.02,
                gae_lambda=0.97,
                whole_paths=False,
                # task_num=task_size,
            )
            split_algo.init_opt()

            parallel_sampler.initialize(n_parallel=num_parallel)
            parallel_sampler.set_seed(0)

            split_algo.start_worker()
            if split_param_size != 0:
                parallel_sampler.update_env_params({
                    'avg_div':
                    dartenv.avg_div,
                    'obs_dim':
                    dartenv.obs_dim,
                    'observation_space':
                    dartenv.observation_space
                })

            print('Network parameter size: ', total_param_size,
                  len(split_policy.get_param_values()))

            split_init_param = np.copy(split_policy.get_param_values())
            avg_error = 0.0

            avg_learning_curve = []
            for rep in range(int(reps)):
                split_policy.set_param_values(split_init_param)
                learning_curve = []
                kl_div_curve = []
                for i in range(test_epochs):
                    # if not split
                    if split_param_size == 0:
                        paths, _ = get_samples(split_algo, task_size,
                                               adaptive_sample,
                                               imbalance_sample, batch_size,
                                               sample_ratio)
                        # sanity check
                        samp_num = 0
                        for p in paths:
                            samp_num += len(p['observations'])
                        print('samp_num: ', samp_num, adaptive_sample,
                              imbalance_sample)
                        samples_data = split_algo.sampler.process_samples(
                            0, paths)
                        opt_data = split_algo.optimize_policy(0, samples_data)

                        if imbalance_sample:
                            reward = 0
                            for path in reward_paths:
                                reward += np.sum(path["rewards"])
                            reward /= len(reward_paths)
                        else:
                            reward = float(
                                (dict(logger._tabular)['AverageReturn']))
                        kl_div_curve.append(split_algo.mean_kl(samples_data))
                        print('reward: ', reward)
                        print(split_algo.mean_kl(samples_data))
                    elif alternate_update:
                        reward = 0
                        total_traj = 0
                        task_rewards = []
                        for j in range(task_size):
                            paths = split_algo.sampler.obtain_samples(0, j)
                            # split_algo.sampler.process_samples(0, paths)
                            samples_data = split_algo.sampler.process_samples(
                                0, paths)
                            opt_data = split_algo.optimize_policy(
                                0, samples_data)
                            reward += float((dict(
                                logger._tabular)['AverageReturn'])) * float(
                                    (dict(logger._tabular)['NumTrajs']))
                            total_traj += float(
                                (dict(logger._tabular)['NumTrajs']))
                            task_rewards.append(
                                dict(logger._tabular)['AverageReturn'])
                        reward /= total_traj
                        print('reward for different tasks: ', task_rewards,
                              reward)
                    elif accumulate_gradient:
                        paths, _ = get_samples(split_algo, task_size,
                                               adaptive_sample,
                                               imbalance_sample, batch_size,
                                               sample_ratio)

                        task_paths = []
                        task_rewards = []
                        for j in range(task_size):
                            task_paths.append([])
                            task_rewards.append([])
                        for path in paths:
                            taskid = path['env_infos']['state_index'][-1]
                            task_paths[taskid].append(path)
                            task_rewards[taskid].append(np.sum(
                                path['rewards']))
                        pre_opt_parameter = np.copy(
                            split_policy.get_param_values())

                        # compute the split gradient first
                        split_policy.set_param_values(pre_opt_parameter)
                        accum_grad = np.zeros(pre_opt_parameter.shape)
                        processed_task_data = []
                        for j in range(task_size):
                            if len(task_paths[j]) == 0:
                                processed_task_data.append([])
                                continue
                            split_policy.set_param_values(pre_opt_parameter)
                            # split_algo.sampler.process_samples(0, task_paths[j])
                            samples_data = split_algo.sampler.process_samples(
                                0, task_paths[j], False)
                            processed_task_data.append(samples_data)
                            #split_algo.optimize_policy(0, samples_data)

                            # if j == 1:
                            accum_grad += split_policy.get_param_values(
                            ) - pre_opt_parameter
                        # sanity check
                        samp_num = 0
                        for p in paths:
                            samp_num += len(p['observations'])
                        print('samp_num: ', samp_num)

                        # compute the gradient together
                        split_policy.set_param_values(pre_opt_parameter)
                        all_data = split_algo.sampler.process_samples(0, paths)
                        if imbalance_sample:
                            reward = 0
                            for path in reward_paths:
                                reward += np.sum(path["rewards"])
                            reward /= len(reward_paths)
                        else:
                            reward = float(
                                (dict(logger._tabular)['AverageReturn']))

                        split_algo.optimize_policy(0, all_data)
                        all_data_grad = split_policy.get_param_values(
                        ) - pre_opt_parameter

                        # do a line search to project the update onto the constraint manifold
                        sum_grad = all_data_grad  # * mask_split_flat + all_data_grad * mask_share_flat

                        ls_steps = []
                        loss_before = split_algo.loss(all_data)

                        for s in range(50):
                            ls_steps.append(0.97**s)
                        for step in ls_steps:
                            split_policy.set_param_values(pre_opt_parameter +
                                                          sum_grad * step)
                            if split_algo.mean_kl(
                                    all_data
                            )[0] < split_algo.step_size:  # and split_algo.loss(all_data)[0] < loss_before[0]:
                                break
                        # step=1

                        split_policy.set_param_values(pre_opt_parameter +
                                                      sum_grad * step)

                        for j in range(task_size):
                            task_rewards[j] = np.mean(task_rewards[j])

                        print('reward for different tasks: ', task_rewards,
                              reward)
                        print('mean kl: ', split_algo.mean_kl(all_data),
                              ' step size: ', step)
                        task_mean_kls = []
                        for j in range(task_size):
                            if len(processed_task_data[j]) == 0:
                                task_mean_kls.append(0)
                            else:
                                task_mean_kls.append(
                                    split_algo.mean_kl(
                                        processed_task_data[j])[0])
                        print('mean kl for different tasks: ', task_mean_kls)
                        kl_div_curve.append(
                            np.concatenate(
                                [split_algo.mean_kl(all_data), task_mean_kls]))
                    else:
                        paths = split_algo.sampler.obtain_samples(0)
                        reward = float(
                            (dict(logger._tabular)['AverageReturn']))
                        task_paths = []
                        task_rewards = []
                        for j in range(task_size):
                            task_paths.append([])
                            task_rewards.append([])
                        for path in paths:
                            taskid = path['env_infos']['state_index'][-1]
                            task_paths[taskid].append(path)
                            task_rewards[taskid].append(np.sum(
                                path['rewards']))
                        pre_opt_parameter = np.copy(
                            split_policy.get_param_values())
                        # optimize the shared part
                        # split_algo.sampler.process_samples(0, paths)
                        samples_data = split_algo.sampler.process_samples(
                            0, paths)
                        for layer in split_policy._mean_network._layers:
                            for param in layer.get_params():
                                if 'split' in param.name:
                                    layer.params[param].remove('trainable')
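                        # Clear the cached parameter lists so that the reduced trainable set is
                        # picked up when the optimizer is re-initialized below.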
                        split_policy._cached_params = {}
                        split_policy._cached_param_dtypes = {}
                        split_policy._cached_param_shapes = {}
                        split_algo.init_opt()
                        print(
                            'Optimizing shared parameter size: ',
                            len(split_policy.get_param_values(trainable=True)))
                        split_algo.optimize_policy(0, samples_data)

                        # optimize the tasks
                        for layer in split_policy._mean_network._layers:
                            for param in layer.get_params():
                                if 'split' in param.name:
                                    layer.params[param].add('trainable')
                                if 'share' in param.name:
                                    layer.params[param].remove('trainable')

                        # shuffle the optimization order
                        opt_order = np.arange(task_size)
                        np.random.shuffle(opt_order)
                        split_policy._cached_params = {}
                        split_policy._cached_param_dtypes = {}
                        split_policy._cached_param_shapes = {}
                        split_algo.init_opt()
                        for taskid in opt_order:
                            # split_algo.sampler.process_samples(0, task_paths[taskid])
                            samples_data = split_algo.sampler.process_samples(
                                0, task_paths[taskid])
                            print(
                                'Optimizing parameter size: ',
                                len(
                                    split_policy.get_param_values(
                                        trainable=True)))
                            split_algo.optimize_policy(0, samples_data)
                        for layer in split_policy._mean_network._layers:
                            for param in layer.get_params():
                                if 'share' in param.name:
                                    layer.params[param].add('trainable')

                        for j in range(task_size):
                            task_rewards[j] = np.mean(task_rewards[j])
                        print('reward for different tasks: ', task_rewards,
                              reward)

                    learning_curve.append(reward)
                    if (i + initialize_epochs +
                            grad_epochs) % param_update_frequency == 0 and (
                                i + initialize_epochs +
                                grad_epochs) < param_update_end and (
                                    i + initialize_epochs +
                                    grad_epochs) > param_update_start:
                        print("Updating model parameters...")
                        parallel_sampler.update_env_params(
                            {'task_expand_flag': True})
                    print('============= Finished ', split_percentage, ' Rep ',
                          rep, '   test ', i, ' ================')
                    print(diretory)
                    joblib.dump(split_policy,
                                diretory + '/policies/policy_' + str(rep) +
                                '_' + str(i) + '_' + str(split_percentage) +
                                '.pkl',
                                compress=True)
                avg_learning_curve.append(learning_curve)
                kl_divergences[split_id].append(kl_div_curve)
                joblib.dump(split_policy,
                            diretory + '/policies/final_policy_' +
                            str(split_percentage) + '.pkl',
                            compress=True)

                avg_error += float(reward)
            pred_list.append(avg_error / reps)
            print(split_percentage, avg_error / reps)
            split_algo.shutdown_worker()
            print(avg_learning_curve)
            avg_learning_curve = np.mean(avg_learning_curve, axis=0)
            learning_curves[split_id].append(avg_learning_curve)
            # output the learning curves so far
            joblib.dump(learning_curves,
                        diretory + '/learning_curve.pkl',
                        compress=True)
            avg_learning_curve = []
            for lc in range(len(learning_curves)):
                avg_learning_curve.append(np.mean(learning_curves[lc], axis=0))
            plt.figure()
            for lc in range(len(learning_curves)):
                plt.plot(avg_learning_curve[lc],
                         label=str(split_percentages[lc]))
            plt.legend(bbox_to_anchor=(0.3, 0.3),
                       bbox_transform=plt.gcf().transFigure,
                       numpoints=1)
            plt.savefig(diretory + '/split_learning_curves.png')

            if len(kl_divergences[0]) > 0:
                #print('kldiv:', kl_divergences)
                avg_kl_div = []
                for i in range(len(kl_divergences)):
                    if len(kl_divergences[i]) > 0:
                        avg_kl_div.append(np.mean(kl_divergences[i], axis=0))
                #print(avg_kl_div)
                joblib.dump(avg_kl_div,
                            diretory + '/kl_divs.pkl',
                            compress=True)
                for i in range(len(avg_kl_div)):
                    one_perc_kl_div = np.array(avg_kl_div[i])
                    #print(i, one_perc_kl_div)
                    plt.figure()
                    for j in range(len(one_perc_kl_div[0])):
                        append = 'task%d' % j
                        if j == 0:
                            append = 'all'
                        plt.plot(one_perc_kl_div[:, j],
                                 label=str(split_percentages[i]) + append,
                                 alpha=0.3)
                    plt.legend(bbox_to_anchor=(0.3, 0.3),
                               bbox_transform=plt.gcf().transFigure,
                               numpoints=1)
                    plt.savefig(diretory +
                                '/kl_div_%s.png' % str(split_percentages[i]))
        performances.append(pred_list)

    np.savetxt(diretory + '/performance.txt', performances)
    plt.figure()
    plt.plot(split_percentages, np.mean(performances, axis=0))
    plt.savefig(diretory + '/split_performance.png')
    joblib.dump(learning_curves,
                diretory + '/learning_curve.pkl',
                compress=True)

    avg_learning_curve = []
    for i in range(len(learning_curves)):
        avg_learning_curve.append(np.mean(learning_curves[i], axis=0))
    plt.figure()
    for i in range(len(split_percentages)):
        plt.plot(avg_learning_curve[i], label=str(split_percentages[i]))
    plt.legend(bbox_to_anchor=(0.3, 0.3),
               bbox_transform=plt.gcf().transFigure,
               numpoints=1)
    plt.savefig(diretory + '/split_learning_curves.png')
    #np.savetxt(diretory + '/learning_curves.txt', avg_learning_curve)

    if len(kl_divergences[0]) > 0:
        avg_kl_div = []
        for i in range(len(kl_divergences)):
            avg_kl_div.append(np.mean(kl_divergences[i], axis=0))
        joblib.dump(avg_kl_div, diretory + '/kl_divs.pkl', compress=True)
        for i in range(len(avg_kl_div)):
            one_perc_kl_div = np.array(avg_kl_div[i])
            plt.figure()
            for j in range(len(one_perc_kl_div[0])):
                append = 'task%d' % j
                if j == 0:
                    append = 'all'
                plt.plot(one_perc_kl_div[:, j],
                         label=str(split_percentages[i]) + append,
                         alpha=0.3)
            plt.legend(bbox_to_anchor=(0.3, 0.3),
                       bbox_transform=plt.gcf().transFigure,
                       numpoints=1)
            plt.savefig(diretory +
                        '/kl_div_%s.png' % str(split_percentages[i]))

    plt.close('all')

    print(diretory)
Example #8
        #task_num=task_size,
    )
    algo.init_opt()

    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=num_parallel)
    parallel_sampler.set_seed(0)

    algo.start_worker()

    for i in range(initialize_epochs):
        print('------ Iter ', i, ' in Init Training ', diretory, '--------')
        paths = algo.sampler.obtain_samples(0)
        samples_data = algo.sampler.process_samples(0, paths)
        opt_data = algo.optimize_policy(0, samples_data)
        pol_aft = (policy.get_param_values())
        print(algo.mean_kl(samples_data))
        print(dict(logger._tabular)['AverageReturn'])

    data_perc_list = [0.999, 0.7, 0.5, 0.3, 0.1, 0.05, 0.01]

    testpaths = algo.sampler.obtain_samples(0)
    for perc in data_perc_list:
        sampnum = int(batch_size * perc)
        grads = []
        for i in range(var_test_time):
            idx = np.random.choice(len(testpaths), len(testpaths))
            algo.sampler.process_samples(0, testpaths)
            selected_paths = []
            current_sample_num = 0
            for id in idx:
Example #9
		extra_dims=1
	)
	d_rewards_var = TT.vector('d_rewards')
	importance_weights_var = TT.vector('importance_weight')

	# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
	# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
	dist_info_vars = policy.dist_info_sym(observations_var)
	snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

	surr = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)

	params = policy.get_params(trainable=True)
	snap_params = snap_policy.get_params(trainable=True)
	# save initial parameters
	policy_parameters = policy.get_param_values(trainable=True)

	importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(actions_var, dist_info_vars, snap_dist_info_vars)
	grad = theano.grad(surr, params)

	eval_grad1 = TT.matrix('eval_grad0',dtype=grad[0].dtype)
	eval_grad2 = TT.vector('eval_grad1',dtype=grad[1].dtype)
	eval_grad3 = TT.matrix('eval_grad3',dtype=grad[2].dtype)
	eval_grad4 = TT.vector('eval_grad4',dtype=grad[3].dtype)
	eval_grad5 = TT.matrix('eval_grad5',dtype=grad[4].dtype)
	eval_grad6 = TT.vector('eval_grad6',dtype=grad[5].dtype)

	surr_on1 = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)
	#surr_on2 = TT.sum(- snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var )
	#grad_SVRG =[sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4],
	#									theano.grad(surr_on1, params),
Example #10
        if not load_init_policy:
            for i in range(initialize_epochs):
                paths = algo.sampler.obtain_samples(0)
                # if not split
                samples_data = algo.sampler.process_samples(0, paths)
                opt_data = algo.optimize_policy(0, samples_data)
                print(dict(logger._tabular)['AverageReturn'])
            joblib.dump(policy,
                        'data/trained/gradient_temp/rl_split_' + append +
                        '/init_policy.pkl',
                        compress=True)

        print('------- initial training complete ---------------')

        init_param_value = np.copy(policy.get_param_values())

        task_grads = []
        for i in range(2):
            task_grads.append([])

        if not load_split_data:
            split_data = []
            net_weights = []
            for i in range(grad_epochs):
                cur_param_val = np.copy(policy.get_param_values())
                cur_param = copy.deepcopy(policy.get_params())

                cp = []
                for param in policy._mean_network.get_params():
                    cp.append(np.copy(param.get_value()))
Example #11
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 10 and 5 hidden units.
        hidden_sizes=(10, 5),
        # append_dim=2,
        net_mode=0,
    )

    policy = joblib.load(
        'data/local/experiment/hopper_footstrength_rest1_sd4_boundedrandwalk_2000finish/policy_1500.pkl'
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    init_param = policy.get_param_values()

    ###### get baseline gradient ###################################
    '''algobase = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2000000,
        max_path_length=env.horizon,
        n_itr=5,

        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )
Example #12
alla = []
alla2 = []
alla3 = []
for k in range(10):
    alla4 = []
    if (load_policy):
        snap_policy.set_param_values(np.loadtxt('policy_novar.txt'),
                                     trainable=True)
        policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)
    avg_return = np.zeros(s_tot)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    j = 0
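    # j counts the trajectories sampled so far against the total budget s_tot;
    # each snapshot step below samples N trajectories with the snapshot policy.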
    while j < s_tot - N:
        paths = parallel_sampler.sample_paths_on_trajectories(
            snap_policy.get_param_values(), N, T, show_bar=False)
        #baseline.fit(paths)
        j += N
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp
        s_g = f_train(observations[0], actions[0], d_rewards[0])
Example #13
def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):

    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes come from the policy type chosen above: empty for 'linear', [16] for 'simple-nn'.
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Add the max_episodes constraint; if it is -1, the number of episodes is unbounded.
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
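    # batch_size = horizon * num_episodes together with whole_paths=True aims at collecting
    # roughly num_episodes full-length episodes per TRPO iteration.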
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()

    print('----- ENDING ------')
    print(policy.get_param_values())
Example #14
    d_rewards_var = TT.vector('d_rewards')
    importance_weights_var = TT.vector('importance_weight')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)
    snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

    surr = TT.sum(
        -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
        d_rewards_var)

    params = policy.get_params(trainable=True)
    snap_params = snap_policy.get_params(trainable=True)
    # save initial parameters
    policy_parameters = policy.get_param_values(trainable=True)

    importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(
        actions_var, dist_info_vars, snap_dist_info_vars)
    grad = theano.grad(surr, params)

    eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype)
    eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype)
    eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
    eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)

    surr_on1 = TT.sum(
        -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
        d_rewards_var)
    #surr_on2 = TT.sum(- snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var )
    #grad_SVRG =[sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4],
Example #15
                        paths += algo.sampler.obtain_samples(0, t)
                        reward_paths += algo.sampler.obtain_samples(0)
                elif imbalance_sample:
                    paths = []
                    reward_paths = []
                    for t in range(task_size):
                        algo.batch_size = batch_size * sample_ratio[t]
                        task_path = algo.sampler.obtain_samples(0, t)
                        paths += task_path
                        if t == 0:
                            reward_paths += task_path
                else:
                    paths = algo.sampler.obtain_samples(0)
                samples_data = algo.sampler.process_samples(0, paths)
                opt_data = algo.optimize_policy(0, samples_data)
                pol_aft = (policy.get_param_values())
                print(algo.mean_kl(samples_data))

                print(dict(logger._tabular)['AverageReturn'])
            joblib.dump(policy, diretory + '/init_policy.pkl', compress=True)

        print('------- initial training complete ---------------')

        init_param_value = np.copy(policy.get_param_values())

        task_grads = []
        for i in range(task_size):
            task_grads.append([])

        if not load_split_data:
            split_data = []
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=75000,
        max_path_length=env.horizon,
        n_itr=5,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )

    algo.init_opt()
    if not load_path_from_file:
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())

        from rllab.sampler import parallel_sampler

        parallel_sampler.initialize(n_parallel=7)

        env.wrapped_env.env.env.perturb_MP = False
        algo.start_worker()

        pol_weights = []
        all_paths = []
        policy_params = []
        for i in range(50):
            init_param = policy.get_param_values()
            init_param_obj = copy.deepcopy(policy.get_params())
Example #17
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    d_rewards_var = TT.vector('d_rewards')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions.
    # For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # negate the objective for minimization problem
    surr = TT.sum(
        -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
        d_rewards_var)
    # get the list of trainable parameters
    params = policy.get_params(trainable=True)
    # save initial parameters
    policy_parameters = policy.get_param_values(trainable=True)
    grad = theano.grad(surr, params)

    eval_grad1 = TT.matrix(
        'eval_grad0', dtype=grad[0].dtype
    )  # (4, 8) hiddenlayer.w = LI.GlorotUniform() aka Xavier Uniform Init
    eval_grad2 = TT.vector(
        'eval_grad1',
        dtype=grad[1].dtype)  # (8, )  hiddenlayer.b = LI.Constant(0.),
    eval_grad3 = TT.col(
        'eval_grad3',
        dtype=grad[2].dtype)  # (8, 1) output.w = LI.GlorotUniform(),
    eval_grad4 = TT.vector(
        'eval_grad4',
        dtype=grad[3].dtype)  # (1, )  output.b = LI.Constant(0.),
for k in range(10):
    if (load_policy):
        snap_policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)
    avg_return = list()
    n_sub_iter=[]
    rewards_sub_iter=[]
    rewards_snapshot=[]
    importance_weights=[]
    variance_svrg = []
    variance_sgd = []

    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    j=0
    while j<s_tot-N:
        paths = parallel_sampler.sample_paths_on_trajectories(snap_policy.get_param_values(),N,T,show_bar=False)
        #baseline.fit(paths)
        paths = paths[:N]
        j+=N
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z=list()
            t=1
            for y in x:
                z.append(y*t)
                t*=discount
            temp.append(np.array(z))
        d_rewards=temp
for k in range(10):

    print("Run #{}".format(k))

    # load policy
    if learn_std:
        file_name = 'roboschool_inv_pendulum_policy' + '.txt'
    else:
        file_name = 'roboschool_inv_pendulum_policy_novar' + '.txt'

    if load_policy:
        policy.set_param_values(np.loadtxt('save_model/' + file_name),
                                trainable=True)
    else:
        np.savetxt("save_model/" + file_name,
                   policy.get_param_values(trainable=True))
        load_policy = True

    # initial setup
    avg_return = list()
    eps_list = []
    max_rewards = -np.inf
    num_traj = 0

    # loop till done
    while num_traj <= max_num_traj:
        # sample snapshot batch of trajectories
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), snap_bs, traj_length, show_bar=False)
        paths = paths[:snap_bs]
        policy=policy,
        baseline=baseline,
        batch_size=150000,
        max_path_length=env.horizon,
        n_itr=5,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )

    algo.init_opt()

    one_iter_grad = []
    mps = []
    if not load_path_from_file:
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())

        from rllab.sampler import parallel_sampler

        parallel_sampler.initialize(n_parallel=7)

        env.wrapped_env.env.env.perturb_MP = False
        pol_weights = []
        all_paths = []
        policy_params = []
        init_param = np.copy(policy.get_param_values())
        algo.start_worker()
        for i in range(100):
            policy.set_param_values(init_param)
            #####   get data ###################
variance_sgd_data = {}
importance_weights_data = {}
rewards_snapshot_data = {}
rewards_subiter_data = {}
n_sub_iter_data = {}
diff_lr_data = {}
alfa_t_data = {}
parallel_sampler.initialize(10)
for k in range(10):
    if (load_policy):
        snap_policy.set_param_values(np.loadtxt('policy_swimmer.txt'),
                                     trainable=True)
        policy.set_param_values(np.loadtxt('policy_swimmer.txt'),
                                trainable=True)
    else:
        policy.set_param_values(snap_policy.get_param_values(trainable=True),
                                trainable=True)
    avg_return = []
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []
    diff_lr = []
    alfa_t = []
    j = 0
    while j < s_tot - N:
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)
Example #22
all_policy_param_data = {}
ar_data = {}
parallel_sampler.initialize(4)
for k in range(5):
    if (load_policy):
        #        policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('pcb' + np.str(k + 1) + '.txt'),
                                trainable=True)
    avg_return = np.zeros(n_itr)
    rewards = []
    all_policy_param = []
    all_rew = []
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        if (j % 100 == 0):
            all_policy_param.append(policy.get_param_values())
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)
        paths = paths[:N]
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        rewards.append(np.array([sum(p["rewards"]) for p in paths]))
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
Example #23
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params,
                learning_rate=learning_rate))

alla = []
for i in range(10):
    if (load_policy):
        policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
    avg_return = np.zeros(n_itr)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)
        #baseline.fit(paths)
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp
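        # f_train presumably evaluates the per-parameter gradient estimate on a single
        # trajectory; the loop below accumulates it over the remaining trajectories.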
        s_g = f_train(observations[0], actions[0], d_rewards[0])
        for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
importance_weights_data = {}
rewards_snapshot_data = {}
rewards_subiter_data = {}
n_sub_iter_data = {}
all_policy_param_data = {}
parallel_sampler.initialize(4)
for k in range(10):
    if (load_policy):
        #        snap_policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        #        policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        snap_policy.set_param_values(np.loadtxt('pc' + np.str(k + 1) + '.txt'),
                                     trainable=True)
        policy.set_param_values(np.loadtxt('pc' + np.str(k + 1) + '.txt'),
                                trainable=True)
    else:
        policy.set_param_values(snap_policy.get_param_values(trainable=True),
                                trainable=True)
    avg_return = np.zeros(s_tot)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []
    all_policy_param = []
    j = 0
    while j < s_tot - N:
        all_policy_param.append(policy.get_param_values())
        paths = parallel_sampler.sample_paths_on_trajectories(
            policy.get_param_values(), N, T, show_bar=False)