Esempi in Python per TRPO.TRPO, esempi in Python per rllab.algos.trpo.TRPO.TRPO

Esempio n. 1

0

Mostra file

File: baselines_wheeled.py Progetto: yuanying-cc/Sectar

def run_trpo(vv):
    setup_rllab_logging(vv)
    path_len = vv['path_len']
    env = get_env(vv)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(300, 200, 100),
        init_std=20.0,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * path_len,
        max_path_length=path_len,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

Esempio n. 2

0

Mostra file

File: arm3d_key_vanilla_trpo.py Progetto: shenghuanjie/dcl

def run_task(*_):
    env = normalize(Arm3dKeyEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=500,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        plot=True,
    )
    algo.train()

Esempio n. 3

0

Mostra file

def run_task(*_):
    # env = normalize(SwimmerWrapperGym('Swimmer-v1'))
    env = normalize(GymEnv('Swimmer-v1'))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        learn_std=True)

    print('horizon {}'.format(env.horizon))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

Esempio n. 4

0

Mostra file

def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )

Esempio n. 5

0

Mostra file

def main(args):
    logger.set_snapshot_dir(args.snapshot_dir)
    logger.set_snapshot_mode("none")
    logger.add_tabular_output(os.path.join(args.snapshot_dir, "tabular.csv"))
    env = GymEnv(args.env_id)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)

        # policy = CategoricalMLPPolicy(
        #     env_spec=env.spec,
        #     hidden_sizes=(16, 16),
        #     hidden_nonlinearity=lasagne.nonlinearities.rectify)

        # policy = CategoricalGRUPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=env.horizon,
        n_itr=args.n_itr,
        discount=args.discount,
        step_size=args.step_size,
        gae_lambda=args.gae_lambda,
    )
    algo.train()
    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)

Esempio n. 6

0

Mostra file

def switch_from_slide(exp_name="Switch_Slide", num=1, directory="./Results/Car/SlideTurn/", save=True):
    rccar = RCCarSlideTurn(noise=0.1)
    env =  RLPyEnv(rccar)
    now = datetime.datetime.now()
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    import joblib
    data = joblib.load("Results/Car/Slide/Base/params.pkl") # LOAD POLICY
    policy = data['policy']
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.9,
        step_size=0.01,
        # plot=True,
    )
    # algo.train()
    # rollout(env, policy)

    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        exp_name=exp_name + timestamp,
        log_dir=os.path.join(directory, exp_name) if save else './Results/Tmp',
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=1,
        # plot=True,
    )

Esempio n. 7

0

Mostra file

File: rl_odin_trpo.py Progetto: jonasnm/rllab

        def run_task(*_):
            env = normalize(GymEnv(models[k]))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            # hidden_sizes=(100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()

Esempio n. 8

0

Mostra file

File: trpo_gym.py Progetto: kpeeters14/rl_gcg

def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0", record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(8, 8)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

Esempio n. 9

0

Mostra file

File: gradient_experiment.py Progetto: parthchadha/metarl

def train_domain(domain):
    rc = domain()
    env = RLPyEnv(rc)
    # env = ControllerEnv(k=10)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(
            16,
            16,
        ),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        plot=False,
    )
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        # script="scripts/run_experiment_lite.py",
        log_dir="models/rc_gradient/" + domain.proxy_class.__name__,
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        # plot=True,
    )

Esempio n. 10

0

Mostra file

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    #baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload

    all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2])

    # can also filter these starts optionally

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # hardest to easiest
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]
                ][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    array_init_pos = np.array(init_pos)
    init_pos = [tuple(pos) for pos in init_pos]
    online_start_generator = Online_TCSL(init_pos)


    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        dist = online_start_generator.get_distribution() # added
        logger.log(np.array_str(online_start_generator.get_q()))
        # how to log Q values?
        # with logger.tabular_prefix("General: "):
        #     logger.record_tabular("Q values:", online_start_generator.get_q())
        logger.log(np.array_str(dist))

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            #TODO: might be faster to sample if we just create a roughly representative UniformListStateGenerator?
            env.update_start_generator(
                ListStateGenerator(
                    init_pos, dist
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()



        logger.log("Labeling the starts")
        [starts, labels, mean_rewards, updated] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached',  # using the min n_traj
                                                   as_goal=False, env=env, return_mean_rewards=True, order_of_states=init_pos)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        online_start_generator.update_q(np.array(mean_rewards), np.array(updated)) # added
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("We have {} good starts!".format(len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("More bad starts than good starts, sampling seeds from replay buffer")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000,
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        else:
            raise Exception

        all_starts.append(filtered_raw_starts)

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()

Esempio n. 11

0

Mostra file

nonED = True

if(nonED):
	# logger.add_tabular_output('./NonED.log')
	env = NonEDFirestorm_SingleAgent_Env()
	discount = env.discount
	env = normalize(env)
	policy = CategoricalMLPPolicy(
	    env_spec=env.spec,
	)

	baseline = LinearFeatureBaseline(env_spec=env.spec)
	algo = TRPO(
	    env=env,
	    policy=policy,
	    baseline=baseline,
	    discount=discount,
	    n_itr=75
	)
else:
	logger.add_tabular_output('./ED.log')
	env = normalize(EDFirestorm_SingleAgent_Env())
	policy = CategoricalMLPPolicy(
	    env_spec=env.spec,
	)

	baseline = LinearFeatureBaseline(env_spec=env.spec)
	algo = TRPO(
	    env=env,
	    policy=policy,
	    baseline=baseline,

Esempio n. 12

0

Mostra file

File: trpo_inverted_pendulum.py Progetto: StanfordVL/ARPL

        random=RANDOM,
        observable_noise=False,
        zero_gradient_cutoff=
        org_env_size,  # zero out gradients except for config params
        use_max_norm=MAX_NORM,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=env.horizon,
        n_itr=NUM_ITERS,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
        sampler_args={'n_workers': 2},
        plot_learning_curve=GENERATE_PLOTS,
        trial=trial,
    )
    avg_rewards, std_rewards = algo.train()

    print('trial {}'.format(trial))
    saveModel(
        algo.policy,
        'policy_{}_{}_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, TRAIN_ADVERSARIAL,
                                                NUM_ITERS, PROBABILITY, EPS,
                                                MAX_NORM, USE_DYNAMICS, trial))

Esempio n. 13

0

Mostra file

File: train_trpo_curriculum_multiproc.py Progetto: StanfordVL/ARPL

def train(env_ind, config_num, num_agents):

    # get the original state space size first
    org_env = GymEnv(original_environments[env_ind])
    org_env_size = org_env.observation_space.shape[0]
    org_env.terminate()

    # the environment
    env = GymEnv(dynamic_environments[env_ind])

    # the configuration settings
    curriculum_config = curriculum_configs[config_num]

    if args.env_ind == 0:
        # batch size for Inverted Pendulum
        curriculum_config.set_batch_size(5000)
    else:
        # batch size for all other environments
        curriculum_config.set_batch_size(25000)

    # the nominal config
    config = curriculum_config.curriculum_list[0]

    for agent_num in range(num_agents):

        # define policy by reading from config class
        policy = CurriculumPolicy(
            env_spec=env.spec,
            hidden_sizes=config.hidden_sizes,
            adaptive_std=config.adaptive_std,
            adversarial=config.adversarial,
            eps=config.eps,
            probability=config.probability,
            use_dynamics=config.use_dynamics,
            random=config.random,
            observable_noise=config.observable_noise,
            zero_gradient_cutoff=org_env_size,
            use_max_norm=config.use_max_norm,
            curriculum_list=list(curriculum_config.curriculum_list),
            update_freq=curriculum_config.update_freq,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=config.batch_size,
            max_path_length=env.horizon,
            n_itr=config.num_iter,
            discount=config.discount,
            step_size=config.step_size,
            gae_lambda=config.gae_lambda,
            num_workers=config.num_workers,
            plot_learning_curve=config.plot_learning_curve,
            trial=agent_num,
        )
        avg_rewards, std_rewards = algo.train()

        print("training completed!")
        saveModel(
            algo.policy, 'policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], config_num, agent_num))

        # save rewards per model over the iterations
        if config.plot_learning_curve:
            saveModel([range(config.num_iter), avg_rewards, std_rewards],
                      'rewards_{}_config_{}_agent_{}'.format(
                          dynamic_environments[env_ind], config_num,
                          agent_num))

Esempio n. 14

0

Mostra file

env = (GymEnv("ClothEnv-v0"))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(8, 8))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()
# run_experiment_lite(
#     algo.train(),
#     # Number of parallel workers for sampling
#     n_parallel=1,
#     # Only keep the snapshot parameters for the last iteration
#     snapshot_mode="last",
#     # Specifies the seed for the experiment. If this is not provided, a random seed
#     # will be used
#     seed=1,

Esempio n. 15

0

Mostra file

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['constant_baseline']:
        logger.log("Using constant baseline")
        baseline = ConstantBaseline(env_spec=env.spec, value=1.0)
    else:
        logger.log("Using linear baseline")
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
    test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter, report=report, center=v['goal_center'],
                         limit=v['goal_range'])  # use goal for plot
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])
        labels = label_states(starts, env, policy, v['horizon'],
                              as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                       as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the starts")
        #labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        start_classes, text_labels = convert_label(labels)

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a tone of noise if all the states I had ended up being high_reward!
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                seed_starts = generate_starts(env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'],
                                                  variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts'])

Esempio n. 16

0

Mostra file

File: arm3d_disc_selfplay_algo.py Progetto: shenghuanjie/dcl

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))
    report.save()

    inner_env = normalize(Arm3dDiscEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'disc_all_feasible_states_min.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    # brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])
    # with env.set_kill_outside():
    #     seed_starts = generate_starts(env, starts=[v['start_goal']], horizon=10,  # this is smaller as they are seeds!
    #                                   variance=v['brownian_variance'], subsample=v['num_new_starts'])  # , animated=True, speedup=1)
    #
    # with env.set_kill_outside():
    #     find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    env_alice = AliceEnv(env, env, policy, v['horizon'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # with env.set_kill_outside():
        #     starts = generate_starts(env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        # regularization of the brownian starts
        # brownian_starts.empty()
        # brownian_starts.append(starts)
        # starts = brownian_starts.sample(size=v['num_new_starts'])

        starts = generate_starts_alice(env_bob=env,
                                       env_alice=env_alice,
                                       policy_bob=policy,
                                       policy_alice=policy_alice,
                                       algo_alice=algo_alice,
                                       start_states=[v['start_goal']],
                                       num_new_starts=v['num_new_starts'],
                                       alice_factor=v['alice_factor'],
                                       log_dir=log_dir)

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.record_tabular('starts', starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict(
        )  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(1000)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        logger.log("Appending good goals to replay and generating seeds")
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)

Esempio n. 17

0

Mostra file

File: maze_ant_gan_algo.py Progetto: shenghuanjie/dcl

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntMazeEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform()

    # log first samples form the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

    plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter,
                        limit=v['goal_range'], center=v['goal_center'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling goals from the GAN")
        raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        # if needed label the goals before any update
        if v['label_with_variation']:
            old_labels, old_rewards = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                                   key='goal_reached', full_path=False, return_rew=True)

        # itr_label = outer_iter  # use outer_iter to log everything or "last" to log only the last
        # with ExperimentLogger(log_dir, itr_label, snapshot_mode='last', hold_outter_log=True):
        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                       as_goal=True, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        elif v['label_with_variation']:
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                         key='goal_reached', old_rewards=old_rewards, full_path=True)
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

        #logger.log("Labeling the goals")
        #labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        if v['label_with_variation']:  # this will use only the performance variation for labeling
            labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1))
        else:
            labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        gan.train(
            goals, labels,
            v['gan_outer_iters'],
        )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                    horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)

Esempio n. 18

0

Mostra file

def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    V_ENTER = 30
    INNER_LENGTH = 300
    LONG_LENGTH = 100
    SHORT_LENGTH = 300
    N_ROWS = 3
    N_COLUMNS = 3
    NUM_CARS_LEFT = 1
    NUM_CARS_RIGHT = 1
    NUM_CARS_TOP = 1
    NUM_CARS_BOT = 1
    tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \
        + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS

    grid_array = {
        "short_length": SHORT_LENGTH,
        "inner_length": INNER_LENGTH,
        "long_length": LONG_LENGTH,
        "row_num": N_ROWS,
        "col_num": N_COLUMNS,
        "cars_left": NUM_CARS_LEFT,
        "cars_right": NUM_CARS_RIGHT,
        "cars_top": NUM_CARS_TOP,
        "cars_bot": NUM_CARS_BOT
    }

    sim_params = SumoParams(sim_step=1, render=True)

    vehicles = VehicleParams()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SimCarFollowingController, {}),
                 car_following_params=SumoCarFollowingParams(
                     min_gap=2.5,
                     tau=1.1,
                     max_speed=V_ENTER,
                     speed_mode="all_checks"),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars)

    tl_logic = TrafficLightParams(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

    if USE_INFLOWS:
        initial_config, net_params = get_flow_params(
            v_enter=V_ENTER,
            vehs_per_hour=EDGE_INFLOW,
            col_num=N_COLUMNS,
            row_num=N_ROWS,
            add_net_params=additional_net_params)
    else:
        initial_config, net_params = get_non_flow_params(
            V_ENTER, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()

Esempio n. 19

0

Mostra file

File: transfer.py Progetto: x00353666/HAAR-A-Hierarchical-RL-Algorithm

 algo = TRPO(
     env=env,
     policy=policy,
     baseline=baseline,
     baselinename=par.baseline_name,
     self_normalize=True,
     log_deterministic=True,
     batch_size=batch_size,
     whole_paths=True,
     max_path_length=max_path_length,
     n_itr=par.n_itr,
     discount=0.99,  # not used
     discount_low=0.99,
     discount_high=par.
     discount_high,  # 0.99 prev, Rui wants to change it to 0.8 (for time_step_agg == 100)
     train_high_every=par.
     train_high_every,  # 1 prev, Rui wants to change it to 10 (for time_step_agg == 100)
     step_size=0.01,
     train_low=par.train_low,
     train_high=par.train_high,
     train_low_with_penalty=par.train_low_with_penalty,
     train_low_with_v_split=par.train_low_with_v_split,
     train_low_with_v_gradient=par.train_low_with_v_gradient,
     train_low_with_external=par.train_low_with_external,
     time_step_agg_anneal=par.time_step_agg_anneal,
     anneal_base_number=par.anneal_base_number,
     total_low_step=par.low_step_num,
     episode_max_low_step=par.max_low_step,
     low_level_entropy_penalty=par.low_level_entropy_penalty,
     itr_delay=par.itr_delay,
     transfer=par.transfer,
     transfer_high=par.transfer_high,
     warm_path=par.warm_path,
 )

Esempio n. 20

0

Mostra file

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(env_spec=env.spec, )

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)

Esempio n. 21

0

Mostra file

def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):

    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()

    print('----- ENDING ------')
    print(policy.get_param_values())

Esempio n. 22

0

Mostra file

cart_env = normalize(CartpoleEnv())
env = GaussianNoiseEnv(cart_env, sigma=0.2)

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=40,
            discount=0.99,
            step_size=0.01,
            plot=True)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    plot=True,

Esempio n. 23

0

Mostra file

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from custom_env.reacher_mod import ReacherEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(ReacherEnv())

policy = GaussianMLPPolicy(env_spec=env.spec, )

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()

observation = env.reset()
for _ in range(5000):
    env.render()
    action, _ = policy.get_action(observation)
    observation, reward, terminal, _ = env.step(action)
    if terminal:
        break

Esempio n. 24

0

Mostra file

File: trpois_cartpole.py Progetto: x00353666/HAAR-A-Hierarchical-RL-Algorithm

baseline = LinearFeatureBaseline(env_spec=env.spec)
# baseline = GaussianMLPBaseline(env_spec=env.spec)

optimizer_args = dict(
    # debug_nan=True,
    # reg_coeff=0.1,
    # cg_iters=2
)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=200,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
    optimizer_args=optimizer_args
)
# algo.train()
run_experiment_lite(
                stub_method_call=algo.train(),
                mode="local",
                use_cloudpickle=False,
                pre_commands=['pip install --upgrade pip',
                              'pip install --upgrade theano',
                              ],
                # Number of parallel workers for sampling

Esempio n. 25

0

Mostra file

def run_task(*_):
    sumo_params = SumoParams(sim_step=0.2, sumo_binary="sumo-gui")

    # note that the vehicles are added sequentially by the generator,
    # so place the merging vehicles after the vehicles in the ring
    vehicles = Vehicles()
    # Inner ring vehicles
    vehicles.add(veh_id="human",
                 acceleration_controller=(IDMController, {
                     "noise": 0.2
                 }),
                 lane_change_controller=(SumoLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=6,
                 sumo_car_following_params=SumoCarFollowingParams(minGap=0.0,
                                                                  tau=0.5),
                 sumo_lc_params=SumoLaneChangeParams())

    # A single learning agent in the inner ring
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 lane_change_controller=(SumoLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 speed_mode="no_collide",
                 num_vehicles=1,
                 sumo_car_following_params=SumoCarFollowingParams(minGap=0.01,
                                                                  tau=0.5),
                 sumo_lc_params=SumoLaneChangeParams())

    # Outer ring vehicles
    vehicles.add(veh_id="merge-human",
                 acceleration_controller=(IDMController, {
                     "noise": 0.2
                 }),
                 lane_change_controller=(SumoLaneChangeController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=10,
                 sumo_car_following_params=SumoCarFollowingParams(minGap=0.0,
                                                                  tau=0.5),
                 sumo_lc_params=SumoLaneChangeParams())

    env_params = EnvParams(horizon=HORIZON,
                           additional_params={
                               "max_accel": 3,
                               "max_decel": 3,
                               "target_velocity": 10,
                               "n_preceding": 2,
                               "n_following": 2,
                               "n_merging_in": 2,
                           })

    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
    additional_net_params["ring_radius"] = 50
    additional_net_params["inner_lanes"] = 1
    additional_net_params["outer_lanes"] = 1
    additional_net_params["lane_length"] = 75
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    initial_config = InitialConfig(x0=50,
                                   spacing="uniform",
                                   additional_params={"merge_bunching": 0})

    scenario = TwoLoopsOneMergingScenario(
        name=exp_tag,
        generator_class=TwoLoopOneMergingGenerator,
        vehicles=vehicles,
        net_params=net_params,
        initial_config=initial_config)

    env_name = "TwoLoopsMergePOEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=64 * 3 * horizon,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=1000,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()

Esempio n. 26

0

Mostra file

        hidden_sizes=layer_size,
        is_protagonist=False
    )
    adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Initializing the parallel sampler ##
    parallel_sampler.initialize(n_process)

    ## Optimizer for the Protagonist ##
    pro_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,
        max_path_length=path_length,
        n_itr=n_pro_itr,
        discount=0.995,
        gae_lambda=gae_lambda,
        step_size=step_size,
        is_protagonist=True
    )

    ## Optimizer for the Adversary ##
    adv_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,

Esempio n. 27

0

Mostra file

File: trpo_human.py Progetto: birdyLinch/rllab

                                   "std_hidden_sizes": (32, 32),
                                   "std_nonlinearity": None,
                                   "normalize_inputs": True,
                                   "normalize_outputs": True,
                               })

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    n_itr=3000,
    max_path_lenght=2000,
    experiment_spec=experiment_spec,
    save_policy_every=save_policy_every,
    batch_size=30000,
    discount=0.995,
    gae_lambda=0.98,
    step_size=0.01,

    # baseline
    #discriminator=None,

    # GAN imitation
    discriminator=discriminator,
)

algo.train(),

pickle.dump(policy, open("model/model1.pickle", "wb"))
# except Exception as e:
#     exc_type, exc_value, exc_traceback = sys.exc_info()

Esempio n. 28

0

Mostra file

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']
    unif_samples = 300

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling goals")

        goals = np.array([]).reshape((-1, v['goal_size']))
        k = 0
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % k)
            k += 1
            unif_goals = np.random.uniform(
                np.array(v['goal_center']) - np.array(v['goal_range']),
                np.array(v['goal_center']) + np.array(v['goal_range']),
                size=(unif_samples, v['goal_size']))
            labels = label_states(unif_goals,
                                  env,
                                  policy,
                                  v['horizon'],
                                  n_traj=v['n_traj'],
                                  key='goal_reached')
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            goals = np.concatenate([goals,
                                    unif_goals[init_classes == 2]]).reshape(
                                        (-1, v['goal_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(
                v['num_old_goals'])  #todo: replay noise?
            goals = np.vstack([goals, old_goals])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("labeling starts with trpo rollouts")
        [goals, labels] = label_states_from_paths(
            trpo_paths,
            n_traj=2,
            key='goal_reached',  # using the min n_traj
            as_goal=True,
            env=env)
        paths = [path for paths in trpo_paths for path in paths]
        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        logger.record_tabular('LabelingRollouts',
                              k * v['n_traj'] * unif_samples)
        total_rollouts += k * v['n_traj'] * unif_samples
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]
        all_goals.append(filtered_raw_goals)

Esempio n. 29

0

Mostra file

    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32),
    init_std=10
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=25000,
    max_path_length=50,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
    # imsize=(48,48),
    name='reach',
    mode='oracle',
    exp_prefix="reacher_state",
    # force_batch_sampler=True,
    # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

)
run_experiment_lite(
    algo.train(),
    n_parallel=6,
    seed=1,
)

Esempio n. 30

0

Mostra file

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(SwimmerMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    # all_feasible_starts = pickle.load(open(osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states_min.pkl'), 'rb'))
    # print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    # brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])
    # with env.set_kill_outside():
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=5000,  # size speeds up training a bit
        variance=v['brownian_variance'],
        subsample=v['num_new_starts'])  # , animated=True, speedup=1)
    np.random.shuffle(seed_starts)

    # with env.set_kill_outside():
    feasible_states = find_all_feasible_states_plotting(env,
                                                        seed_starts,
                                                        distance_threshold=1,
                                                        brownian_variance=1,
                                                        animate=True)

    # print("hi")
    # show where these states are:
    # shuffled_starts = np.array(seed_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=seed_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    if 'gae_lambda' not in v:
        v['gae_lambda'] = 1
    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = generate_starts(env,
                                 starts=seed_starts,
                                 subsample=v['num_new_starts'],
                                 size=5000,
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])
        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                gae_lambda=v['gae_lambda'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            algo.train()

        logger.log('Generating the Heatmap...')

        # policy means should not mean too much
        # plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy,
                             env,
                             as_goals=False,
                             max_reward=v['max_reward'],
                             sampling_res=1,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             center=v['goal_center'],
                             limit=v['goal_range'])

        logger.log("Labeling the starts")
        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')

        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if len(
                filtered_raw_starts
        ) > 0:  # add a tone of noise if all the states I had ended up being high_reward!
            seed_starts = filtered_raw_starts
        else:
            seed_starts = generate_starts(env,
                                          starts=starts,
                                          horizon=v['horizon'] * 2,
                                          subsample=v['num_new_starts'],
                                          size=5000,
                                          variance=v['brownian_variance'] * 10)
        all_starts.append(filtered_raw_starts)