Code example #1
def load_policy(load_path, model_name, obs_arg=None):
    import os
    import json
    import baselines.her.experiment.config as config
    from baselines.common import tf_util
    import tensorflow as tf

    with open(os.path.join(load_path, 'logs/params.json'), 'r') as f:
        params = json.load(f)

    clip_return = True

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        params = config.prepare_params(params)
        dims = config.configure_dims(params)
        if obs_arg == "25":
            dims['o'] = 25
        policy = config.configure_ddpg(dims=dims,
                                       params=params,
                                       reuse=False,
                                       clip_return=clip_return)
        if load_path is not None:
            model_path = os.path.join(load_path, 'models')
            model_filename = os.path.join(model_path, model_name)
            tf_util.load_variables(model_filename)
            print("Successfully loaded a policy.")
    return policy
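
A minimal usage sketch for the loader above, not part of the original example: the run directory, the model file name, the environment name, and the call to policy.get_actions() are assumptions (get_actions(o, ag, g) is the action API of the baselines HER DDPG class in the versions this loader targets, but verify against your checkout).

import gym
import numpy as np
from baselines.common import tf_util

with tf_util.single_threaded_session():
    # Hypothetical paths: load_policy() expects <run_dir>/logs/params.json and <run_dir>/models/<model_name>.
    policy = load_policy('/path/to/run_dir', 'policy_latest')
    env = gym.make('FetchReach-v1')  # any goal-conditioned gym env with dict observations
    obs = env.reset()
    action = policy.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal'])
    obs, reward, done, info = env.step(np.clip(action, env.action_space.low, env.action_space.high))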
Code example #2
File: her.py  Project: buoyancy99/sap
def load_policy(seed=None,
                replay_strategy='future',
                load_path=None,
                **kwargs):
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0  # fall back to a single-process rank when MPI is unavailable

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['replay_strategy'] = replay_strategy
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params.update(kwargs)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=True)
    if load_path is not None:
        tf_util.load_variables(load_path)
    return policy
Code example #3
File: utils.py  Project: bentzinir/baselines
def load_policy(env_id, **kwargs):
    params = config.DEFAULT_PARAMS
    _override_params = copy.deepcopy(kwargs)
    params.update(**_override_params)
    params['env_name'] = env_id
    params = config.prepare_params(params)
    dims, coord_dict = config.configure_dims(params)
    params['ddpg_params']['scope'] = "mca"
    policy, reward_fun = config.configure_ddpg(dims=dims,
                                               params=params,
                                               active=True,
                                               clip_return=True)
    return policy, reward_fun
Code example #4
File: play.py  Project: olliekrk/OpenAI_Gym_Panda
def load_policy(network, env_name, load_path):
    # Prepare params.
    params = DEFAULT_PARAMS
    # env_name = env.spec.id
    params["env_name"] = env_name
    if env_name in DEFAULT_ENV_PARAMS:
        params.update(
            DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in

    params = prepare_params(params)

    dims = configure_dims(params)
    policy = configure_ddpg(dims=dims, params=params)

    tf_util.load_variables(load_path)

    return policy
Code example #5
    def __init__(self,
                 num_policies,
                 dims,
                 params,
                 clip_return,
                 num_goals=None,
                 pi_from_gi=None):
        #if explicitly stated, allow for policies to be shared among multiple consecutive subgoals, else one policy per subgoal
        # self.fst_sg_per_policy = np.arange(num_policies) if fst_sg_per_policy is None else fst_sg_per_policy
        self.num_goals = num_policies if num_goals is None else num_goals
        self.subgoals_achieved = 0
        # self.fst_sg_per_policy = [0,2]
        #mapping from g_inds to policy_inds:
        # self.pi_from_gi = np.arange(num_goals) if pi_from_gi is None else pi_from_gi
        self.pi_from_gi = np.zeros(
            num_goals) if pi_from_gi is None else pi_from_gi
        # policy_ind = -1
        # for sg_ind in self.fst_sg_per_policy:
        # 	if g_ind >= sg_ind:
        # 		policy_ind += 1

        self.policies = []
        for i in range(num_policies):
            #for now share params for all policies
            new_params = params.copy()
            #scope must be differently named for each policy to keep distinct
            new_params['ddpg_params']['scope'] = "ddpg" + str(i)
            new_params['policy_index'] = i
            # last_goal = self.fst_sg_per_policy[i+1] if (i+1 < len(self.fst_sg_per_policy)) else num_goals
            # relevant_goals = np.arange(self.fst_sg_per_policy[i],last_goal)
            # self.goals_per_policy.append(relevant_goals)
            # new_params['goal_indexes'] = relevant_goals
            self.policies.append(
                config.configure_ddpg(dims=dims.copy(),
                                      params=new_params.copy(),
                                      clip_return=clip_return))
        self.num_policies = num_policies
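
A hedged sketch of how a wrapper like this might dispatch to one of its sub-policies at rollout time. Only self.pi_from_gi and self.policies come from the snippet above; the method name act(), the get_actions() call, and the argument names are illustrative assumptions. Note that pi_from_gi is built with np.zeros, so its entries are floats and need an int() cast before indexing.

    def act(self, obs, achieved_goal, goal, goal_index):
        # Illustrative only; not part of the original class.
        policy_index = int(self.pi_from_gi[goal_index])  # entries are floats because of np.zeros
        return self.policies[policy_index].get_actions(obs, achieved_goal, goal)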
Code example #6
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    if v['control_mode'] == 'linear':
        from sandbox.envs.maze.point_maze_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
    elif v['control_mode'] == 'pos':
        from sandbox.envs.maze.point_maze_pos_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    feasible_states = sample_unif_feas(inner_env, 10)
    if v['unif_starts']:
        starts = np.random.permutation(np.array(feasible_states))[:300]
    else:
        starts = np.array([[0, 0]])

    uniform_start_generator = UniformListStateGenerator(
        starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )


    # Prepare params.
    def make_env(inner_env=inner_env, terminal_eps=v['terminal_eps'], terminate_env=v['terminate_env']):
        return GoalStartExplorationEnv(
            env=inner_env, goal_generator=uniform_goal_generator,
            obs2goal_transform=lambda x: x[:v['goal_size']],
            start_generator=uniform_start_generator,
            obs2start_transform=lambda x: x[:v['goal_size']],
            terminal_eps=terminal_eps,
            distance_metric=v['distance_metric'],
            extend_dist_rew=v['extend_dist_rew'],
            only_feasible=v['only_feasible'],
            terminate_env=terminate_env,
            goal_weight=v['goal_weight'],
            inner_weight=0,
            append_goal_to_observation=False
        )


    env = make_env()
    test_env = make_env(inner_env=inner_env_test, terminal_eps=1., terminate_env=True)

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v['batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v['n_test_rollouts']  # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']
    params['nearby_action_penalty'] = v['nearby_action_penalty']
    params['nearby_penalty_weight'] = v['nearby_penalty_weight']
    params['nearby_p'] = v['nearby_p']
    params['perturb_scale'] = v['perturb_scale']
    params['cells_apart'] = v['cells_apart']
    params['perturb_to_feasible'] = v['perturb_to_feasible']

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']


    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)
    # prepare GAIL

    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['o'], dims['g'], 0., gail_loss = v['gail_reward'], use_s_p = True, only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['u'], dims['g'], 0., gail_loss = v['gail_reward'], only_s=v['only_s'])

    params['discriminator'] = discriminator
    # configure replay buffer for expert buffer
    params_expert = {k:params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_expert['replay_strategy'] = 'future' if v['relabel_expert'] else 'none'
    params_expert['sample_g_first'] = v['relabel_expert'] and v['sample_g_first']
    params_expert['zero_action_p'] = v['zero_action_p']

    params_policy_buffer = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_policy_buffer['replay_strategy'] = 'future'
    params_policy_buffer['sample_g_first'] = False

    policy = config.configure_ddpg(dims=dims, params=params, clip_return=v['clip_return'], reuse=tf.AUTO_REUSE, env=env)


    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    expert_rollout_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        expert_rollout_params[name] = params[name]

    expert_rollout_params['noise_eps'] = v['expert_noise']
    expert_rollout_params['random_eps'] = v['expert_eps']

    rollout_worker = RolloutWorker([env], policy, dims, logger_b, **rollout_params)

    # prepare expert policy, rollout worker
    import joblib
    if v['expert_policy'] == 'planner':
        from sandbox.experiments.goals.maze.expert.maze_expert import MazeExpert
        expert_policy = MazeExpert(inner_env, step_size=0.2)
    else:
        expert_policy = joblib.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), v['expert_policy']))['policy']

    expert_rollout_worker = RolloutWorker([env], expert_policy, dims, logger_b, **expert_rollout_params)
    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {key: (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], 2)
    buffer_shapes['ag'] = (v['horizon'] + 1, 2)
    buffer_shapes['successes'] = (v['horizon'],)
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)

    normal_sample_transitions = policy.sample_transitions
    empty_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], normal_sample_transitions)

    if not v['query_expert'] or not 'gail' in v['mode']:
        for i in range(v['num_demos']):
            # rollout is generated by expert policy
            episode = expert_rollout_worker.generate_rollouts(reset=not v['no_resets'])
            # and is stored into the expert buffer
            expert_buffer.store_episode(episode)
            if i <= 20:
                path_length = np.argmax(episode['info_goal_reached'][0])
                path_length = v['horizon'] - 1 if path_length == 0 else path_length

                plot_path(episode['o'][0][:path_length], report=report, obs=True, goal=episode['g'][0][0], limit=v['goal_range'], center=v['goal_center'])
    report.new_row()


    # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')



    def evaluate_performance(env):
        four_rooms = np.array([[-2, -2], [-13, -13]])
        if v['unif_starts']:
            mean_rewards, successes = [], []
            for pos in four_rooms:
                env.update_start_generator(FixedStateGenerator(np.array(pos)))
                mr, scs = test_and_plot_policy(policy, env, horizon=v['horizon'],  max_reward=v['max_reward'], sampling_res=sampling_res,
                                               n_traj=v['n_traj'],
                                               itr=outer_iter, report=report, limit=v['goal_range'],
                                               center=v['goal_center'], using_gym=True,
                                               noise=v['action_noise'], n_processes=8, log=False)
                mean_rewards.append(mr)
                successes.append(scs)
            with logger.tabular_prefix('Outer_'):
                logger.record_tabular('iter', outer_iter)
                logger.record_tabular('MeanRewards', np.mean(mean_rewards))
                logger.record_tabular('Success', np.mean(successes))
        else:
            env.update_start_generator(FixedStateGenerator(np.array([0, 0])))
            _, scs = test_and_plot_policy(policy, env, horizon=v['horizon'], max_reward=v['max_reward'], sampling_res=sampling_res,
                                          n_traj=v['n_traj'],
                                          itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                                          using_gym=True,
                                          noise=v['action_noise'], n_processes=8)

        report.new_row()

        env.update_start_generator(uniform_start_generator)

        return scs

    logger.dump_tabular(with_prefix=False)

    import cloudpickle
    max_success = 0.

    if not v['query_expert'] and v['num_demos'] > 0:
        if not v['relabel_expert']:
            goals = goals_filtered = expert_buffer.buffers['g'][:v['num_demos'], 0, :]
        else: # collect all states visited by the expert
            goals = None
            for i in range(v['num_demos']):
                terminate_index = np.argmax(expert_buffer.buffers['successes'][i])
                if np.logical_not(np.any(expert_buffer.buffers['successes'][i])):
                    terminate_index = v['horizon']
                cur_goals = expert_buffer.buffers['o'][i, :terminate_index, :2]
                if goals is None:
                    goals = cur_goals
                else:
                    goals = np.concatenate([goals, cur_goals])
            goal_state_collection = StateCollection(distance_threshold=v['coll_eps'])
            goal_state_collection.append(goals)
            goals_filtered = goal_state_collection.states
    else:
        goals_filtered = goals = np.random.permutation(np.array(feasible_states))[:300]
    if v['agent_state_as_goal']:
        goals = goals
    else:
        feasible_states = sample_unif_feas(inner_env, 10)
        goals = np.random.permutation(np.array(feasible_states))[:300]

    evaluate_performance(test_env)
    logger.dump_tabular(with_prefix=False)


    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            if v['unif_goals']:
                env.update_goal_generator(
                    UniformListStateGenerator(
                        goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                    )
                )
            else:
                env.update_goal_generator(FixedStateGenerator(v['final_goal']))

            logger.log("Training the algorithm")

            train(policy, discriminator, rollout_worker, v['inner_iters'], v['n_cycles'], v['n_batches'], v['n_batches_dis'], policy.buffer, expert_buffer,
                  empty_buffer=empty_buffer if v['on_policy_dis'] else None, num_rollouts=v['num_rollouts'], reset=not v['no_resets'],
                  feasible_states=feasible_states if v['query_expert'] else None, expert_policy=expert_policy if v['query_expert'] else None,
                  agent_policy=policy if v['query_agent'] else None, train_dis_per_rollout=v['train_dis_per_rollout'],
                  noise_expert=v['noise_dis_agent'], noise_agent=v['noise_dis_expert'], sample_transitions_relabel=sample_transitions_relabel if v['relabel_for_policy'] else None,
                  q_annealing=v['q_annealing'], outer_iter=outer_iter, annealing_coeff=v['annealing_coeff'])

        # logger.record_tabular('NonZeroRewProp', nonzeros)
        logger.log('Generating the Heatmap...')

        success = evaluate_performance(test_env)

        if success > max_success:
            print("%f > %f, saving policy to params_best" % (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
                max_success = success

        report.new_row()

        logger.dump_tabular(with_prefix=False)
Code example #7
def launch(env_name,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           temperature,
           prioritization,
           binding,
           logging,
           version,
           dump_buffer,
           n_cycles,
           rank_method,
           w_potential,
           w_linear,
           w_rotational,
           clip_energy,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging

    if logging:
        logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\
                 '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\
                 '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\
                 '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\
                 '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\
                 '-clip_energy'+str(clip_energy)+\
                 '-version'+str(version)
    else:
        logdir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['temperature'] = temperature
    params['prioritization'] = prioritization
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['dump_buffer'] = dump_buffer
    params['n_cycles'] = n_cycles
    params['rank_method'] = rank_method
    params['w_potential'] = w_potential
    params['w_linear'] = w_linear
    params['w_rotational'] = w_rotational
    params['clip_energy'] = clip_energy
    params['n_epochs'] = n_epochs
    params['num_cpu'] = num_cpu

    if params['dump_buffer']:
        params['alpha'] = 0

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          dump_buffer=dump_buffer,
          w_potential=params['w_potential'],
          w_linear=params['w_linear'],
          w_rotational=params['w_rotational'],
          rank_method=rank_method,
          clip_energy=clip_energy)
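
The long string concatenation that builds logdir above is easy to mistype; an equivalent formulation with a single format string, shown only as a readability sketch (it is a drop-in replacement inside launch() and produces the identical directory name):

        logdir = ('logs/{env}-temperature{t}-prioritization{p}-replay_strategy{rs}'
                  '-n_epochs{ne}-num_cpu{nc}-seed{s}-n_cycles{cyc}-rank_method{rm}'
                  '-w_potential{wp}-w_linear{wl}-w_rotational{wr}'
                  '-clip_energy{ce}-version{ver}').format(
                      env=env_name, t=temperature, p=prioritization, rs=replay_strategy,
                      ne=n_epochs, nc=num_cpu, s=seed, cyc=n_cycles, rm=rank_method,
                      wp=w_potential, wl=w_linear, wr=w_rotational,
                      ce=clip_energy, ver=version)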
Code example #8
def learn(*,
          network,
          env,
          total_timesteps,
          num_cpu,
          allow_run_as_root,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu',
                MPI.COMM_WORLD.Get_size())

    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu)
        else:
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)

        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    logger.info("actual total timesteps : {}".format(
        n_epochs * n_cycles * rollout_worker.T *
        rollout_worker.rollout_batch_size))

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 save_interval=save_interval,
                 demo_file=demo_file)
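
The epoch count in the example above comes from integer division, so the requested total_timesteps is rounded down to a whole number of epochs per worker. A small worked example with illustrative numbers (they are not taken from any project above):

total_timesteps = 1000000
n_cycles = 50            # cycles per epoch
T = 50                   # steps per rollout
rollout_batch_size = 2   # rollouts generated per cycle

n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
# 1000000 // 50 // 50 // 2 == 200 epochs
actual_total_timesteps = n_epochs * n_cycles * T * rollout_batch_size
# 200 * 50 * 50 * 2 == 1000000, so nothing is lost to rounding in this case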
Code example #9
File: train.py  Project: ZiyeHu/myCHER
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Code example #10
File: train.py  Project: xzblueofsky/baselines
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Code example #11
File: train.py  Project: wwxFromTju/curious
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb, save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu == 1:  # the warning below only applies to single-worker runs
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i) for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)


    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure' : structure,
        'task_selection': task_selection,
        'goal_selection' : goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params) for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies, structure=structure, task_selection=task_selection, params=params)
Code example #12
File: train.py  Project: marcelo-dalmeida/baselines
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Code example #13
File: train.py  Project: nohboogy/Changed
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           k,
           n,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)
    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          k=k,
          n=n,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          threshold=params['threshold'])
Code example #14
File: her.py  Project: RyanRizzo96/RL_baselines
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):

    # Check if there are any overriding params
    override_params = override_params or {}

    print(
        "---------------------------------------------------LEARN--------------------------------------------------"
    )

    # Check for MPI workers; fall back to a single process when MPI is unavailable
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        print("MPI RANK: ", rank)
        num_cpu = MPI.COMM_WORLD.Get_size()
        print("num_cpu RANK: ", num_cpu)
    else:
        rank, num_cpu = 0, 1

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS  # Uses default parameters set inside config.py
    env_name = env.spec.id  # Unwrap the environment and get the id, Eg: CartPole-v0
    params['env_name'] = env_name  # Save env_name as a param
    params['replay_strategy'] = replay_strategy  # Add replay_strategy as param

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter

    # Output params in params.json file in logger directory
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    # Creates gym env, checks MPI rank, monitor initialization
    params = config.prepare_params(params)
    # params['rollout_batch_size'] = env.num_envs  # Not sure about this. Does this override the value in config.py?
    params['rollout_batch_size'] = env.num_envs
    print("rr/ env.num_envs - ", env.num_envs)

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    # Print out params before training
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    # Before call to DDPG we configure the dimensions of the observation, action and goal
    dims = config.configure_dims(params)

    # Call to initialize DDPG and create actor critic network
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    # Check if we are loading previous trained policy
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit':
        False,  # do not act optimally according to the current policy without any exploration
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q':
        False,  # whether or not to compute the Q values alongside the actions
        'T': params['T'],
    }

    eval_params = {
        'exploit':
        True,  # act optimally according to the current policy without any exploration
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q':
        True,  # whether or not to compute the Q values alongside the actions
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    print("rr/ n_cycles = ", n_cycles)
    print("rr/ n_batches", params['n_batches'])
    print("rr/ n_test_rollouts", params['n_test_rollouts'])
    print("rr/ rollout time horizon = ", rollout_worker.T)
    print("rr/ rollout_batch_size = ", rollout_worker.rollout_batch_size)

    # // for division without remainder
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    print("rr/ n_epochs = ", n_epochs)

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file)
Code example #15
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    # if rank == 0:
    #     if logdir or logger.get_dir() is None:
    #         logger.configure(dir=logdir)
    # else:
    #     logger.configure()
    # logdir = logger.get_dir()
    # assert logdir is not None
    # os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    # with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
    #     json.dump(params, f)
    # {'Q_lr': 0.001, 'action_l2': 1.0, 'batch_size': 256, 'buffer_size': 1000000, 'clip_obs': 200.0,
    # 'env_name': u'GazeboSmartBotPinc...Kinect-v0', 'hidden': 256, 'layers': 3, 'max_u': 1.0, 'n_batches': 40,
    # 'n_cycles': 50, 'n_test_rollouts': 10, 'network_class': 'baselines.her.actor...torCritic', 'noise_eps': 0.2, ...}
    params = config.prepare_params(
        params
    )  # {'T': 200, '_Q_lr': 0.001, '_action_l2': 1.0, '_batch_size': 256, '_buffer_size': 1000000, '_clip_obs': 200.0, '_hidden': 256, '_layers': 3, '_max_u': 1.0, '_network_class': 'baselines.her.actor...torCritic', '_norm_clip': 5, '_norm_eps': 0.01, '_pi_lr': 0.001, '_polyak': 0.95, ...}
    config.log_params(params, logger=logger)
    # T: 200
    # _Q_lr: 0.001
    # _action_l2: 1.0
    # _batch_size: 256
    # _buffer_size: 1000000
    # _clip_obs: 200.0
    # _hidden: 256
    # _layers: 3
    # _max_u: 1.0
    # _network_class: baselines.her.actor_critic:ActorCritic
    # _norm_clip: 5
    # _norm_eps: 0.01
    # _pi_lr: 0.001
    # _polyak: 0.95
    # _relative_goals: False
    # _scope: ddpg
    # ddpg_params: {'layers': 3, 'norm_clip': 5, 'action_l2': 1.0, 'pi_lr': 0.001, 'norm_eps': 0.01, 'batch_size': 256, 'polyak': 0.95, 'clip_obs': 200.0, 'network_class': 'baselines.her.actor_critic:ActorCritic', 'max_u': 1.0, 'Q_lr': 0.001, 'scope': 'ddpg', 'buffer_size': 1000000, 'hidden': 256, 'relative_goals': False}
    # env_name: GazeboSmartBotPincherKinect-v0
    # gamma: 0.995
    # make_env: <function make_env at 0x7f7de3cb2050>
    # n_batches: 40
    # n_cycles: 50
    # n_test_rollouts: 10
    # noise_eps: 0.2
    # random_eps: 0.3
    # replay_k: 4
    # replay_strategy: future
    # rollout_batch_size: 2
    # test_with_polyak: False

    # if num_cpu == 1:
    #     logger.warn()
    #     logger.warn('*** Warning ***')
    #     logger.warn(
    #         'You are running HER with just a single MPI worker. This will work, but the ' +
    #         'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
    #         'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
    #         'are looking to reproduce those results, be aware of this. Please also refer to ' +
    #         'https://github.com/openai/baselines/issues/314 for further details.')
    #     logger.warn('****************')
    #     logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
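For reference, a launch() function with this signature is usually driven from a small command-line entry point. The call below is a minimal sketch, not taken from the project above; the environment ID, log directory, epoch count and overrides are illustrative assumptions.

# Minimal, illustrative driver for the launch() defined above.
# All argument values are assumptions; any key in config.DEFAULT_PARAMS
# can be overridden through override_params.
if __name__ == '__main__':
    launch(env='FetchReach-v1',           # goal-based Gym environment ID (assumed)
           logdir='/tmp/her_logs',        # where logs and checkpoints would go
           n_epochs=50,
           num_cpu=1,                     # >1 forks MPI workers via mpi_fork
           seed=0,
           replay_strategy='future',      # HER relabelling: 'future' or 'none'
           policy_save_interval=5,
           clip_return=True,
           override_params={'n_cycles': 10})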
Code example #16
File: train.py  Project: jfsantos/stable-baselines
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params=None,
           save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """

    if override_params is None:
        override_params = {}
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'),
              'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in [
            'time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps',
            'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
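Compared with the previous snippet, this version defaults override_params to None and builds the dict inside the function, which avoids Python's shared mutable default argument. A standalone illustration of that pitfall (not part of the snippet above):

# Standalone illustration (not from the snippet): a dict default argument is
# created once at definition time and shared across all calls.
def bad(params={}):
    params['touched'] = True
    return params

def good(params=None):
    params = {} if params is None else params
    params['touched'] = True
    return params

assert bad() is bad()        # both calls return the same shared dict
assert good() is not good()  # each call gets a fresh dict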
Code example #17
    def prepare_agent(_env,
                      eval_env,
                      active,
                      exploration='eps_greedy',
                      action_l2=None,
                      scope=None,
                      ss=False,
                      load_path=None):
        # Prepare params.
        _params = copy.deepcopy(config.DEFAULT_PARAMS)
        _kwargs = copy.deepcopy(kwargs)
        _override_params = copy.deepcopy(override_params)

        env_name = _env.spec.id
        _params['env_name'] = env_name
        _params['replay_strategy'] = replay_strategy
        _params['ss'] = ss
        if action_l2 is not None:
            _params['action_l2'] = action_l2
        if not active:
            _params["buffer_size"] = 1
        if env_name in config.DEFAULT_ENV_PARAMS:
            _params.update(config.DEFAULT_ENV_PARAMS[env_name]
                           )  # merge env-specific parameters in
        _params.update(
            **_override_params)  # makes it possible to override any parameter
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(_params, f)
        _params = config.prepare_params(_params)
        _params['rollout_batch_size'] = _env.num_envs

        if demo_file is not None:
            _params['bc_loss'] = 1
        _params.update(_kwargs)

        config.log_params(_params, logger=logger)

        if num_cpu == 1:
            logger.warn()
            logger.warn('*** Warning ***')
            logger.warn(
                'You are running HER with just a single MPI worker. This will work, but the '
                +
                'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
                +
                'were obtained with --num_cpu 19. This makes a significant difference and if you '
                +
                'are looking to reproduce those results, be aware of this. Please also refer to '
                +
                'https://github.com/openai/baselines/issues/314 for further details.'
            )
            logger.warn('****************')
            logger.warn()

        dims, coord_dict = config.configure_dims(_params)
        _params['ddpg_params']['scope'] = scope
        policy, reward_fun = config.configure_ddpg(dims=dims,
                                                   params=_params,
                                                   active=active,
                                                   clip_return=clip_return)
        if load_path is not None:
            tf_util.load_variables(load_path)
            print(f"Loaded model: {load_path}")

        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'exploration': exploration
        }

        eval_params = {
            'exploit': True,
            'use_target_net': _params['test_with_polyak'],
            'use_demo_states': False,
            'compute_Q': True,
        }

        for name in [
                'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
        ]:
            rollout_params[name] = _params[name]
            eval_params[name] = _params[name]

        eval_env = eval_env or _env

        rollout_worker = RolloutWorker(_env,
                                       policy,
                                       dims,
                                       logger,
                                       active,
                                       monitor=True,
                                       **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, active,
                                  **eval_params)

        return policy, rollout_worker, evaluator, _params, coord_dict, reward_fun
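prepare_agent is an inner helper that closes over variables of its enclosing function (kwargs, override_params, replay_strategy, demo_file, num_cpu, clip_return), so it cannot run on its own. A hedged sketch of how the enclosing code presumably calls it, with a hypothetical training environment:

# Hypothetical call site inside the enclosing function; train_env is assumed
# to be a (vectorized) gym environment created elsewhere in that scope.
policy, rollout_worker, evaluator, agent_params, coord_dict, reward_fun = \
    prepare_agent(train_env,
                  eval_env=None,           # falls back to train_env inside the helper
                  active=True,             # inactive agents get buffer_size = 1
                  exploration='eps_greedy',
                  scope='mca',             # becomes ddpg_params['scope']
                  load_path=None)          # set to a checkpoint path to restore weights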
Code example #18
File: her.py  Project: MrGoogol/baselines
def learn(*, network, env, total_timesteps,
    seed=None,
    eval_env=None,
    replay_strategy='future',
    policy_save_interval=5,
    clip_return=True,
    demo_file=None,
    override_params=None,
    load_path=None,
    save_path=None,
    **kwargs
):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
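This learn() derives the epoch count from the timestep budget as n_epochs = total_timesteps // n_cycles // T // rollout_batch_size. A minimal call sketch, assuming the vectorized wrapper exposes env.spec.id and env.num_envs; the environment, defaults and timestep budget are illustrative assumptions:

# Illustrative call only; the environment and timestep budget are assumptions.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('FetchPush-v1')])  # one goal-based env
policy = learn(network='mlp',
               env=venv,
               total_timesteps=125_000,   # assuming n_cycles=50, T=50, one env:
               seed=0,                    # 125000 // 50 // 50 // 1 = 50 epochs
               replay_strategy='future',
               clip_return=True)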
Code example #19
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    def make_env():
        return PnPEnv()

    env = make_env()
    test_env = make_env()
    env.reset()

    # for _ in range(1000):
    #     env.render()
    #     import pdb; pdb.set_trace()
    #     env.step(env.action_space.sample())

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v[
        'batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v[
        'n_test_rollouts']  # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']
    params['scope'] = 'ddpg3'

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']

    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']

    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)

    # prepare GAIL
    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] +
                             dims['g'] if not v['only_s'] else dims['o'] +
                             dims['g'],
                             dims['o'],
                             dims['o'],
                             dims['g'],
                             0.,
                             gail_loss=v['gail_reward'],
                             use_s_p=True,
                             only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] +
                             dims['g'] if not v['only_s'] else dims['o'] +
                             dims['g'],
                             dims['o'],
                             dims['u'],
                             dims['g'],
                             0.,
                             gail_loss=v['gail_reward'],
                             only_s=v['only_s'])
    params['discriminator'] = discriminator

    # configure replay buffer for expert buffer
    params_expert = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs',
            'with_termination'
        ]
    }
    params_expert[
        'replay_strategy'] = 'future' if v['relabel_expert'] else 'none'

    params_policy_buffer = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs',
            'with_termination'
        ]
    }
    params_policy_buffer['replay_strategy'] = 'future'

    params_empty = {
        k: params[k]
        for k in [
            'make_env', 'replay_k', 'discriminator', 'gail_weight',
            'replay_strategy'
        ]
    }

    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=v['clip_return'],
                                   reuse=tf.AUTO_REUSE,
                                   env=env,
                                   to_goal=v['to_goal'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker([env], policy, dims, logger_b,
                                   **rollout_params)
    # rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker([env], policy, dims, logger_b, **eval_params)
    # evaluator.seed(rank_seed)

    n_traj = v['n_evaluation_traj']

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    inner_log_dir = osp.join(log_dir, 'inner_iters')
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    logger.log("Starting the outer iterations")

    logger.log("Generating heat map")

    def evaluate_pnp(env, policy, n_rollouts=100):
        goal_reached = []
        distance_to_goal = []
        for i in range(n_rollouts):
            traj = rollout(env,
                           policy,
                           max_path_length=v['horizon'],
                           using_gym=True)
            goal_reached.append(np.max(traj['env_infos']['goal_reached']))
            distance_to_goal.append(np.min(traj['env_infos']['distance']))

        return np.mean(goal_reached), np.mean(distance_to_goal)

    from sandbox.experiments.goals.pick_n_place.pnp_expert import PnPExpert

    expert_policy = PnPExpert(env)

    expert_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': False,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        expert_params[name] = params[name]

    expert_params['noise_eps'] = v['expert_noise']
    expert_params['random_eps'] = v['expert_eps']

    expert_worker = RolloutWorker([env], expert_policy, dims, logger_b,
                                  **expert_params)

    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {
        key:
        (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0],
                          3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['ag'] = (v['horizon'] + 1,
                           3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['successes'] = (v['horizon'], )
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'],
                                 expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)

    for _ in range(v['num_demos']):
        # rollout is generated by expert policy
        episode = expert_worker.generate_rollouts(
            slice_goal=(3, 6) if v['full_space_as_goal'] else None)
        # and is stored into the current expert buffer
        expert_buffer.store_episode(episode)

        # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    max_success, min_distance = evaluate_pnp(env, policy)
    outer_iter = 0
    logger.record_tabular("Outer_iter", outer_iter)
    logger.record_tabular("Outer_Success", max_success)
    logger.record_tabular("MinDisToGoal", min_distance)
    logger.dump_tabular()

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(inner_log_dir,
                              outer_iter,
                              snapshot_mode='last',
                              hold_outter_log=True):
            train(
                policy,
                discriminator,
                rollout_worker,
                v['inner_iters'],
                v['n_cycles'],
                v['n_batches'],
                v['n_batches_dis'],
                policy.buffer,
                expert_buffer,
                empty_buffer=empty_buffer if v['on_policy_dis'] else None,
                num_rollouts=v['num_rollouts'],
                feasible_states=feasible_states if v['query_expert'] else None,
                expert_policy=expert_policy if v['query_expert'] else None,
                agent_policy=policy if v['query_agent'] else None,
                train_dis_per_rollout=v['train_dis_per_rollout'],
                noise_expert=v['noise_dis_agent'],
                noise_agent=v['noise_dis_expert'],
                sample_transitions_relabel=sample_transitions_relabel
                if v['relabel_for_policy'] else None,
                outer_iter=outer_iter,
                annealing_coeff=v['annealing_coeff'],
                q_annealing=v['q_annealing'])

        print("evaluating policy performance")

        logger.log("Generating heat map")

        success, min_distance = evaluate_pnp(env, policy)

        logger.record_tabular("Outer_iter", outer_iter)
        logger.record_tabular("Outer_Success", max_success)
        logger.record_tabular("MinDisToGoal", min_distance)
        logger.dump_tabular()

        if success > max_success:
            print("% f >= %f, saving policy to params_best" %
                  (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
            max_success = success

        report.save()
        report.new_row()
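run_task expects a single configuration dict v. The sketch below lists keys the function actually reads, with illustrative values; it is deliberately partial and the values are assumptions, not the original experiment settings.

# Illustrative (and deliberately partial) configuration dict for run_task();
# every key shown is read somewhere in the function above, but all values are
# assumptions.
v = {
    'seed': 0,
    'horizon': 50, 'discount': 0.98, 'clip_return': True,
    'action_l2': 1.0, 'max_u': 1.0, 'lr': 0.001, 'layers': 3, 'hidden': 256,
    'n_cycles': 50, 'n_batches': 40, 'batch_size': 256, 'n_test_rollouts': 3,
    'action_noise': 0.2, 'goal_weight': 1.0, 'replay_strategy': 'future',
    'outer_iters': 100, 'inner_iters': 5, 'num_demos': 20,
    'n_evaluation_traj': 10, 'num_rollouts': 2, 'n_batches_dis': 10,
    # GAIL / expert-related keys also read above:
    'gail_weight': 0.1, 'gail_reward': 'negative', 'use_s_p': False,
    'only_s': False, 'relabel_expert': True, 'relabel_for_policy': False,
    # ...the remaining v[...] keys referenced in run_task (bc_loss, two_qs,
    # rollout_terminate, expert noise levels, to_goal, etc.) must be added
    # before the function can actually run.
}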
Code example #20
File: train.py  Project: ruizhaogit/music
def launch(env_name,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           binding,
           logging,
           version,
           n_cycles,
           note,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging

    if logging:
        logdir = 'logs/' + str(env_name) + '-replay_strategy' + str(
            replay_strategy) + '-n_epochs' + str(n_epochs) + '-num_cpu' + str(
                num_cpu) + '-seed' + str(seed) + '-n_cycles' + str(
                    n_cycles) + '-version' + str(
                        version) + '-T-' + datetime.datetime.now().strftime(
                            "%Y-%m-%d-%H-%M-%S")
    else:
        logdir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()  # use temp folder for other rank
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['n_cycles'] = n_cycles
    params['num_cpu'] = num_cpu
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)

    if params['load_weight']:
        if type(params['load_weight']) is list:
            params['load_weight'] = params['load_weight'][seed]
        base = os.path.splitext(params['load_weight'])[0]
        policy_weight_file = open(base + '_weight.pkl', 'rb')
        pretrain_weights = pickle.load(policy_weight_file)
        policy_weight_file.close()
    else:
        pretrain_weights = None

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   pretrain_weights=pretrain_weights,
                                   clip_return=clip_return)

    render = False
    if params['collect_video']:
        render = 'rgb_array'

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'render': render,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          collect_data=params['collect_data'],
          collect_video=params['collect_video'],
          goal_generation=params['goal_generation'],
          num_skills=params['num_skills'],
          use_skill_n=params['use_skill_n'],
          batch_size=params['_batch_size'],
          mi_r_scale=params['mi_r_scale'],
          mi_end_epoch=params['mi_end_epoch'],
          sk_r_scale=params['sk_r_scale'],
          no_train_mi=params['no_train_mi'])
Code example #21
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          n_ker=0,
          before_GER_minibatch_size=None,
          n_GER=0,
          err_distance=0.05,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if before_GER_minibatch_size is not None and n_GER is not None:
        params['batch_size'] = before_GER_minibatch_size * (n_GER + 1)
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return,
                                   n_GER=n_GER,
                                   err_distance=err_distance,
                                   env_name=env_name)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env_name,
                                   env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   n_ker=n_ker,
                                   **rollout_params)
    evaluator = RolloutWorker(env_name, eval_env, policy, dims, logger,
                              **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file,
                 env_name=env_name,
                 n_ker=n_ker)
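The GER-specific arguments scale the DDPG minibatch before training (see the batch_size rule near the top of this learn()). A quick check with illustrative numbers:

# Quick check of the GER minibatch rule at the top of this learn();
# the two input values are assumptions.
before_GER_minibatch_size = 64   # transitions sampled per minibatch before augmentation
n_GER = 3                        # assumed: extra goal-relabelled copies per transition
batch_size = before_GER_minibatch_size * (n_GER + 1)
assert batch_size == 256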
Code example #22
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          params=None,
          **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = {
        # env
        'max_u': 1.,  # max absolute value of actions on different coordinates
        # ddpg
        'layers': 3,  # number of layers in the critic/actor networks
        'hidden': 256,  # number of neurons in each hidden layers
        'network_class': 'baselines.her.actor_critic:ActorCritic',
        'Q_lr': 0.001,  # critic learning rate
        'pi_lr': 0.001,  # actor learning rate
        'buffer_size': int(1E6),  # for experience replay
        'polyak': 0.95,  # polyak averaging coefficient
        'action_l2':
        1.0,  # quadratic penalty on actions (before rescaling by max_u)
        'clip_obs': 200.,
        'scope': 'ddpg',  # can be tweaked for testing
        'relative_goals': False,
        # training
        'n_cycles': 50,  # per epoch
        'rollout_batch_size': 2,  # per mpi thread
        'n_batches': 40,  # training batches per cycle
        'batch_size':
        256,  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
        'n_test_rollouts':
        10,  # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
        'test_with_polyak': False,  # run test episodes with the target network
        # exploration
        'random_eps': 0.2,  # percentage of time a random action is taken
        'noise_eps':
        0.3,  # std of gaussian noise added to not-completely-random actions as a percentage of max_u
        # HER
        'replay_strategy': 'future',  # supported modes: future, none
        'replay_k':
        4,  # number of additional goals used for replay, only used if off_policy_data=future
        # normalization
        'norm_eps': 0.01,  # epsilon used for observation normalization
        'norm_clip': 5,  # normalized observations are cropped to this values
        'bc_loss':
        0,  # whether or not to use the behavior cloning loss as an auxiliary loss
        'q_filter':
        0,  # whether or not a Q value filter should be used on the Actor outputs
        'num_demo': 25,  # number of expert demo episodes
        'demo_batch_size':
        128,  # number of samples to be used from the demonstrations buffer, per mpi thread 128/1024 or 32/256
        'prm_loss_weight': 0.001,  # weight corresponding to the primary loss
        'aux_loss_weight':
        0.0078,  # weight corresponding to the auxiliary loss, also called the cloning loss
        'perturb': kwargs['pert_type'],
        'n_actions': kwargs['n_actions'],
    }
    params['replay_strategy'] = replay_strategy
    if env is not None:
        env_name = env.spec.id
        params['env_name'] = env_name
        if env_name in config.DEFAULT_ENV_PARAMS:
            params.update(config.DEFAULT_ENV_PARAMS[env_name]
                          )  # merge env-specific parameters in
    else:
        params['env_name'] = 'NuFingers_Experiment'
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    if demo_file is not None:
        params['bc_loss'] = 1
        params['q_filter'] = 1
        params['n_cycles'] = 20
        params['random_eps'] = 0.1  # chip: ON
        params['noise_eps'] = 0.1  # chip: ON
        # params['batch_size']: 1024
    params = config.prepare_params(params)
    params['rollout_batch_size'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    if env is not None:
        dims = config.configure_dims(params)
    else:
        dims = dict(o=15, u=4, g=7, info_is_success=1)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    print("NAME={}".format(params['env_name']))

    print(rollout_params)

    if params['env_name'].find('NuFingers_Experiment') == -1:
        rollout_worker = RolloutWorker(env,
                                       policy,
                                       dims,
                                       logger,
                                       monitor=True,
                                       **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger,
                                  **eval_params)
    else:
        rollout_worker = RolloutNuFingers(policy,
                                          dims,
                                          logger,
                                          monitor=True,
                                          **rollout_params)
        evaluator = RolloutNuFingers(policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file)
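Unlike the other variants, this learn() also works without a Gym environment: when env is None it falls back to hard-coded dimensions and the RolloutNuFingers workers. A hedged sketch of such a call; the perturbation type, action count and timestep budget are assumptions matching the kwargs this snippet reads.

# Illustrative call for the env=None (NuFingers hardware) path; values are assumptions.
policy = learn(network='mlp',
               env=None,                # -> env_name 'NuFingers_Experiment', dims o=15, u=4, g=7
               total_timesteps=100_000,
               seed=0,
               pert_type='none',        # read from kwargs as params['perturb']
               n_actions=4)             # read from kwargs as params['n_actions']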