Code example #1
def task_run_ss_ddpg_baselines_mc(params):
    import tensorflow as tf

    print("\n\nprocess " + str(params['id']) + " has started" + "-" * 200 +
          "\n")

    noGpu = params['noGpu']
    render = False
    replay_buffer = None

    # random seed each time
    random.seed()
    RANDOM_SEED = random.randint(0, 2**32 - 1)

    # Overall Options
    episodes = params['episodes']
    dir_name = params['dir_name']

    # naming function
    get_extra_name = params['get_extra_name']

    # configuring environment
    ENV_NAME = 'MountainCarContinuous-v0'
    env = gym.make(ENV_NAME)

    if noGpu:
        tfConfig = tf.ConfigProto(device_count={'GPU': 0})
    else:
        tfConfig = None

    with tf.Graph().as_default() as graph:
        with tf.Session(config=tfConfig, graph=graph) as sess:
            # with tf.Session() as sess:
            # Reset the seed for random number generation
            set_global_seeds(RANDOM_SEED)
            env.seed(RANDOM_SEED)

            # Initialize agent, see class for available parameters
            base_agent = DDPG_Baselines_agent(
                env,
                sess,
                replay_buffer=replay_buffer,
                buffer_size=params['buffer_size'],
                batch_size=params['batch_size'],
                num_train_iterations=params['num_train_iterations'],
                num_steps_before_train=params['num_steps_before_train'],
                ou_epsilon=params['ou_epsilon'],
                ou_min_epsilon=params['ou_min_epsilon'],
                ou_epsilon_decay_factor=params['ou_epsilon_decay_factor'],
                ou_mu=params['ou_mu'],
                ou_sigma=params['ou_sigma'],
                ou_theta=params['ou_theta'],
                # actor_lr = params['actor_lr'],
                actor_lr=params['lr'],
                actor_h1=params['actor_h1'],
                actor_h2=params['actor_h2'],
                # critic_lr = params['critic_lr'],
                critic_lr=params['lr'],
                critic_h1=params['critic_h1'],
                critic_h2=params['critic_h2'],
                gamma=params['gamma'],
                tau=params['tau'],
                layer_norm=params['layer_norm'],
                normalize_observations=params['normalize_observations'],
                normalize_returns=params['normalize_returns'],
                critic_l2_reg=params['critic_l2_reg'],
                enable_popart=params['enable_popart'],
                clip_norm=params['clip_norm'],
                reward_scale=params['reward_scale'],
                lastLayerTanh=params['lastLayerTanh'],
                finalizeGraph=False)

            smart_start_agent = SmartStartContinuous(
                base_agent,
                env,
                sess,
                buffer_size=params['buffer_size'],
                exploitation_param=params['exploitation_param'],
                exploration_param=params['exploration_param'],
                eta=params['eta'],
                eta_decay_factor=params['eta_decay_factor'],
                n_ss=params['n_ss'],
                print_ss_stuff=True,
                # sigma=params['sigma'],
                # smart_start_selection_modified_distance_function=params['smart_start_selection_modified_distance_function'],
                nnd_mb_final_steps=params['nnd_mb_final_steps'],
                nnd_mb_steps_per_waypoint=params['nnd_mb_steps_per_waypoint'],
                nnd_mb_mean_per_stepsize=params['nnd_mb_mean_per_stepsize'],
                nnd_mb_std_per_stepsize=params['nnd_mb_std_per_stepsize'],
                nnd_mb_stepsizes_in_waypoint_radii=params[
                    'nnd_mb_stepsizes_in_waypoint_radii'],
                nnd_mb_gamma=params['nnd_mb_gamma'],
                nnd_mb_horizontal_penalty_factor=params[
                    'nnd_mb_horizontal_penalty_factor'],
                nnd_mb_horizon=params['nnd_mb_horizon'],
                nnd_mb_num_control_samples=params[
                    'nnd_mb_num_control_samples'],
                nnd_mb_path_shortcutting=params['nnd_mb_path_shortcutting'],
                nnd_mb_steps_before_giving_up_on_waypoint=params[
                    'nnd_mb_steps_before_giving_up_on_waypoint'],
                nnd_mb_load_dir_name=params['nnd_mb_load_dir_name'],
                nnd_mb_load_existing_training_data=params[
                    'nnd_mb_load_existing_training_data'],
                nnd_mb_num_fc_layers=params['nnd_mb_num_fc_layers'],
                nnd_mb_depth_fc_layers=params['nnd_mb_depth_fc_layers'],
                nnd_mb_batchsize=params['nnd_mb_batchsize'],
                nnd_mb_lr=params['nnd_mb_lr'],
                nnd_mb_nEpoch=params['nnd_mb_nEpoch'],
                nnd_mb_fraction_use_new=params['nnd_mb_fraction_use_new'],
                nnd_mb_num_episodes_for_aggregation=params[
                    'nnd_mb_num_episodes_for_aggregation'],
                nnd_mb_make_aggregated_dataset_noisy=params[
                    'nnd_mb_make_aggregated_dataset_noisy'],
                nnd_mb_make_training_dataset_noisy=params[
                    'nnd_mb_make_training_dataset_noisy'],
                nnd_mb_noise_actions_during_MPC_rollouts=params[
                    'nnd_mb_noise_actions_during_MPC_rollouts'],
                nnd_mb_verbose=params['nnd_mb_verbose'])

            sess.graph.finalize()

            # Train the agent, summary contains training data
            summary = rlTrain(smart_start_agent,
                              env,
                              render=render,
                              render_episode=False,
                              print_steps=False,
                              print_results=False,
                              num_episodes=episodes,
                              print_time=False,
                              progress_bar=True,
                              id=params['id'],
                              num_ticks=params['num_ticks'])  # type: Summary

            summary.add_params_to_param_dict(zz_RANDOM_SEED=RANDOM_SEED,
                                             zz_episodes=episodes,
                                             noGpu=noGpu)
            fp = summary.save(get_default_data_directory(dir_name),
                              last_name_section=True,
                              extra_name_append=get_extra_name(params))

            print("\n\nprocess " + str(params['id']) + " has finished" +
                  "!" * 200 + "\n")
Code example #2
if __name__ == "__main__":
    import gym
    from smartstart.reinforcementLearningCore.rlTrain import rlTrain

    # configuring environment
    ENV_NAME = 'MountainCarContinuous-v0'
    env = gym.make(ENV_NAME)

    if noGpu:
        tfConfig = tf.ConfigProto(device_count={'GPU': 0})
    else:
        tfConfig = None

    with tf.Session(config=tfConfig) as sess:
        # Reset the seed for random number generation
        set_global_seeds(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # run parameters
        episodes = 1000
        lastLayerTanh = True

        # Initialize agent, see class for available parameters
        agent = DDPG_Baselines_agent(env,
                                     sess,
                                     replay_buffer=None,
                                     buffer_size=100000,
                                     batch_size=64,
                                     num_train_iterations=1,
                                     num_steps_before_train=1,
                                 ou_epsilon=1.0,
                                 # remaining arguments and the rest of this script are truncated in the source
                                 )
Code example #3
def task_run_ddpg_baselines_mc(params):
    import tensorflow as tf

    print("\n\nprocess " + str(params['id']) + " has started" + "-" * 200 +
          "\n")

    noGpu = params['noGpu']
    render = False
    replay_buffer = None

    # random seed each time
    random.seed()
    RANDOM_SEED = random.randint(0, 2**32 - 1)

    # Overall Options
    episodes = params['episodes']
    dir_name = params['dir_name']

    # naming function
    get_extra_name = params['get_extra_name']

    # configuring environment
    env = Continuous_MountainCarEnv_Editted.make_timed_env(
        params['power_scalar'],
        max_episode_steps=params['max_episode_steps'],
        max_episode_seconds=params['max_episode_seconds'])

    buffer_size = params['buffer_size']
    batch_size = params['batch_size']
    num_train_iterations = params['num_train_iterations']
    num_steps_before_train = params['num_steps_before_train']
    ou_epsilon = params['ou_epsilon']
    ou_min_epsilon = params['ou_min_epsilon']
    ou_epsilon_decay_factor = params['ou_epsilon_decay_factor']
    ou_mu = params['ou_mu']
    ou_sigma = params['ou_sigma']
    ou_theta = params['ou_theta']
    actor_lr = params['actor_lr']
    actor_h1 = params['actor_h1']
    actor_h2 = params['actor_h1'] // 2
    critic_lr = params['critic_lr']
    critic_h1 = params['critic_h1']
    critic_h2 = params['critic_h1'] // 2
    gamma = params['gamma']
    tau = params['tau']
    layer_norm = params['layer_norm']
    normalize_observations = params['normalize_observations']
    normalize_returns = params['normalize_returns']
    critic_l2_reg = params['critic_l2_reg']
    enable_popart = params['enable_popart']
    clip_norm = params['clip_norm']
    reward_scale = params['reward_scale']
    lastLayerTanh = params['lastLayerTanh']

    if noGpu:
        tfConfig = tf.ConfigProto(device_count={'GPU': 0})
    else:
        tfConfig = None

    with tf.Graph().as_default() as graph:
        with tf.Session(config=tfConfig, graph=graph) as sess:
            # with tf.Session() as sess:
            # Reset the seed for random number generation
            set_global_seeds(RANDOM_SEED)
            env.seed(RANDOM_SEED)

            # Initialize agent, see class for available parameters
            agent = DDPG_Baselines_agent(
                env,
                sess,
                replay_buffer=replay_buffer,
                buffer_size=buffer_size,
                batch_size=batch_size,
                num_train_iterations=num_train_iterations,
                num_steps_before_train=num_steps_before_train,
                ou_epsilon=ou_epsilon,
                ou_min_epsilon=ou_min_epsilon,
                ou_epsilon_decay_factor=ou_epsilon_decay_factor,
                ou_mu=ou_mu,
                ou_sigma=ou_sigma,
                ou_theta=ou_theta,
                actor_lr=actor_lr,
                actor_h1=actor_h1,
                actor_h2=actor_h2,
                critic_lr=critic_lr,
                critic_h1=critic_h1,
                critic_h2=critic_h2,
                gamma=gamma,
                tau=tau,
                layer_norm=layer_norm,
                normalize_observations=normalize_observations,
                normalize_returns=normalize_returns,
                critic_l2_reg=critic_l2_reg,
                enable_popart=enable_popart,
                clip_norm=clip_norm,
                reward_scale=reward_scale,
                lastLayerTanh=lastLayerTanh)

            # Train the agent, summary contains training data
            summary = rlTrain(agent,
                              env,
                              render=render,
                              render_episode=False,
                              print_steps=False,
                              print_results=False,
                              num_episodes=episodes,
                              progress_bar=True,
                              id=params['id'],
                              num_ticks=params['num_ticks'])  # type: Summary

            summary.add_params_to_param_dict(zz_RANDOM_SEED=RANDOM_SEED,
                                             zz_episodes=episodes,
                                             noGpu=noGpu)
            fp = summary.save(get_default_data_directory(dir_name),
                              last_name_section=True,
                              extra_name_append=get_extra_name(params))

            print("\n\nprocess " + str(params['id']) + " has finished" +
                  "!" * 200 + "\n")