def task_run_ss_ddpg_baselines_mc(params):
    """Run one Smart-Start + DDPG-baselines training job on MountainCarContinuous-v0.

    All hyperparameters are read from the ``params`` dict; the trained run's
    ``Summary`` is saved to the default data directory under ``params['dir_name']``.

    NOTE(review): this function relies on module-level names not visible in this
    chunk (``random``, ``gym``, ``set_global_seeds``, ``DDPG_Baselines_agent``,
    ``SmartStartContinuous``, ``rlTrain``, ``get_default_data_directory``) —
    presumably imported at the top of the file; verify before moving this code.
    """
    import tensorflow as tf
    print("\n\nprocess " + str(params['id']) + " has started" + "-" * 200 + "\n")
    noGpu = params['noGpu']
    render = False
    replay_buffer = None

    # random seed each time — intentionally non-deterministic per worker process
    random.seed()
    RANDOM_SEED = random.randint(0, 2**32 - 1)

    # Overall Options
    episodes = params['episodes']
    dir_name = params['dir_name']
    # naming function used to build the saved summary's file name
    get_extra_name = params['get_extra_name']

    # configuring environment
    ENV_NAME = 'MountainCarContinuous-v0'
    env = gym.make(ENV_NAME)

    # Force CPU-only execution when requested (TF1: hide all GPUs from the session)
    if noGpu:
        tfConfig = tf.ConfigProto(device_count={'GPU': 0})
    else:
        tfConfig = None

    # Fresh graph per process so parallel workers do not share TF state
    with tf.Graph().as_default() as graph:
        with tf.Session(config=tfConfig, graph=graph) as sess:
            # with tf.Session() as sess:
            # Reset the seed for random number generation
            set_global_seeds(RANDOM_SEED)
            env.seed(RANDOM_SEED)

            # Initialize agent, see class for available parameters.
            # finalizeGraph=False: the graph is finalized below, after the
            # smart-start wrapper has added its own ops.
            base_agent = DDPG_Baselines_agent(env, sess,
                                              replay_buffer=replay_buffer,
                                              buffer_size=params['buffer_size'],
                                              batch_size=params['batch_size'],
                                              num_train_iterations=params['num_train_iterations'],
                                              num_steps_before_train=params['num_steps_before_train'],
                                              ou_epsilon=params['ou_epsilon'],
                                              ou_min_epsilon=params['ou_min_epsilon'],
                                              ou_epsilon_decay_factor=params['ou_epsilon_decay_factor'],
                                              ou_mu=params['ou_mu'],
                                              ou_sigma=params['ou_sigma'],
                                              ou_theta=params['ou_theta'],
                                              # actor_lr = params['actor_lr'],
                                              # NOTE(review): actor and critic share the single
                                              # learning rate params['lr'] here (the per-network
                                              # keys are deliberately commented out)
                                              actor_lr=params['lr'],
                                              actor_h1=params['actor_h1'],
                                              actor_h2=params['actor_h2'],
                                              # critic_lr = params['critic_lr'],
                                              critic_lr=params['lr'],
                                              critic_h1=params['critic_h1'],
                                              critic_h2=params['critic_h2'],
                                              gamma=params['gamma'],
                                              tau=params['tau'],
                                              layer_norm=params['layer_norm'],
                                              normalize_observations=params['normalize_observations'],
                                              normalize_returns=params['normalize_returns'],
                                              critic_l2_reg=params['critic_l2_reg'],
                                              enable_popart=params['enable_popart'],
                                              clip_norm=params['clip_norm'],
                                              reward_scale=params['reward_scale'],
                                              lastLayerTanh=params['lastLayerTanh'],
                                              finalizeGraph=False)

            # Smart-start wrapper around the base agent; "nnd_mb_*" keys configure
            # its neural-network-dynamics model-based controller.
            smart_start_agent = SmartStartContinuous(base_agent, env, sess,
                                                     buffer_size=params['buffer_size'],
                                                     exploitation_param=params['exploitation_param'],
                                                     exploration_param=params['exploration_param'],
                                                     eta=params['eta'],
                                                     eta_decay_factor=params['eta_decay_factor'],
                                                     n_ss=params['n_ss'],
                                                     print_ss_stuff=True,
                                                     # sigma=params['sigma'],
                                                     # smart_start_selection_modified_distance_function=params['smart_start_selection_modified_distance_function'],
                                                     nnd_mb_final_steps=params['nnd_mb_final_steps'],
                                                     nnd_mb_steps_per_waypoint=params['nnd_mb_steps_per_waypoint'],
                                                     nnd_mb_mean_per_stepsize=params['nnd_mb_mean_per_stepsize'],
                                                     nnd_mb_std_per_stepsize=params['nnd_mb_std_per_stepsize'],
                                                     nnd_mb_stepsizes_in_waypoint_radii=params['nnd_mb_stepsizes_in_waypoint_radii'],
                                                     nnd_mb_gamma=params['nnd_mb_gamma'],
                                                     nnd_mb_horizontal_penalty_factor=params['nnd_mb_horizontal_penalty_factor'],
                                                     nnd_mb_horizon=params['nnd_mb_horizon'],
                                                     nnd_mb_num_control_samples=params['nnd_mb_num_control_samples'],
                                                     nnd_mb_path_shortcutting=params['nnd_mb_path_shortcutting'],
                                                     nnd_mb_steps_before_giving_up_on_waypoint=params['nnd_mb_steps_before_giving_up_on_waypoint'],
                                                     nnd_mb_load_dir_name=params['nnd_mb_load_dir_name'],
                                                     nnd_mb_load_existing_training_data=params['nnd_mb_load_existing_training_data'],
                                                     nnd_mb_num_fc_layers=params['nnd_mb_num_fc_layers'],
                                                     nnd_mb_depth_fc_layers=params['nnd_mb_depth_fc_layers'],
                                                     nnd_mb_batchsize=params['nnd_mb_batchsize'],
                                                     nnd_mb_lr=params['nnd_mb_lr'],
                                                     nnd_mb_nEpoch=params['nnd_mb_nEpoch'],
                                                     nnd_mb_fraction_use_new=params['nnd_mb_fraction_use_new'],
                                                     nnd_mb_num_episodes_for_aggregation=params['nnd_mb_num_episodes_for_aggregation'],
                                                     nnd_mb_make_aggregated_dataset_noisy=params['nnd_mb_make_aggregated_dataset_noisy'],
                                                     nnd_mb_make_training_dataset_noisy=params['nnd_mb_make_training_dataset_noisy'],
                                                     nnd_mb_noise_actions_during_MPC_rollouts=params['nnd_mb_noise_actions_during_MPC_rollouts'],
                                                     nnd_mb_verbose=params['nnd_mb_verbose'])

            # Freeze the graph: any op accidentally added during training now raises
            sess.graph.finalize()

            # Train the agent, summary contains training data
            summary = rlTrain(smart_start_agent, env,
                              render=render,
                              render_episode=False,
                              print_steps=False,
                              print_results=False,
                              num_episodes=episodes,
                              print_time=False,
                              progress_bar=True,
                              id=params['id'],
                              num_ticks=params['num_ticks'])  # type: Summary

            # Record run metadata so the saved summary is reproducible
            summary.add_params_to_param_dict(zz_RANDOM_SEED=RANDOM_SEED,
                                             zz_episodes=episodes,
                                             noGpu=noGpu)
            fp = summary.save(get_default_data_directory(dir_name),
                              last_name_section=True,
                              extra_name_append=get_extra_name(params))

    print("\n\nprocess " + str(params['id']) + " has finished" + "!" * 200 + "\n")
# NOTE(review): this __main__ block appears TRUNCATED in this chunk — the
# DDPG_Baselines_agent(...) call below ends mid-argument-list. It also uses
# names never defined in this scope (``noGpu``, ``tf``, ``RANDOM_SEED``,
# ``set_global_seeds``, ``DDPG_Baselines_agent``); unless they exist at module
# level outside this view, running the script raises NameError — TODO confirm.
if __name__ == "__main__":
    import gym
    from smartstart.reinforcementLearningCore.rlTrain import rlTrain

    # configuring environment
    ENV_NAME = 'MountainCarContinuous-v0'
    env = gym.make(ENV_NAME)

    # presumably ``noGpu`` / ``tf`` come from module scope — verify
    if noGpu:
        tfConfig = tf.ConfigProto(device_count={'GPU': 0})
    else:
        tfConfig = None

    with tf.Session(config=tfConfig) as sess:
        # Reset the seed for random number generation
        set_global_seeds(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # run parameters
        episodes = 1000
        lastLayerTanh = True

        # Initialize agent, see class for available parameters
        # (call is cut off here in this chunk — remainder not visible)
        agent = DDPG_Baselines_agent(env, sess,
                                     replay_buffer=None,
                                     buffer_size=100000,
                                     batch_size=64,
                                     num_train_iterations=1,
                                     num_steps_before_train=1,
                                     ou_epsilon=1.0,
def task_run_ddpg_baselines_mc(params):
    """Run one plain DDPG-baselines training job on the edited, timed
    continuous MountainCar environment and save the resulting ``Summary``.

    Every hyperparameter is looked up in the ``params`` dict; the summary is
    written to the default data directory under ``params['dir_name']`` with a
    name suffix produced by ``params['get_extra_name'](params)``.
    """
    import tensorflow as tf

    print(f"\n\nprocess {params['id']} has started" + "-" * 200 + "\n")

    no_gpu = params['noGpu']
    episodes = params['episodes']

    # Deliberately non-deterministic: each worker process draws its own seed.
    random.seed()
    seed = random.randint(0, 2**32 - 1)

    # Timed MountainCar variant with scaled motor power and episode limits.
    env = Continuous_MountainCarEnv_Editted.make_timed_env(
        params['power_scalar'],
        max_episode_steps=params['max_episode_steps'],
        max_episode_seconds=params['max_episode_seconds'])

    # Hide all GPUs from TensorFlow when CPU-only execution is requested.
    tf_config = tf.ConfigProto(device_count={'GPU': 0}) if no_gpu else None

    # Private graph + session per process so parallel workers stay isolated.
    with tf.Graph().as_default() as graph:
        with tf.Session(config=tf_config, graph=graph) as sess:
            set_global_seeds(seed)
            env.seed(seed)

            # Initialize agent, see class for available parameters.
            # The second hidden layer of each network is half the first
            # (assumes that sizing is intentional — TODO confirm, since the
            # smart-start variant reads 'actor_h2'/'critic_h2' directly).
            agent = DDPG_Baselines_agent(
                env, sess,
                replay_buffer=None,
                buffer_size=params['buffer_size'],
                batch_size=params['batch_size'],
                num_train_iterations=params['num_train_iterations'],
                num_steps_before_train=params['num_steps_before_train'],
                ou_epsilon=params['ou_epsilon'],
                ou_min_epsilon=params['ou_min_epsilon'],
                ou_epsilon_decay_factor=params['ou_epsilon_decay_factor'],
                ou_mu=params['ou_mu'],
                ou_sigma=params['ou_sigma'],
                ou_theta=params['ou_theta'],
                actor_lr=params['actor_lr'],
                actor_h1=params['actor_h1'],
                actor_h2=params['actor_h1'] // 2,
                critic_lr=params['critic_lr'],
                critic_h1=params['critic_h1'],
                critic_h2=params['critic_h1'] // 2,
                gamma=params['gamma'],
                tau=params['tau'],
                layer_norm=params['layer_norm'],
                normalize_observations=params['normalize_observations'],
                normalize_returns=params['normalize_returns'],
                critic_l2_reg=params['critic_l2_reg'],
                enable_popart=params['enable_popart'],
                clip_norm=params['clip_norm'],
                reward_scale=params['reward_scale'],
                lastLayerTanh=params['lastLayerTanh'])

            # Train the agent; the summary accumulates the training data.
            summary = rlTrain(agent, env,
                              render=False,
                              render_episode=False,
                              print_steps=False,
                              print_results=False,
                              num_episodes=episodes,
                              progress_bar=True,
                              id=params['id'],
                              num_ticks=params['num_ticks'])  # type: Summary

            # Stamp run metadata onto the summary before persisting it.
            summary.add_params_to_param_dict(zz_RANDOM_SEED=seed,
                                             zz_episodes=episodes,
                                             noGpu=no_gpu)
            summary.save(get_default_data_directory(params['dir_name']),
                         last_name_section=True,
                         extra_name_append=params['get_extra_name'](params))

    print(f"\n\nprocess {params['id']} has finished" + "!" * 200 + "\n")