def setup_param_noise(self):
    assert self.param_noise is not None

    # Configure perturbed actor.
    self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape,
                                 name='param_noise_actor', network=self.actor.network,
                                 **self.actor.network_kwargs)

    # Configure separate copy for stddev adaptation.
    self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape,
                                          name='adaptive_param_noise_actor',
                                          network=self.actor.network,
                                          **self.actor.network_kwargs)
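Both perturbed copies start out as plain clones and only become useful once their weights are refreshed from the base actor with noise added. The snippets below rely on a helper named `update_perturbed_actor` for this; none of them show it, so here is a minimal sketch under the assumption (as in baselines) that `perturbable_vars` excludes normalization parameters:

import tensorflow as tf

def update_perturbed_actor(actor, perturbed_actor, param_noise_stddev):
    # Copy every weight; add Gaussian noise only to the perturbable ones
    # (layer-norm scales and offsets are copied verbatim).
    for var, perturbed_var in zip(actor.variables, perturbed_actor.variables):
        if var in actor.perturbable_vars:
            perturbed_var.assign(var + tf.random.normal(shape=tf.shape(var),
                                                        mean=0., stddev=param_noise_stddev))
        else:
            perturbed_var.assign(var)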
def train(self, env, nb_steps):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                         desired_action_stddev=float(0.2))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=True)
    actor = Actor(nb_actions, layer_norm=True)

    # Seed everything to make things reproducible.
    seed = self.seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    # load_state("D:\project\osim-rl-helper\ddpg.pkl")
    training.train(env=env, param_noise=param_noise, restore=True,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, nb_epochs=1, nb_epoch_cycles=1,
                   render_eval=False, reward_scale=1.0, render=False,
                   normalize_returns=False, normalize_observations=True,
                   critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
                   popart=False, gamma=0.99, clip_norm=None,
                   nb_train_steps=nb_steps, nb_rollout_steps=5,
                   nb_eval_steps=5, batch_size=64)
    # save_state("D:\project\osim-rl-helper\ddpg.pkl")

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    # TODO: Change back to 1e6
    memory = Memory(limit=int(1e2), state_shape=env.state_space.shape,
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    kwargs.pop('state_shape')
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
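Most snippets here share the same noise_type convention: a comma-separated list where each entry is a noise kind and a stddev joined by '_' (e.g. 'ou_0.2', 'adaptive-param_0.1'), so action noise and parameter noise can be requested together. As a concrete illustration of what the parser produces (the combined string below is just an example):

import numpy as np
from baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise

nb_actions = 4  # illustrative
# 'ou_0.2,adaptive-param_0.1' yields both kinds of exploration noise:
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)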
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    logger.configure(dir='/home/vaisakhs_shaj/Desktop/DeepReinforcementLearning/5_Deep_Deterministic_Policy_Gradients/LOGS/OSIM')

    # Create envs.
    env = ProstheticsEnv(visualize=True)
    env.change_model(model='2D', difficulty=0, prosthetic=True, seed=seed)
    # env.seed(seed)
    # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(2e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 2000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def setup(self, obs_shape, nb_actions, action_spec, noise_type, gamma=1., tau=0.01, layer_norm=True):
    super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec, noise_type,
                                 gamma, tau, layer_norm)
    self.action_spec_internal = action_spec
    self.obs_dim = obs_shape

    action_noise = None
    param_noise = None

    # Parse noise_type
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    self.memory = Memory(limit=int(500), action_shape=(nb_actions,),
                         observation_shape=obs_shape)
    self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
    self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

    tf.reset_default_graph()

    # max_action = env.action_space.high
    self.ddpg = DDPG(actor=self.actor, critic=self.critic, memory=self.memory,
                     observation_shape=obs_shape, action_shape=(nb_actions,),
                     gamma=gamma, tau=tau, action_noise=action_noise,
                     param_noise=param_noise)
def train_ddpg(env, N_episodes):
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    stddev = 0.2       # the original snippet used `stddev` without defining it; 0.2 is an assumed default
    layer_norm = True  # likewise undefined in the original snippet
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=float(stddev) * np.ones(nb_actions))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
def __init__(self, exp_folder, observation_shape, action_shape):
    tf.reset_default_graph()
    self.sess = tf.Session()

    conf_file = exp_folder + 'conf.yaml'
    with open(conf_file, 'r') as f:
        conf = yaml.safe_load(f)  # yaml.load() without an explicit Loader is deprecated

    self.obs = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
    observation_range = (-5., 5.)
    normalized_obs0 = tf.clip_by_value(self.obs, observation_range[0], observation_range[1])
    self.actor = Actor(action_shape[0])(normalized_obs0, reuse=False)

    # self.sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.sess, exp_folder + 'latest_graph')
def __init__(self, hyperparams, dX, dU):
    """Initializes the policy.

    Args:
        hyperparams: Dictionary of hyperparameters.
        dX: Dimension of state space.
        dU: Dimension of action space.
    """
    PolicyOpt.__init__(self, hyperparams, dX, dU)
    self.dX = dX
    self.dU = dU
    self.epochs = hyperparams['epochs']
    self.param_noise_adaption_interval = hyperparams['param_noise_adaption_interval']
    set_global_seeds(hyperparams['seed'])

    # Initialize DDPG policy
    self.pol = DDPG(Actor(dU, network=hyperparams['network'], **hyperparams['network_kwargs']),
                    Critic(network=hyperparams['network'], **hyperparams['network_kwargs']),
                    Memory(limit=hyperparams['memory_limit'], action_shape=(dU,),
                           observation_shape=(dX,)),
                    observation_shape=(dX,), action_shape=(dU,),
                    param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                       desired_action_stddev=0.2),
                    **hyperparams['ddpg_kwargs'])
    sess = get_session()
    self.pol.initialize(sess)
    sess.graph.finalize()
    self.policy = self  # Act method is contained in this class
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation,
        bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'),
                                                force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
    #                 observation_shape=env.observation_space.shape)
    memory = ESMemoryAdapter(limit=int(kwargs['buffer_size']),
                             action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape,
                             forgetting_factor=kwargs['gamma'],
                             overwrite_policy=kwargs['buffer_overwrite'],
                             sample_policy=kwargs['buffer_sample'],
                             batch_size=kwargs['batch_size'])
    del kwargs['buffer_size']
    del kwargs['buffer_overwrite']
    del kwargs['buffer_sample']
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    trial = int(logger.get_dir()[-3:])
    seed = seed + 1000000 * rank + 10000 * trial
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the opensim env.
    train_env = prosthetics_env.Wrapper(
        osim_env.ProstheticsEnv(visualize=kwargs['render']),
        frameskip=kwargs['frameskip'],
        reward_shaping=kwargs['reward_shaping'],
        reward_shaping_x=kwargs['reward_shaping_x'],
        feature_embellishment=kwargs['feature_embellishment'],
        relative_x_pos=kwargs['relative_x_pos'],
        relative_z_pos=kwargs['relative_z_pos'])
    train_env.change_model(model=kwargs['model'].upper(),
                           prosthetic=kwargs['prosthetic'],
                           difficulty=kwargs['difficulty'],
                           seed=seed)
    if rank == 0:
        train_env = bench.Monitor(train_env, None)
    else:
        train_env = bench.Monitor(train_env,
                                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation:
        eval_env = prosthetics_env.EvaluationWrapper(
            osim_env.ProstheticsEnv(visualize=kwargs['render_eval']),
            frameskip=kwargs['eval_frameskip'],
            reward_shaping=kwargs['reward_shaping'],
            reward_shaping_x=kwargs['reward_shaping_x'],
            feature_embellishment=kwargs['feature_embellishment'],
            relative_x_pos=kwargs['relative_x_pos'],
            relative_z_pos=kwargs['relative_z_pos'])
        eval_env.change_model(model=kwargs['model'].upper(),
                              prosthetic=kwargs['prosthetic'],
                              difficulty=kwargs['difficulty'],
                              seed=seed)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
    else:
        eval_env = None

    # training.train() doesn't like the extra keyword args added for controlling the
    # prosthetics env, so remove them.
    del kwargs['model']
    del kwargs['prosthetic']
    del kwargs['difficulty']
    del kwargs['reward_shaping_x']
    del kwargs['frameskip']
    del kwargs['eval_frameskip']
    del kwargs['crowdai_submit']
    del kwargs['eval_only']

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = train_env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    actor_layer_sizes = [
        int(x) for x in kwargs['actor_layer_sizes'].replace('[', '').replace(']', '').split(',')
    ]
    critic_layer_sizes = [
        int(x) for x in kwargs['critic_layer_sizes'].replace('[', '').replace(']', '').split(',')
    ]
    del kwargs['actor_layer_sizes']
    del kwargs['critic_layer_sizes']
    logger.info('actor_layer_sizes', actor_layer_sizes)
    logger.info('critic_layer_sizes', critic_layer_sizes)

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=train_env.action_space.shape,
                    observation_shape=train_env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm, activation=kwargs['activation'],
                    layer_sizes=critic_layer_sizes)
    actor = Actor(nb_actions, layer_norm=layer_norm, activation=kwargs['activation'],
                  layer_sizes=actor_layer_sizes)
    del kwargs['activation']

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    train_env.seed(seed)
    if eval_env:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=train_env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    train_env.close()
    if eval_env:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
class DDPG(tf.Module):
    def __init__(self, actor, critic, memory, observation_shape, action_shape,
                 param_noise=None, action_noise=None, gamma=0.99, tau=0.001,
                 normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4,
                 critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape,
                                    name='target_critic', network=critic.network,
                                    **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape,
                                  name='target_actor', network=actor.network,
                                  **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables,
                                          self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_param_noise(self):
        assert self.param_noise is not None

        # Configure perturbed actor.
        self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape,
                                     name='param_noise_actor', network=self.actor.network,
                                     **self.actor.network_kwargs)

        # Configure separate copy for stddev adaptation.
        self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape,
                                              name='adaptive_param_noise_actor',
                                              network=self.actor.network,
                                              **self.actor.network_kwargs)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1

    @tf.function
    def step(self, obs, apply_noise=True, compute_Q=True):
        normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms),
                                          self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs)
        if self.param_noise is not None and apply_noise:
            action = self.perturbed_actor(normalized_obs)
        else:
            action = actor_tf

        if compute_Q:
            normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf)
            q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf,
                                             self.return_range[0], self.return_range[1]),
                            self.ret_rms)
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        action = tf.clip_by_value(action, self.action_range[0], self.action_range[1])
        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
        actions = tf.constant(batch['actions'])
        rewards = tf.constant(batch['rewards'])
        terminals1 = tf.constant(batch['terminals1'], dtype=tf.float32)
        normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(
            obs0, obs1, rewards, terminals1)

        if self.normalize_returns and self.enable_popart:
            old_mean = self.ret_rms.mean
            old_std = self.ret_rms.std
            self.ret_rms.update(target_Q.flatten())
            # Renormalize Q outputs.
            new_mean = self.ret_rms.mean
            new_std = self.ret_rms.std
            for vs in [self.critic.output_vars, self.target_critic.output_vars]:
                kernel, bias = vs
                kernel.assign(kernel * old_std / new_std)
                bias.assign((bias * old_std + old_mean - new_mean) / new_std)

        actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
        critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q)

        if MPI is not None:
            self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
            self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
        else:
            self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
            self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        return critic_loss, actor_loss

    @tf.function
    def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1):
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        Q_obs1 = denormalize(self.target_critic(normalized_obs1,
                                                self.target_actor(normalized_obs1)),
                             self.ret_rms)
        target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
        return normalized_obs0, target_Q

    @tf.function
    def get_actor_grads(self, normalized_obs0):
        with tf.GradientTape() as tape:
            actor_tf = self.actor(normalized_obs0)
            normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
            critic_with_actor_tf = denormalize(
                tf.clip_by_value(normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            actor_loss = -tf.reduce_mean(critic_with_actor_tf)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        if self.clip_norm:
            actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads]
        if MPI is not None:
            actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0)
        return actor_grads, actor_loss

    @tf.function
    def get_critic_grads(self, normalized_obs0, actions, target_Q):
        with tf.GradientTape() as tape:
            normalized_critic_tf = self.critic(normalized_obs0, actions)
            normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms),
                                                           self.return_range[0],
                                                           self.return_range[1])
            critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))
            if self.critic_l2_reg > 0.:
                # The first is the input layer, which is ignored here.
                for layer in self.critic.network_builder.layers[1:]:
                    # The original l2_regularizer takes half of sum square.
                    critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum(tf.square(layer.kernel))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        if self.clip_norm:
            critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads]
        if MPI is not None:
            critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0)
        return critic_grads, critic_loss

    def initialize(self):
        if MPI is not None:
            sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @tf.function
    def update_target_net(self):
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)

        obs0 = self.stats_sample['obs0']
        actions = self.stats_sample['actions']
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_critic_tf = self.critic(normalized_obs0, actions)
        critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf,
                                                 self.return_range[0], self.return_range[1]),
                                self.ret_rms)
        actor_tf = self.actor(normalized_obs0)
        normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
        critic_with_actor_tf = denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        stats = {}
        if self.normalize_returns:
            stats['ret_rms_mean'] = self.ret_rms.mean
            stats['ret_rms_std'] = self.ret_rms.std
        if self.normalize_observations:
            stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
            stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
        stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
        stats['reference_Q_std'] = reduce_std(critic_tf)
        stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
        stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
        stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
        stats['reference_action_std'] = reduce_std(actor_tf)

        if self.param_noise:
            perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
            stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf)
            stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf)
            stats.update(self.param_noise.get_stats())
        return stats

    def adapt_param_noise(self, obs0):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        mean_distance = self.get_mean_distance(obs0).numpy()
        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    @tf.function
    def get_mean_distance(self, obs0):
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        update_perturbed_actor(self.actor, self.perturbed_adaptive_actor,
                               self.param_noise.current_stddev)

        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs0)
        adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0)
        mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf)))
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            update_perturbed_actor(self.actor, self.perturbed_actor,
                                   self.param_noise.current_stddev)
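For orientation, here is a minimal driver loop for the class above. It is a sketch, not part of the original sources: the Pendulum-v0 choice, the Actor/Critic constructor arguments (inferred from how the class builds its own target networks), and the training cadence are all illustrative.

import gym
import numpy as np

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

# Constructor signatures inferred from the target-network construction above.
actor = Actor(nb_actions, env.observation_space.shape, name='actor', network='mlp')
critic = Critic(nb_actions, env.observation_space.shape, name='critic', network='mlp')
memory = Memory(limit=int(1e5), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
             param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                desired_action_stddev=0.2))
agent.initialize()
agent.reset()

obs = env.reset()
for t in range(10000):
    # step() works on batched observations and returns (action, q, None, None).
    action, q, _, _ = agent.step(np.asarray([obs], dtype=np.float32))
    action = action.numpy()
    new_obs, reward, done, _ = env.step(env.action_space.high * action[0])
    agent.store_transition(np.asarray([obs]), action, np.asarray([reward]),
                           np.asarray([new_obs]), np.asarray([done]))
    obs = env.reset() if done else new_obs
    if done:
        agent.reset()
    if memory.nb_entries >= agent.batch_size and t % 50 == 0:
        critic_loss, actor_loss = agent.train()
        agent.update_target_net()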
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    logging.basicConfig(filename='noGazebo_ddpg.log', level=logging.DEBUG, filemode="w")
    logging.getLogger().addHandler(logging.StreamHandler())

    # Configure logger for the process with rank 0 (the main process).
    # MPI = Message Passing Interface, for parallel computing;
    # rank = process identifier within a group of processes.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logging.debug("I'm MPI worker {} and I guess I just log nothing".format(rank))
        logger.set_level(logger.DISABLED)
        logging.disable(logging.CRITICAL)

    logging.info("********************************************* Starting RL algorithm *********************************************")
    now = datetime.datetime.now()
    logging.info(now.isoformat())

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env,
                        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[0]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components (initialize memory, critic & actor objects).
    logging.info("action space of env: {}".format(env.action_space))            # Box(2,)
    logging.info("observation space of env: {}".format(env.observation_space))  # Box(51200,)
    memory = Memory(limit=int(1e4),
                    action_shape=(env.action_space.shape[0],),
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train the RL algorithm.
    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)

    # Training is done.
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))

    now = datetime.datetime.now()
    logging.info(now.isoformat())
    logging.info("********************************************* End of RL algorithm *********************************************")
    return True
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                             desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2

        env = gym.make("Pendulum-v0")
        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape,
                     env.action_space.shape, batch_size=batch_size,
                     param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()

        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()
                # Take action and update exploration to the newest value.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)
                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs
                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance',
                                      round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder, load_policy,
          video_width, video_height, plot_rewards, save_every=50, seed=1234,
          episode_length=1000, pi_hid_size=150, pi_num_hid_layers=3,
          render_frames=_render_frames, **kwargs):
    num_cpu = self.workers
    if sys.platform == 'darwin':
        num_cpu //= 2

    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_cpu,
        inter_op_parallelism_threads=num_cpu)

    if self.gpu_usage is None or self.gpu_usage <= 0.:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
    tf.Session(config=config).__enter__()

    worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(worker_seed)
    tf.set_random_seed(worker_seed)
    np.random.seed(worker_seed)

    save_every = max(1, save_every)

    env = env_fn()
    env.seed(worker_seed)

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed, logger.get_dir()))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
        allow_early_resets=True)
    gym.logger.setLevel(logging.INFO)

    that = self

    iter_name = 'iters_so_far'
    if self.method == 'sql':
        iter_name = 'epoch'

    # TODO replace with utils.create_callback(...)
    def callback(locals, globals):
        if that.method != "ddpg":
            if load_policy is not None and locals[iter_name] == 0:
                # noinspection PyBroadException
                try:
                    utils.load_state(load_policy)
                    if MPI.COMM_WORLD.Get_rank() == 0:
                        logger.info("Loaded policy network weights from %s." % load_policy)
                except:
                    logger.error("Failed to load policy network weights from %s." % load_policy)
            # save TensorFlow summary (contains at least the graph definition)
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                _ = tf.summary.FileWriter(folder, tf.get_default_graph())

        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
            print('Saving video and checkpoint for policy at iteration %i...' % locals[iter_name])
            ob = env.reset()
            images = []
            rewards = []
            max_reward = 1.  # if any reward > 1, we have to rescale
            lower_part = video_height // 5
            for i in range(episode_length):
                if that.method == "ddpg":
                    ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                elif that.method == "sql":
                    ac, _ = locals['policy'].get_action(ob)
                elif isinstance(locals['pi'], GaussianMlpPolicy):
                    ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                else:
                    ac, _ = locals['pi'].act(False, ob)
                ob, rew, new, _ = env.step(ac)
                images.append(render_frames(env))
                if plot_rewards:
                    rewards.append(rew)
                    max_reward = max(rew, max_reward)
                if new:
                    break

            orange = np.array([255, 163, 0])
            red = np.array([255, 0, 0])
            video = []
            width_factor = 1. / episode_length * video_width
            for i, imgs in enumerate(images):
                for img in imgs:
                    img[-lower_part, :10] = orange
                    img[-lower_part, -10:] = orange
                    if episode_length < video_width:
                        p_rew_x = 0
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, p_rew_x:rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                            p_rew_x = rew_x
                    else:
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, rew_x] = orange
                video.append(np.hstack(imgs))

            imageio.mimsave(
                os.path.join(folder, "videos",
                             "%s_%s_iteration_%i.mp4" % (that.environment, that.method,
                                                         locals[iter_name])),
                video, fps=60)
            env.reset()

            if that.method != "ddpg":
                utils.save_state(os.path.join(that.folder, "checkpoints",
                                              "%s_%i" % (that.environment, locals[iter_name])))

    if self.method == "ppo":
        pposgd_simple.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_actorbatch=1024,  # 256
            clip_param=0.2,
            entcoeff=0.01,
            optim_epochs=4,
            optim_stepsize=1e-3,  # 1e-3
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',  # 'linear'
            callback=callback)
    elif self.method == "trpo":
        trpo_mpi.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_batch=1024,
            max_kl=0.1,  # 0.01
            cg_iters=10,
            cg_damping=0.1,
            gamma=0.99,
            lam=0.98,
            vf_iters=5,
            vf_stepsize=1e-3,
            callback=callback)
    elif self.method == "acktr":
        from algos.acktr import acktr
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            with tf.variable_scope("vf"):
                vf = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)
            acktr.learn(
                env, pi=policy, vf=vf,
                gamma=0.99,
                lam=0.97,
                timesteps_per_batch=1024,
                desired_kl=0.01,  # 0.002
                num_timesteps=num_timesteps,
                animate=False,
                callback=callback)
    elif self.method == "ddpg":
        from algos.ddpg import ddpg

        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import NormalActionNoise
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        ddpg.train(
            env=env,
            eval_env=None,
            param_noise=param_noise,
            render=False,
            render_eval=False,
            action_noise=action_noise,
            actor=actor,
            critic=critic,
            memory=memory,
            callback=callback,
            **kwargs)
    elif self.method == "sql":
        from softqlearning.algorithms import SQL
        from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
        from softqlearning.misc.utils import timestamp
        from softqlearning.replay_buffers import SimpleReplayBuffer
        from softqlearning.value_functions import NNQFunction
        from softqlearning.policies import StochasticNNPolicy

        from rllab.envs.gym_env import GymEnv
        env = GymEnv(env)

        variant = {
            'seed': [1, 2, 3],
            'policy_lr': 3E-4,
            'qf_lr': 3E-4,
            'discount': 0.99,
            'layer_size': 128,
            'batch_size': 128,
            'max_pool_size': 1E6,
            'n_train_repeat': 1,
            'epoch_length': 1000,
            'snapshot_mode': 'last',
            'snapshot_gap': 100,
        }

        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=episode_length,
            epoch_length=episode_length,
            n_epochs=num_timesteps,
            max_path_length=episode_length,
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            iter_callback=callback
        )

        qf = NNQFunction(
            env_spec=env.spec,
            hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
        )

        pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
        policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=qf,
            policy=policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=32,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=variant['qf_lr'],
            policy_lr=variant['policy_lr'],
            discount=variant['discount'],
            reward_scale=1,
            save_full_state=False,
        )

        algorithm.train()
    else:
        print('ERROR: Invalid "method" argument provided.', file=sys.stderr)
    env.close()
def __init__(self, env, gamma, total_timesteps, network='mlp',
             nb_rollout_steps=100,
             reward_scale=1.0,
             noise_type='adaptive-param_0.2',
             normalize_returns=False,
             normalize_observations=False,
             critic_l2_reg=1e-2,
             actor_lr=1e-4,
             critic_lr=1e-3,
             popart=False,
             clip_norm=None,
             nb_train_steps=50,  # per epoch cycle and MPI worker, <- HERE!
             nb_eval_steps=100,
             buffer_size=1000000,
             batch_size=64,  # per MPI worker
             tau=0.01,
             param_noise_adaption_interval=50,
             **network_kwargs):
    # Adjusting hyper-parameters by considering the number of option policies to learn
    num_options = env.get_number_of_options()
    buffer_size = num_options * buffer_size
    batch_size = num_options * batch_size

    observation_space = env.option_observation_space
    action_space = env.option_action_space

    nb_actions = action_space.shape[-1]
    assert (np.abs(action_space.low) == action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=buffer_size, action_shape=action_space.shape,
                    observation_shape=observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    max_action = action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, observation_space.shape, action_space.shape,
                 gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    # Variables that are used during learning
    self.agent = agent
    self.memory = memory
    self.max_action = max_action
    self.batch_size = batch_size
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.param_noise_adaption_interval = param_noise_adaption_interval
def learn(network, env,
          data_path='',
          model_path='./model/',
          model_name='ddpg_none_fuzzy_150',
          file_name='test',
          model_based=False,
          memory_extend=False,
          model_type='linear',
          restore=False,
          dyna_learning=False,
          seed=None,
          nb_epochs=5,          # nb_epochs * nb_epoch_cycles * nb_rollout_steps env steps total
          nb_sample_cycle=5,
          nb_epoch_cycles=150,
          nb_rollout_steps=400,
          nb_model_learning=10,
          nb_sample_steps=50,
          nb_samples_extend=5,
          reward_scale=1.0,
          noise_type='normal_0.2',  # or 'adaptive-param_0.2', 'ou_0.2'
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50,    # per epoch cycle and MPI worker
          batch_size=32,        # per MPI worker
          tau=0.01,
          param_noise_adaption_interval=50,
          **network_kwargs):
    nb_actions = env.action_space.shape[0]
    # Memory expects shape tuples; the original passed env.action_space.shape[0]
    # (an int), which breaks the ring-buffer allocation.
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    if model_based:
        # Separate buffer for model-generated ("fake") transitions.
        fake_memory = Memory(limit=int(1e5),
                             action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        # Select the dynamics/reward model family.
        if model_type == 'gp':
            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            dynamic_model = GaussianProcessRegressor(kernel=kernel)
            reward_model = GaussianProcessRegressor(kernel=kernel)
        elif model_type == 'linear':
            dynamic_model = LinearRegression()
            reward_model = LinearRegression()
        elif model_type == 'mlp':
            # Remaining constructor arguments are left at their sklearn defaults.
            dynamic_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu',
                                         solver='adam', alpha=1e-4,
                                         learning_rate_init=0.001, max_iter=200)
            reward_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu',
                                        solver='adam', alpha=1e-4,
                                        learning_rate_init=0.001, max_iter=200)
        else:
            # The original only logged here and crashed later on the undefined
            # models; failing fast is clearer.
            raise ValueError("model_type must be 'gp', 'linear' or 'mlp' "
                             "to fit the dynamics and reward models")

    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # Set up exploration noise. stddev is parsed to float once here (the original
    # passed the raw string to agent.step on the first cycle); it is reused for
    # agent.step() and decayed each cycle.
    action_noise = None
    param_noise = None
    stddev = 0.2  # assumed fallback; undefined in the original when noise_type is None/'none'
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                stddev = float(stddev)
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=stddev,
                                                     desired_action_stddev=stddev)
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                stddev = float(stddev)
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=stddev * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                stddev = float(stddev)
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=stddev * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Action scaling.
    max_action = env.action_high_bound
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # DDPG agent.
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape[0],
                 gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()
    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    # Label used in the output file names; undefined in the original snippet,
    # so 'ddpg' is assumed here.
    algorithm_name = 'ddpg'

    for epoch in range(nb_epochs):
        logger.info("======================== The {} epoch start !!! "
                    "=========================".format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0

        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            obs_reset = cp.deepcopy(obs)
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info("================== The {} episode start !!! "
                        "===================".format(cycle))

            for t_rollout in range(nb_rollout_steps):
                logger.info("================== The {} steps finish !!! "
                            "===================".format(t_rollout))
                # Predict the next action.
                action, q, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=True)
                new_obs, next_state, r, done, safe_or_not, final_action = env.step(
                    max_action * action, t_rollout)
                if safe_or_not is False:
                    break
                episode_reward += r
                episode_step += 1
                episode_states.append([cp.deepcopy(state),
                                       cp.deepcopy(final_action),
                                       np.array(cp.deepcopy(r)),
                                       cp.deepcopy(next_state)])
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs
                state = next_state
                if done:
                    break

            # Extend the memory with model-generated transitions.
            if model_based and cycle > (nb_model_learning + 1) and memory_extend:
                # NOTE: the 12/18 split hard-codes a 12-dim observation and a
                # 6-dim action; adjust if the env dimensions change.
                pred_x = np.zeros((1, 18), dtype=np.float32)
                for j in range(nb_samples_extend):
                    m_action, _, _, _ = agent.step(obs, stddev, apply_noise=True,
                                                   compute_Q=False)
                    pred_x[:, :12] = obs
                    pred_x[:, 12:] = m_action
                    m_new_obs = dynamic_model.predict(pred_x)[0]
                    # Reward comes from the learned reward model.
                    # state = env.inverse_state(m_new_obs)
                    # m_reward = env.get_reward(state, m_action)
                    m_reward = reward_model.predict(pred_x)[0]
                    agent.store_transition(obs, m_action, m_reward, m_new_obs, done)

            # Generate new data and fit the dynamics/reward models.
            if model_based and cycle > nb_model_learning:
                logger.info("============================== Model Fit !!! "
                            "===============================")
                input_x = np.concatenate((memory.observations0.data[:memory.nb_entries],
                                          memory.actions.data[:memory.nb_entries]), axis=1)
                input_y_obs = memory.observations1.data[:memory.nb_entries]
                input_y_reward = memory.rewards.data[:memory.nb_entries]
                dynamic_model.fit(input_x, input_y_obs)
                reward_model.fit(input_x, input_y_reward)

                if dyna_learning:
                    logger.info("========================= Collect data !!! "
                                "=================================")
                    pred_obs = np.zeros((1, 18), dtype=np.float32)
                    for sample_index in range(nb_sample_cycle):
                        fake_obs = obs_reset
                        for t_episode in range(nb_sample_steps):
                            fake_action, _, _, _ = agent.step(fake_obs, stddev,
                                                              apply_noise=True,
                                                              compute_Q=False)
                            pred_obs[:, :12] = fake_obs
                            pred_obs[:, 12:] = fake_action
                            next_fake_obs = dynamic_model.predict(pred_obs)[0]
                            fake_reward = reward_model.predict(pred_obs)[0]
                            # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            fake_terminals = False
                            # Store the transition before advancing the state; the
                            # original overwrote fake_obs first, so obs0 and obs1
                            # were both the successor observation.
                            fake_memory.append(fake_obs, fake_action, fake_reward,
                                               next_fake_obs, fake_terminals)
                            fake_obs = next_fake_obs

            # Noise decay.
            stddev = float(stddev) * 0.95
            duration = time.time() - start_time
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            epoch_episode_times.append(cp.deepcopy(duration))
            epoch_episode_states.append(cp.deepcopy(episode_states))
            epochs_rewards[epoch, cycle] = episode_reward
            epochs_steps[epoch, cycle] = episode_step
            epochs_times[epoch, cycle] = cp.deepcopy(duration)
            # The original labeled both of these lines "Episode_Times"; the
            # first actually reports the rewards.
            logger.info("============================= The Episode_Rewards:: {}!!! "
                        "============================".format(epoch_episode_rewards))
            logger.info("============================= The Episode_Times:: {}!!! "
                        "============================".format(epoch_episode_times))
            epoch_episodes += 1
            episodes += 1

            # Training process.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and \
                        t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Planning training on model-generated transitions.
            if model_based and cycle > (nb_model_learning + 1) and dyna_learning:
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if fake_memory.nb_entries >= batch_size and \
                            t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    batch = fake_memory.sample(batch_size=batch_size)
                    fake_cl, fake_al = agent.train_fake_data(batch)
                    epoch_critic_losses.append(fake_cl)
                    epoch_actor_losses.append(fake_al)
                    agent.update_target_net()

        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # Save training data once per epoch.
        np.save(data_path + 'train_reward_' + algorithm_name + '_' + noise_type + file_name,
                epochs_rewards)
        np.save(data_path + 'train_step_' + algorithm_name + '_' + noise_type + file_name,
                epochs_steps)
        np.save(data_path + 'train_states_' + algorithm_name + '_' + noise_type + file_name,
                epochs_states)
        np.save(data_path + 'train_times_' + algorithm_name + '_' + noise_type + file_name,
                epochs_times)

        # Save the agent.
        agent.store(model_path + 'train_model_' + algorithm_name + '_' + noise_type + file_name)
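# Self-contained sketch of the model-fitting step used in learn() above: the
# dynamics model maps concat(obs, action) -> next_obs and the reward model maps
# the same input to a scalar reward. The shapes (12-dim obs, 6-dim action)
# mirror the hard-coded 12/18 split in the rollout code; the random data here
# is only a stand-in for the replay-memory contents.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
obs0 = rng.randn(256, 12)                       # observations0
acts = rng.randn(256, 6)                        # actions
obs1 = obs0 + 0.1 * rng.randn(256, 12)          # observations1
rews = rng.randn(256)                           # rewards

input_x = np.concatenate((obs0, acts), axis=1)  # same layout as pred_x in learn()
dynamic_model = LinearRegression().fit(input_x, obs1)
reward_model = LinearRegression().fit(input_x, rews)

pred_x = np.concatenate((obs0[:1], acts[:1]), axis=1)
print(dynamic_model.predict(pred_x)[0].shape)   # (12,) predicted next observation
print(float(reward_model.predict(pred_x)[0]))   # predicted scalar reward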
def run(env_id, seed, noise_type, layer_norm, evaluation, perform,
        use_expert, expert_dir, use_trpo_expert, expert_limit, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    # Evaluation and perform mode are mutually exclusive; evaluation wins.
    if evaluation and perform:
        perform = False
    if (evaluation and rank == 0) or perform:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Optionally load demonstration data.
    if use_expert:
        expert = Expert(limit=expert_limit, env=env)
        if expert_dir is None:
            expert_dir = os.path.join('./expert', env.env.spec.id, 'expert.pkl')
        expert.load_file(expert_dir)
    elif use_trpo_expert:
        assert expert_dir is not None
        expert = Expert(limit=expert_limit, env=env)
        expert.load_file_trpo(expert_dir)
    else:
        expert = None

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, perform=perform, expert=expert, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
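# The noise_type grammar accepted by the run() variants in this file: a
# comma-separated list of specs, each 'none', 'adaptive-param_<stddev>',
# 'normal_<stddev>', or 'ou_<stddev>'. This standalone demo (no baselines
# imports) only shows how the string is split; the real loops build the
# corresponding noise objects from the parsed (kind, stddev) pairs.
def parse_noise_spec(noise_type):
    specs = []
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        kind, stddev = current.split('_')
        specs.append((kind, float(stddev)))
    return specs

print(parse_noise_spec('adaptive-param_0.2,ou_0.3'))
# [('adaptive-param', 0.2), ('ou', 0.3)]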
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the trading env first; the original built it after the evaluation
    # branch, which referenced `env` before it existed.
    # env = gym.make(env_id)
    # dc = TestContainer(num_assets=3, num_samples=20000)
    dc = BitcoinTestContainer(csv_file_name='../../../data/csvs/output.csv')
    env = TradingStateModel(datacontainer=dc,
                            episode_length=kwargs['nb_rollout_steps'],
                            is_training=True,
                            commission_percentage=COMMISSION_PERCENTAGE)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # env = bench.Monitor(env, None)  # TradingStateModel is not a gym.Env
    else:
        eval_env = None

    # Parse noise_type. The action dimension is one portfolio weight per asset,
    # taken from the data container rather than env.action_space.
    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.datacontainer.num_assets
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(num_asset_features=env.datacontainer.total_asset_features,
                    num_actions=env.datacontainer.num_assets,
                    asset_features_shape=env.asset_features_shape,
                    portfolio_features_shape=env.portfolio_features_shape,
                    layer_norm=layer_norm)
    actor = Actor(nb_actions,
                  num_asset_features=env.datacontainer.total_asset_features,
                  num_actions=env.datacontainer.num_assets,
                  asset_features_shape=env.asset_features_shape,
                  portfolio_features_shape=env.portfolio_features_shape,
                  layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    # env.seed(seed)                 # TradingStateModel has no seed()
    # if eval_env is not None:
    #     eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory,
                   tensorboard_directory='./tensorboard_' + str(COMMISSION_PERCENTAGE),
                   infer_directory='./infer_ims_' + str(COMMISSION_PERCENTAGE),
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
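# The trading run() above reads the action dimension and network sizes from the
# env's data container rather than from gym spaces. A minimal stand-in showing
# just the attributes that wiring touches; the classes and values here are
# hypothetical placeholders, not the real TradingStateModel/containers.
class _FakeContainer:
    num_assets = 3
    total_asset_features = 12

class _FakeTradingEnv:
    datacontainer = _FakeContainer()
    asset_features_shape = (3, 4)   # per-asset feature window (placeholder)
    portfolio_features_shape = (4,) # current-weights vector (placeholder)

_env = _FakeTradingEnv()
nb_actions = _env.datacontainer.num_assets  # one portfolio weight per asset
print(nb_actions)  # 3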
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    if env_id == 'navigate':
        env = NavigateEnv(use_camera=False, continuous_actions=True,
                          neg_reward=False, max_steps=500)
    elif env_id == 'toy':
        # env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
        #                                                obstacle_mode=continuous_gridworld.NO_OBJECTS)
        from toy_environment import room_obstacle_list
        env = gridworld.Gridworld(room_obstacle_list.obstacle_list, step_size=0.2)
    elif env_id == 'arm2pos':
        env = Arm2PosEnv(continuous=True, max_steps=500, neg_reward=False)
    elif env_id == 'pick-and-place':
        env = PickAndPlaceEnv(max_steps=500)
    else:
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), str(rank)))
    # env = gym.wrappers.Monitor(env, '/tmp/ddpg/', force=True)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    del kwargs['tb_dir']
    del kwargs['save_path']
    hindsight_mode = kwargs.pop('hindsight_mode')
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, hindsight_mode=hindsight_mode, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
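# The env_id dispatch in run() above is an if/elif chain; an equivalent,
# data-driven alternative keeps the mapping in a dict. A sketch assuming the
# same constructors and imports (NavigateEnv, Arm2PosEnv, PickAndPlaceEnv,
# gym) are in scope as in run() above; gym.make remains the fallback.
def make_custom_env(env_id):
    registry = {
        'navigate': lambda: NavigateEnv(use_camera=False, continuous_actions=True,
                                        neg_reward=False, max_steps=500),
        'arm2pos': lambda: Arm2PosEnv(continuous=True, max_steps=500, neg_reward=False),
        'pick-and-place': lambda: PickAndPlaceEnv(max_steps=500),
    }
    if env_id in registry:
        return registry[env_id]()   # custom env, no bench.Monitor wrapping
    return gym.make(env_id)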
def main():
    args = parse_args()
    logger.configure()

    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)
    top_model_dir = 'top-models/'

    # Create one tf graph and session per model file.
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))

    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            # with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                # tf.global_variables_initializer()
                # Restore agents from model files and store them in ddpg_agents.
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6),
                                action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu',
                                layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])
                agent = DDPG(actor, critic, memory, env.observation_space.shape,
                             env.action_space.shape, gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size, action_noise=action_noise,
                             param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # Restore network weights, then Adam state and param noise.
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)
                saver.restore(sess, restore_model_path)
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # Initialize and prepare the agent session.
                agent.initialize(sess)
                # sess.graph.finalize()
                agent.reset()
                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # Set up the eval env.
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model='3D', prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents,
        #                                                    sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent,
                                                           nb_eval_steps, render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to the crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token,
                                                  env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(
                clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(
            #     agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
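# main() above expects, next to each TF checkpoint, a pickle holding the Adam
# moments and (optionally) the param-noise spec. A sketch of the save-side
# counterpart implied by that load code; the key layout is taken directly from
# the dict accesses above, while `agent` and `save_path` are assumed to exist
# in the caller's scope.
import pickle

def save_adam_state(agent, save_path):
    adam_optimizer_store = {
        'actor_optimizer': {'m': agent.actor_optimizer.m,
                            'v': agent.actor_optimizer.v,
                            't': agent.actor_optimizer.t},
        'critic_optimizer': {'m': agent.critic_optimizer.m,
                             'v': agent.critic_optimizer.v,
                             't': agent.critic_optimizer.t},
        'param_noise': agent.param_noise,
    }
    # Written alongside the checkpoint, matching the "<model>.pkl" path read in main().
    with open(save_path + '.pkl', 'wb') as f:
        pickle.dump(adam_optimizer_store, f)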
# Stand-alone agent-construction snippet. `action_shape` was used but never
# defined in the original; it is inferred here from nb_action.
nb_action = 1
action_shape = (nb_action,)
observation_shape = (3,)
t_train_time = 10000
t_test_time = 10000
network = 'mlp'
action_noise = None
param_noise = None
popart = False  # the original `popart = False,` accidentally created a tuple
load_path = None  # set to 'ddpg_model' to restore a saved model

memory = Memory(limit=int(1e6), action_shape=action_shape,
                observation_shape=observation_shape)
critic = Critic(network=network)
actor = Actor(nb_action, network=network)
agent = DDPG(actor, critic, memory, observation_shape, action_shape,
             gamma=0.99, tau=0.01,
             normalize_returns=False, normalize_observations=True,
             batch_size=32, action_noise=action_noise, param_noise=param_noise,
             critic_l2_reg=1e-2, actor_lr=1e-4,
             # The original snippet was cut off after actor_lr; the remaining
             # arguments are reconstructed to match the other DDPG calls in
             # this file.
             critic_lr=1e-3, enable_popart=popart, clip_norm=None,
             reward_scale=1.0)
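# To drive the agent constructed above, the surrounding code presumably follows
# the same session pattern as learn() earlier in this document: grab a session,
# initialize the agent, freeze the graph, and reset the noise state. A minimal
# sketch assuming the same U.get_session() helper is in scope (restoring from
# load_path would go through agent.restore as in learn()).
sess = U.get_session()
agent.initialize(sess)   # builds variables; mirrors learn()'s non-restore path
sess.graph.finalize()    # guard against accidental graph mutation at run time
agent.reset()            # re-draw exploration noise state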
def run(env_id, seed, noise_type, layer_norm, evaluation, outdir, no_hyp, **kwargs):
    params = locals()

    # Configure things.
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)
    rank = 0

    # Create envs.
    env = make_env(env_id)
    weight_file = kwargs.pop('weight_file')
    if not weight_file:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id)
    else:
        outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id, 'eval')
    logger.configure(outdir)
    os.makedirs(outdir, exist_ok=True)
    env = bench.Monitor(env, os.path.join(outdir, "%i.monitor.json" % rank))
    gym.logger.setLevel(logging.WARN)
    logger.info('Output directory:{}, env:{}, no_hyp:{}'.format(outdir, env_id, no_hyp))

    if evaluation:
        eval_env = make_env(env_id)
        eval_env.seed(42)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e5), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    # critic = models.ConvCritic(layer_norm=layer_norm)
    # actor = models.ConvActor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    # set_global_seeds(seed)
    # env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()

    if weight_file:
        evaluate(env,
                 nb_episodes=kwargs.get('nb_epochs', 100),
                 reward_scale=kwargs.get('reward_scale'),
                 render=kwargs.get('render'),
                 param_noise=None,
                 action_noise=None,
                 actor=actor,
                 critic=critic,
                 critic_l2_reg=kwargs.get('critic_l2_reg'),
                 memory=memory,
                 weight_file=weight_file)
    else:
        training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                       action_noise=action_noise, actor=actor, critic=critic,
                       memory=memory, outdir=outdir, no_hyp=no_hyp, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
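# run() above switches on weight_file (popped from kwargs): when a weight file
# is supplied it calls evaluate(), otherwise it trains. A hedged sketch of both
# invocations; the env id and paths are placeholders, not values from this file,
# and the calls are left commented because they need the full environment stack.
common = dict(env_id='Pendulum-v0', seed=0, noise_type='ou_0.2',
              layer_norm=True, evaluation=True, outdir='/tmp/exp', no_hyp=False)
# Training: no weight file, so the else-branch runs training.train(...).
# run(weight_file=None, nb_epochs=100, **common)
# Evaluation: a weight file routes into evaluate(...) instead.
# run(weight_file='/tmp/exp/weights.ckpt', nb_epochs=10, **common)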