def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(0)))

    if evaluation:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, **kwargs):
    # Create envs.
    env = gym.make(env_id)

    # Parse noise_type.
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    if seed > 0:
        np.random.seed(seed)
        torch.manual_seed(seed)
        random.seed(seed)
        env.seed(seed)
        torch.cuda.manual_seed(seed)

    start_time = time.time()
    train(env=env, action_noise=action_noise, memory=memory, **kwargs)
    env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
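# Usage sketch for the PyTorch variant of run() above; this block is an illustration added
# here, not part of the original script. The noise_type string is parsed as comma-separated
# "<kind>_<stddev>" tokens ('none', 'normal_<std>' or 'ou_<std>'), and any extra keyword
# arguments are forwarded to train() via **kwargs (nb_epochs below is an assumption).
if __name__ == '__main__':
    run(env_id='Pendulum-v0', seed=1, noise_type='ou_0.2', nb_epochs=100)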
def make(self):
    env = gym.make(self.env_id)
    if logger.get_dir() is not None:
        monitor_dir = os.path.join(logger.get_dir(), "gym_monitor")
        resume = True
        force = False
    else:
        monitor_dir = "/tmp/gym-monitoring"
        resume = False
        force = True
    env = gym.wrappers.Monitor(env, directory=monitor_dir, video_callable=False,
                               force=force, resume=resume, write_upon_reset=True)
    if isinstance(env.unwrapped, AtariEnv):
        if '-ram-' in self.env_id:
            assert 'NoFrameskip' not in self.env_id
            env = ScaledFloatFrame(env)
        else:
            env = ScaledFloatFrame(wrap_atari_pg(env))
    return env
def train(self):
    obs = self._env.reset()
    episode_rewards = []
    n_episodes = 0
    l_episode_return = deque([], maxlen=10)
    l_discounted_episode_return = deque([], maxlen=10)
    l_tq_squared_error = deque(maxlen=50)
    log_itr = -1
    for itr in range(self._initial_step, self._max_steps):
        act = self.eps_greedy(obs[np.newaxis, :], self.exploration.value(itr))
        next_obs, rew, done, _ = self._env.step(act)
        if self._render:
            self._env.render()
        self._replay_buffer.add(obs, act, rew, next_obs, float(done))
        episode_rewards.append(rew)
        if done:
            obs = self._env.reset()
            episode_return = np.sum(episode_rewards)
            discounted_episode_return = np.sum(
                episode_rewards * self._discount ** np.arange(len(episode_rewards)))
            l_episode_return.append(episode_return)
            l_discounted_episode_return.append(discounted_episode_return)
            episode_rewards = []
            n_episodes += 1
        else:
            obs = next_obs
        if itr % self._target_q_update_freq == 0 and itr > self._learning_start_itr:
            self._update_target_q()
        if itr % self._train_q_freq == 0 and itr > self._learning_start_itr:
            # Sample from replay buffer.
            l_obs, l_act, l_rew, l_obs_prime, l_done = self._replay_buffer.sample(
                self._opt_batch_size)
            # Train Q value function with sampled data.
            td_squared_error = self.train_q(l_obs, l_act, l_rew, l_obs_prime, l_done)
            l_tq_squared_error.append(td_squared_error)
        if (itr + 1) % self._log_freq == 0 and len(l_episode_return) > 5:
            log_itr += 1
            logger.logkv('Iteration', log_itr)
            logger.logkv('Steps', itr)
            logger.logkv('Epsilon', self.exploration.value(itr))
            logger.logkv('Episodes', n_episodes)
            logger.logkv('AverageReturn', np.mean(l_episode_return))
            logger.logkv('AverageDiscountedReturn', np.mean(l_discounted_episode_return))
            logger.logkv('TDError^2', np.mean(l_tq_squared_error))
            logger.dumpkvs()
            self._q.dump(logger.get_dir() + '/weights.pkl')
def test(self, epsilon):
    try:
        self._q.set_params(self._q.load(logger.get_dir() + '/weights.pkl'))
    except Exception as e:
        print(e)
    obs = self._env.reset()
    while True:
        act = self.eps_greedy(obs[np.newaxis, :], epsilon)
        obs_prime, rew, done, _ = self._env.step(act)
        self._env.render()
        if done:
            obs = self._env.reset()
            print('Done!')
            time.sleep(1)
        else:
            obs = obs_prime
def start_experiment(**args):
    log, tf_sess, ldir = get_experiment_environment(**args)
    make_env = partial(make_env_all_params, add_monitor=True, args=args, logdir=ldir)
    trainer = Trainer(make_env=make_env,
                      num_timesteps=args['num_timesteps'],
                      hps=args,
                      envs_per_process=args['envs_per_process'])
    with log, tf_sess:
        logdir = logger.get_dir()
        print("results will be saved to ", logdir)
        with open(os.path.join(logdir, 'run_config.yaml'), 'w') as outfile:
            yaml.dump(args, outfile, default_flow_style=False)
        trainer.train()
def save_act(self, path=None):
    """Save model to a pickle located at `path`."""
    if path is None:
        path = os.path.join(logger.get_dir(), "model.pkl")

    with tempfile.TemporaryDirectory() as td:
        save_variables(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        cloudpickle.dump((model_data, self._act_params), f)
def run(v):
    np.random.seed(v['seed'])
    env_maker = EnvMaker('Pendulum-v0')
    env = env_maker.make()
    policy = GaussianMLPPolicy(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=chainer.functions.tanh,
    )
    if v['baseline'] == 'mlp':
        baseline = MLPBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=chainer.functions.tanh,
        )
    elif v['baseline'] == 'time_dependent':
        baseline = TimeDependentBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
        )
    elif v['baseline'] == 'linear_feature':
        baseline = LinearFeatureBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
        )
    else:
        raise ValueError
    trpo(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        n_iters=100,
        snapshot_saver=SnapshotSaver(logger.get_dir()),
    )
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)
    # alg_kwargs['checkpoint_path'] = args.save_path

    env = build_env(args)
    # if args.save_video_interval != 0:
    #     env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"),
    #                            record_video_trigger=lambda x: x % args.save_video_interval == 0,
    #                            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    # Run PPO learn.
    if args.evaluate is True:
        eval_env = build_env(args)
        if args.save_video_interval != 0:
            eval_env = VecVideoRecorder(eval_env, osp.join(logger.get_dir(), "videos"),
                                        record_video_trigger=lambda x: x % args.save_video_interval == 0,
                                        video_length=args.save_video_length)
    else:
        eval_env = None

    model = learn(env=env,
                  eval_env=eval_env,
                  seed=seed,
                  total_timesteps=total_timesteps,
                  **alg_kwargs)

    return model, env
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 env_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 initializer=None,
                 force_dummy=False):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, initializer=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            env_kwargs=env_kwargs,
            logger_dir=logger_dir,
            initializer=initializer)

    set_global_seeds(seed)
    return DummyVecEnv([
        make_thunk(i + start_index, initializer=None) for i in range(num_env)
    ])
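# Usage sketch (illustration, not part of the original module): build a small vectorized env
# and step it once with random actions. Assumes the baselines-style make_env / DummyVecEnv
# helpers imported by this module resolve for the chosen env id.
if __name__ == '__main__':
    venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
    obs = venv.reset()  # batched observations, one row per env copy
    obs, rews, dones, infos = venv.step([venv.action_space.sample() for _ in range(4)])
    print(obs.shape, rews, dones)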
def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    if self.hps['ckptpath'] is not None:
        self.agent.restore_model(logdir=self.hps['ckptpath'], exp_name=self.hps['exp_name'])
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
            if info['update']['n_updates'] % 60 == 0:
                self.agent.save_model(logdir=logger.get_dir(),
                                      exp_name=self.hps['exp_name'],
                                      global_step=info['update']['n_updates'])
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def train(self, args):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    sess = tf.get_default_session()
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    checkdir = osp.join(logger.get_dir(), 'checkpoints', args['env'] + '-' + args['policy'])
    os.makedirs(checkdir, exist_ok=True)

    load_weights = args['load_weights']
    start_nupdates = 0
    if load_weights is not None:
        load_path = osp.join(checkdir, load_weights)
        start_nupdates = int(load_weights)
        print('Loading checkpoint from %s ' % load_weights)
        self.load(load_path)

    while True:
        info = self.agent.step()
        if info['update']:
            print('task = ', args['env'], ' num_env = ', args['num_env'], ' policy = ', args['policy'])
            info['update']['n_updates'] += start_nupdates
            info['update']['tcount'] += start_nupdates * args['nsteps_per_env'] * args['num_env']
            logger.logkvs(info['update'])
            logger.dumpkvs()
            print('Time elapsed ' + str(datetime.timedelta(seconds=info['update']['total_secs'])))
            if info['update']['n_updates'] % 10 == 0 or info['update']['n_updates'] == 1:
                weights_index = info['update']['n_updates']
                savepath = osp.join(checkdir, '%.5i' % weights_index)
                print('Saving to', savepath)
                self.save(savepath)
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def eval_model(self, ep_num):
    for i in range(self.num_episodes):
        ep_images = []
        ob = self.env.reset()
        ob = np.array(ob)
        eprews = []
        if i == 0:
            ep_images.append(self.env.last_observation)
        for step in range(900):
            action, vpred, nlp = self.policy.get_ac_value_nlp_eval(ob)
            ob, rew, done, info = self.env.step(action[0])
            if i == 0:
                ep_images.append(self.env.last_observation)
            if rew is None:
                eprews.append(0)
            else:
                eprews.append(rew)
            if done:
                print("Episode finished after {} timesteps".format(step + 1))
                print("Episode Reward is {}".format(sum(eprews)))
                break

        # Write the frames collected for the first episode out as a video under the log dir.
        dirname = logger.get_dir()
        image_folder = 'images'
        image_path = os.path.join(dirname, image_folder)
        os.makedirs(image_path, exist_ok=True)  # succeeds even if the directory exists
        vid_file = os.path.join(image_path,
                                self.exp_name + "_{}_{}_".format(ep_num, i) + ".avi")
        out = cv2.VideoWriter(vid_file, cv2.VideoWriter_fourcc(*'DIVX'), 15, (64, 64))
        for j in range(len(ep_images)):
            frame = cv2.cvtColor(ep_images[j], cv2.COLOR_RGB2BGR)
            out.write(frame)
        out.release()
        print("Episode {} cumulative reward: {}".format(i, sum(eprews)))
def train(self, args):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    sess = tf.get_default_session()
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    checkdir = osp.join(logger.get_dir(), 'checkpoints', args['env'])
    os.makedirs(checkdir, exist_ok=True)

    load_weights = args['load_weights']
    start_nupdates = 0
    if load_weights is not None:
        load_path = osp.join(checkdir, load_weights)
        start_nupdates = int(load_weights)
        print('Loading checkpoint from %s ' % load_weights)
        self.load(load_path)

    while True:
        info = self.agent.step()
        if info['update']:
            info['update']['n_updates'] += start_nupdates
            info['update']['tcount'] += start_nupdates * args['nsteps_per_seg'] * args['envs_per_process']
            logger.logkvs(info['update'])
            logger.dumpkvs()
            print('eplen = ', info['update']['eplen'])
            if info['update']['n_updates'] % 10 == 0 or info['update']['n_updates'] == 1:
                weights_index = info['update']['n_updates']
                savepath = osp.join(checkdir, '%.5i' % weights_index)
                print('Saving to', savepath)
                self.save(savepath)
            if info['update']['eplen'] < 950.0:
                print('final eplen = ', info['update']['eplen'])
                break
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    sess = tf.get_default_session()
    self.save = functools.partial(save_variables, sess=sess)
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
            if info['update']['n_updates'] % 10 == 0:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % info['update']['n_updates'])
                print('Saving to', savepath)
                self.save(savepath)
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def main():
    """Run the Atari test."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env.action_space.seed(args.seed)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy_class=CnnPolicy,
                buffer_size=10000,
                learning_rate=1e-4,
                learning_starts=10000,
                target_network_update_freq=1000,
                train_freq=4,
                exploration_final_eps=0.01,
                exploration_fraction=0.1,
                prioritized_replay=True,
                model_path='atari_test_Breakout')
    model.learn(total_timesteps=args.num_timesteps)
    env.close()
def run(v):
    np.random.seed(v['seed'])
    env_maker = EnvMaker('CartPole-v0')
    env = env_maker.make()
    policy = CategoricalMLPPolicy(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec
    )
    baseline = MLPBaseline(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec
    )
    trpo(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        n_iters=100,
        snapshot_saver=SnapshotSaver(logger.get_dir())
    )
def _thunk():
    env = gym.make(args.env_id)
    env.seed(args.seed)  # seed the env to make the results more reproducible
    env = Monitor(env, logger.get_dir(), allow_early_resets=True)
    return env
def learn(*, network, env, total_timesteps, starting_positions, env_name,
          win_percentage=0.5, eval_env=None, seed=None, nsteps=2048,
          ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99,
          lam=0.95, log_interval=10, nminibatches=4, noptepochs=4,
          cliprange=0.2, save_interval=0, load_path=None, model_fn=None,
          **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard architecture, or a function that takes a tensorflow tensor as input and returns
        a tuple (output_tensor, extra_feed), where output_tensor is the last network layer output
        and extra_feed is None for feed-forward nets or a dictionary describing how to feed state
        into the network for recurrent nets. See common/models.py/lstm for more details on using
        recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. Environments produced by gym.make can be wrapped using the
        baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv where nenv is the number of environment copies simulated in parallel).

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment).

    ent_coef: float, policy entropy coefficient in the optimization objective.

    lr: float or function, learning rate; constant or a schedule function [0,1] -> R+ where 1 is
        the beginning of training and 0 is the end.

    vf_coef: float, value function loss coefficient in the optimization objective.

    max_grad_norm: float or None, gradient norm clipping coefficient.

    gamma: float, discounting factor.

    lam: float, advantage estimation discounting factor (lambda in the paper).

    log_interval: int, number of updates between logging events.

    nminibatches: int, number of training minibatches per update. For recurrent policies, should
        be smaller than or equal to the number of environments run in parallel.

    noptepochs: int, number of training epochs per update.

    cliprange: float or function, clipping range; constant or a schedule function [0,1] -> R+
        where 1 is the beginning of training and 0 is the end.

    save_interval: int, number of updates between saving events.

    load_path: str, path to load the model from.

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a particular type of network.
        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of envs.
    nenvs = env.num_envs

    # Get ob_space and ac_space.
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch sizes.
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model).
    if model_fn is None:
        from ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                     ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)

    current_starting_position = starting_positions.pop()

    # Instantiate the runner objects.
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    starting_position=current_starting_position)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma,
                             lam=lam, starting_position=current_starting_position)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer.
    tfirststart = time.time()

    start_changes = []
    reached_goal = []

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer.
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate.
        lrnow = lr(frac)
        # Calculate the cliprange.
        cliprangenow = cliprange(frac)
        # Get minibatch.
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        if env_name == "MountainCar-v0":
            done_obs = obs[masks]
            # Number of episodes that finished in this batch.
            n_eps = done_obs.shape[0]
            # The goal is reached if the position is >= 0.5.
            n_goal_reached = (done_obs[:, 0] >= 0.5).sum()
            reached_goal.extend([
                done + update * nsteps - nsteps
                for done in np.where(done_obs[:, 0] >= 0.5)[0]
            ])
            if (n_goal_reached / n_eps) > win_percentage and len(starting_positions) > 0:
                start_changes.append(update * nsteps)
                current_starting_position = starting_positions.pop()
                runner.env.starting_position = current_starting_position
                if eval_env is not None:
                    eval_runner.env.starting_position = current_starting_position

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Create the index array over the batch.
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes.
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step.
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update.
        lossvals = np.mean(mblossvals, axis=0)
        # End timer.
        tnow = time.time()
        # Calculate the fps (frames per second).
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0).
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('start_changes', "_".join([str(s) for s in start_changes]))
            logger.logkv('reached_goal', "_".join([str(goal) for goal in reached_goal]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (
                MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
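# Invocation sketch for the learn() function above (illustration only; the hyperparameters and
# the DummyVecEnv wrapper are assumptions, not part of the original module). starting_positions
# is consumed via pop(), so the last element is used first; the MountainCar-specific curriculum
# logic pops the next position whenever the goal is reached often enough.
if __name__ == '__main__':
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('MountainCar-v0')])
    model = learn(network='mlp', env=venv, env_name='MountainCar-v0',
                  total_timesteps=100000, starting_positions=[-0.5, -0.4, -0.3],
                  nsteps=128, nminibatches=4, ent_coef=0.0, lr=3e-4)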
def retraining(save_path,
               network, env,
               seed=None,
               total_timesteps=None,
               nb_epochs=None,          # with default settings, perform 1M steps total
               nb_epoch_cycles=4,       # 50
               nb_rollout_steps=3,      # 100
               reward_scale=1.0,
               render=False,
               render_eval=False,
               # noise_type='adaptive-param_0.2',
               noise_type='normal_0.2',
               # noise_type='ou_0.9',
               normalize_returns=False,
               normalize_observations=True,
               critic_l2_reg=1e-2,
               actor_lr=1e-4,
               critic_lr=1e-4,
               popart=False,
               gamma=0.99,
               clip_norm=None,
               nb_train_steps=3,        # per epoch cycle and MPI worker, 50
               nb_eval_steps=1,         # 100
               batch_size=640,          # per MPI worker
               tau=0.01,
               eval_env=None,
               param_noise_adaption_interval=3,  # 50
               **network_kwargs):
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape
    # Observation: 4 pairs pos + 1 pair target pos.
    # nb_features = 2 * (env.num_actions + 1) + env.num_actions  # 4 pairs pos + 3 link lengths
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)            # vector
    episodes = 0  # scalar
    t = 0         # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    # Load the initialization policy.
    agent.load_ini(sess, save_path)
    # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)

    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []
        # To check that the actor initialization policy has been loaded correctly (i.e. equals
        # the values stored in the checkpoint files), inspect the weights directly:
        # loaded_weights = tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0')
        # print('loaded_weights:', sess.run(loaded_weights))
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
                print('action:', action)
                new_obs, r, done = env.step(action)
                # time.sleep(0.2)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                # The batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        step_set.append(t)
        plt.plot(step_set, mean_epoch_episode_rewards, color='r', label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        if rank == 0:
            logger.dump_tabular()
        logger.info('')

        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)
    return agent
def train(self):
    # params = self.value_fun._params
    videos = []
    contours = []
    returns = []
    delay_cs = []
    fig = None
    for itr in range(self.max_itr):
        itr_starttime = time.time()
        self.value_fun_update()
        itr_time = time.time() - itr_starttime
        log = itr % self.log_itr == 0 or itr == self.max_itr - 1
        render = (itr % self.render_itr == 0) and self.render
        if log:
            rollout_starttime = time.time()
            average_return, avg_delay_cost, video = rollout(
                self.env, self.policy, num_rollouts=self.num_rollouts,
                render=render, iteration=itr, max_path_length=self.max_path_length)
            rollout_time = time.time() - rollout_starttime
            if render:
                # contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr)
                # contours += [contour]
                videos += video
            returns.append(average_return)
            delay_cs.append(avg_delay_cost)
            logger.logkv('Iteration', itr)
            logger.logkv('Average Returns', average_return)
            logger.logkv('Average Delayed Costs', avg_delay_cost)
            logger.logkv('Iteration Time', itr_time)
            logger.logkv('Policy Rollout Time', rollout_time)
            logger.dumpkvs()

    plot_returns(returns)
    plot_returns(delay_cs, 'delayed_cost')
    # plot_contour(self.env, self.value_fun, save=True, fig=fig)
    # if contours and contours[0] is not None:
    #     contours = list(upsample(np.array(contours), 10))
    #     clip = mpy.ImageSequenceClip(contours, fps=10)
    #     clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())
    if videos:
        fps = int(4 / getattr(self.env, 'dt', 0.1))
        clip = mpy.ImageSequenceClip(videos, fps=fps)
        clip.write_videofile('%s/learning_progress.mp4' % logger.get_dir())

    itr = self.max_itr
    average_return, avg_delay_cost, final_itr_video = rollout(
        self.env, self.policy, num_rollouts=2, render=True, iteration=itr,
        last_max_path_length=self.last_max_path_length, last_iteration=True)
    final_clip = mpy.ImageSequenceClip(final_itr_video, fps=40)
    final_clip.write_videofile('%s/final_rollout.mp4' % logger.get_dir())
    plt.close()
def build_env(cloth_cfg_path=None, render_path=None, start_state_path=None,
              num_env=1, seed=1, alg='ddpg'):
    """Daniel: actually construct the env, using 'vector envs' for parallelism.

    For now our cloth env can follow the non-atari and non-retro stuff, because
    I don't think we need a similar kind of 'wrapping' that they do. Note that
    `VecFrameStack` is needed to stack frames, e.g., in Atari we do 4 frame
    stacking. Without that, the states would be size (84,84,1).

    The non-`args` parameters here are for the cloth env.
    """
    # Adi: Need to modify the next section because there is no 'args' parameter.
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    # nenv = args.num_env or ncpu
    # alg = args.alg
    # seed = args.seed
    # env_type, env_id = get_env_type(args)
    env_type = 'cloth'
    env_id = 'cloth'

    if env_type in {'atari', 'retro'}:
        # NOTE: this branch still references `nenv` and `args`, which are no longer defined here;
        # it is unreachable for the cloth env since env_type is hard-coded to 'cloth' above.
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)
        flatten_dict_observations = alg not in {'her'}

        # Adi: I don't think we want to make a vector environment for now because it's causing
        # a lot of trouble temporarily; let's just start with a single non-vec env.
        # env = make_vec_env(env_id, env_type, num_env or 1, seed,
        #                    reward_scale=1,
        #                    flatten_dict_observations=flatten_dict_observations,
        #                    cloth_cfg_path=cloth_cfg_path,
        #                    render_path=render_path,
        #                    start_state_path=start_state_path)

        # Adi: I have to directly define a few more variables because we are now making a single
        # environment instead of a vector environment. These values are subject to change.
        mpi_rank = 0
        subrank = 0
        reward_scale = 1.0
        gamestate = None
        wrapper_kwargs = None
        logger_dir = logger.get_dir()
        env = make_env(env_id=env_id,
                       env_type=env_type,
                       mpi_rank=mpi_rank,
                       subrank=subrank,
                       seed=seed,
                       reward_scale=reward_scale,
                       gamestate=gamestate,
                       flatten_dict_observations=flatten_dict_observations,
                       wrapper_kwargs=wrapper_kwargs,
                       logger_dir=logger_dir,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)

    if env_type == 'mujoco':
        env = VecNormalize(env)

    return env
def main():
    env = make_env()
    set_global_seeds(env, args.seed)
    agent = PPO(env=env)
    batch_steps = args.n_envs * args.batch_steps  # number of steps per update

    if args.save_interval and logger.get_dir():
        # some saving jobs
        pass

    ep_info_buffer = deque(maxlen=100)
    t_train_start = time.time()
    n_updates = args.n_steps // batch_steps
    runner = Runner(env, agent)

    for update in range(1, n_updates + 1):
        t_start = time.time()
        frac = 1.0 - (update - 1.0) / n_updates
        lr_now = args.lr                  # maybe dynamic change
        clip_range_now = args.clip_range  # maybe dynamic change
        obs, returns, masks, acts, vals, neglogps, advs, rewards, ep_infos = \
            runner.run(args.batch_steps, frac)
        ep_info_buffer.extend(ep_infos)

        loss_infos = []
        idxs = np.arange(batch_steps)
        for _ in range(args.n_epochs):
            np.random.shuffle(idxs)
            for start in range(0, batch_steps, args.minibatch):
                end = start + args.minibatch
                mb_idxs = idxs[start:end]
                minibatch = [arr[mb_idxs] for arr in
                             [obs, returns, masks, acts, vals, neglogps, advs]]
                loss_infos.append(agent.train(lr_now, clip_range_now, *minibatch))

        t_now = time.time()
        time_this_batch = t_now - t_start

        if update % args.log_interval == 0:
            ev = float(explained_variance(vals, returns))
            logger.logkv('updates', str(update) + '/' + str(n_updates))
            logger.logkv('serial_steps', update * args.batch_steps)
            logger.logkv('total_steps', update * batch_steps)
            logger.logkv('time', time_this_batch)
            logger.logkv('fps', int(batch_steps / (t_now - t_start)))
            logger.logkv('total_time', t_now - t_train_start)
            logger.logkv("explained_variance", ev)
            logger.logkv('avg_reward', np.mean([e['r'] for e in ep_info_buffer]))
            logger.logkv('avg_ep_len', np.mean([e['l'] for e in ep_info_buffer]))
            logger.logkv('adv_mean', np.mean(returns - vals))
            logger.logkv('adv_variance', np.std(returns - vals) ** 2)
            loss_infos = np.mean(loss_infos, axis=0)
            for loss_name, loss_info in zip(agent.loss_names, loss_infos):
                logger.logkv(loss_name, loss_info)
            logger.dumpkvs()

        if args.save_interval and update % args.save_interval == 0 and logger.get_dir():
            pass

    env.close()
def get_filename(self, i):
    filename = os.path.join(
        logger.get_dir(), 'env{}_{}.pk'.format(MPI.COMM_WORLD.Get_rank(), i))
    return filename
def main(args):
    # MPI communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # Logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # Env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    # See Mnih et al., 2015 -> Methods -> Training Details.
    env = wrap_deepmind(env, frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode == 'train'),
                        episode_life=(args.mode == 'train'))
    env.seed(workerseed)

    # Agent.
    agent = CnnPolicy(
        img_channels=env.observation_space.shape[-1],
        num_actions=env.action_space.n,
        kind=args.model_type)

    # Optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)

    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, max_lr=args.optim_stepsize, total_steps=max_grad_steps,
        pct_start=0.0, anneal_strategy='linear', cycle_momentum=False, div_factor=1.0)

    # Checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # Sync parameters from rank 0 to all workers.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # Operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch,
              max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs,
              optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam,
              clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir, model_name=args.model_name)
        env.close()
    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()
    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()
    else:
        raise NotImplementedError("Mode of operation not supported!")
def testing(save_path, network, env,
            seed=None,
            total_timesteps=None,
            nb_epochs=None,          # with default settings, perform 1M steps total
            nb_epoch_cycles=50,
            nb_rollout_steps=3,      # 100
            reward_scale=1.0,
            render=False,
            render_eval=False,
            # No noise for test:
            # noise_type='adaptive-param_0.2',
            # noise_type='normal_0.9',
            # noise_type='ou_0.9',
            normalize_returns=False,
            normalize_observations=True,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            gamma=0.99,
            clip_norm=None,
            nb_train_steps=3,        # per epoch cycle and MPI worker, 50
            nb_eval_steps=1,         # 100
            batch_size=640,          # per MPI worker
            tau=0.01,
            eval_env=None,
            param_noise_adaption_interval=3,  # 50
            **network_kwargs):
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape
    # 4 pairs pos + 3 link lengths.
    nb_features = 2 * (env.num_actions + 1) + env.num_actions
    observation_shape = np.array(nb_features * [0]).shape

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # No exploration noise at test time (the noise-type parsing used during training is disabled).
    action_noise = None
    param_noise = None

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    # Prepare everything: restore the trained policy.
    agent.load(sess, save_path)
    # sess.graph.finalize()  # cannot save sess if it is finalized!
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)            # vector
    episodes = 0  # scalar
    t = 0         # scalar
    step_set = []
    reward_set = []

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        print(nb_epochs)
        obs = env.reset()
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # If simulating multiple envs in parallel, it is impossible to reset the agent
                # at the end of the episode in each of the environments, so reset here instead.
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action (no noise for test).
                action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)
                # Execute next action. Here obs has shape (1, 17) and action has shape (1, 6);
                # unlike the original baselines code, no max_action scaling is applied.
                new_obs, r, done = env.step(action)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                # The batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector

            # No training at test time; the loss lists stay empty for the logging below.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        step_set.append(t)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_test.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        if rank == 0:
            logger.dump_tabular()
        logger.info('')

        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. saver = tf.train.Saver() step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
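# The rollout and evaluation loops above execute `max_action * action`, which relies on a symmetric
# action space so a policy output in [-1, 1] maps onto [low, high]. A small sketch (illustrative,
# not from the original code) of the same rescaling for a general, possibly asymmetric box:
import numpy as np


def rescale_action(action, low, high):
    """Map an action in [-1, 1] to the environment's [low, high] box."""
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)


if __name__ == '__main__':
    low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
    print(rescale_action(np.array([0.0, -1.0]), low, high))  # -> [0. 0.]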
def learn(seed, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.1, next_n=10, nslupdates=10, seq_len=10, ext_coef=1, int_coef=0.1, K=10): rng = np.random.RandomState(seed) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space loc_space = 2 ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches nbatch_sl_train = nenvs * seq_len // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, loc_space=loc_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nbatch_sl_train=nbatch_sl_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, seq_len=seq_len, seed=seed) model = make_model() replay_buffer = Buffer(max_size=1000, seed=seed) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, next_n=next_n, seq_len=seq_len, int_coef=int_coef, ext_coef=ext_coef, replay_buffer=replay_buffer, seed=seed) episode_raw_stats = EpisodeStats(nsteps, nenvs) episode_stats = EpisodeStats(nsteps, nenvs) tfirststart = time.time() nupdates = total_timesteps // nbatch sl_acc = 0 p = 0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 p = update * nbatch / (total_timesteps * 0.875) nbatch_train = nbatch // nminibatches tstart = time.time() obs, locs, goals, raw_rewards, rewards, returns, masks, rnn_masks, actions, values, neglogpacs, states = runner.run( K, p) episode_raw_stats.feed(raw_rewards, masks) episode_stats.feed(rewards, masks) mblossvals = [] assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): rng.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, locs, goals, returns, rnn_masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lr, cliprange, *slices, mbstates)) if nslupdates > 0 and sl_acc < 0.75: sl_acc, sl_loss = sl_train(model, replay_buffer, nslupdates=nslupdates, seq_len=seq_len, nenvs=nenvs, envsperbatch=envsperbatch, lr=lr) elif nslupdates > 0: sl_acc, sl_loss = sl_train(model, replay_buffer, nslupdates=1, seq_len=seq_len, nenvs=nenvs, envsperbatch=envsperbatch, lr=lr) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv('episode_raw_reward', episode_raw_stats.mean_reward()) logger.logkv('imitation_episode_reward', np.mean(runner.recent_imitation_rewards)) logger.logkv('episode_reward', episode_stats.mean_reward()) logger.logkv('episode_success_ratio', np.mean(runner.recent_success_ratio)) logger.logkv('time_elapsed', tnow - tfirststart) if nslupdates > 0: logger.logkv('sl_loss', sl_loss) logger.logkv('sl_acc', sl_acc) logger.logkv('replay_buffer_num', replay_buffer.num_episodes()) logger.logkv('replay_buffer_best', replay_buffer.max_reward()) if noptepochs > 0: for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() print(logger.get_dir()) env.close() return model
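# The recurrent minibatching in learn() above shuffles whole environments rather than individual
# timesteps, so each minibatch keeps every env's trajectory contiguous for the RNN state. A compact
# sketch of that indexing (illustrative; the array is a dummy stand-in laid out env-major, as the
# flatinds construction above assumes):
import numpy as np

nenvs, nsteps, nminibatches = 8, 4, 4
data = np.arange(nenvs * nsteps)                       # stand-in for obs/returns/... (env-major)
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
envsperbatch = nenvs // nminibatches

envinds = np.arange(nenvs)
np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
    mbenvinds = envinds[start:start + envsperbatch]    # which envs go in this minibatch
    mbflatinds = flatinds[mbenvinds].ravel()           # their timesteps, kept in order per env
    minibatch = data[mbflatinds]
    print(mbenvinds, minibatch)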
def train(self): next_v = 1e6 v = self.value_fun.get_values() itr = 0 videos = [] contours = [] returns = [] fig = None while not self._stop_condition(itr, next_v, v) and itr < self.max_itr: log = itr % self.log_itr == 0 render = (itr % self.render_itr == 0) and self.render if log: next_pi = self.get_next_policy() self.policy.update(next_pi) average_return, video = rollout(self.env, self.policy, render=render, num_rollouts=self.num_rollouts, iteration=itr) if render: contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr) contours += [contour] * len(video) videos += video returns.append(average_return) logger.logkv('Iteration', itr) logger.logkv('Average Returns', average_return) logger.dumpkvs() next_v = self.get_next_values() self.value_fun.update(next_v) itr += 1 next_pi = self.get_next_policy() self.policy.update(next_pi) contour, fig = plot_contour(self.env, self.value_fun, save=True, fig=fig, iteration=itr) average_return, video = rollout(self.env, self.policy, render=self.render, num_rollouts=self.num_rollouts, iteration=itr) plot_returns(returns) if self.render: videos += video contours += [contour] logger.logkv('Iteration', itr) logger.logkv('Average Returns', average_return) fps = int(4 / getattr(self.env, 'dt', 0.1)) if contours and contours[0] is not None: clip = mpy.ImageSequenceClip(contours, fps=fps) clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir()) if videos: clip = mpy.ImageSequenceClip(videos, fps=fps) clip.write_videofile('%s/roll_outs.mp4' % logger.get_dir()) plt.close()
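# The trainer above alternates value backups (get_next_values) with greedy policy updates. A tiny
# self-contained value-iteration example on a toy 2-state MDP (purely illustrative; the real
# env/value_fun/policy objects are defined elsewhere in this project):
import numpy as np

n_states, n_actions, gamma = 2, 2, 0.9
# P[s, a, s'] transition probabilities and R[s, a] rewards for a made-up MDP.
P = np.array([[[0.9, 0.1], [0.2, 0.8]],
              [[0.0, 1.0], [0.5, 0.5]]])
R = np.array([[1.0, 0.0],
              [0.0, 2.0]])

V = np.zeros(n_states)
for _ in range(200):
    Q = R + gamma * P.dot(V)      # Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] V[s']
    V_next = Q.max(axis=1)        # Bellman optimality backup
    if np.max(np.abs(V_next - V)) < 1e-8:
        break
    V = V_next
print('V* =', V, 'greedy policy =', Q.argmax(axis=1))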
b_actions = np.asarray(b_actions)
b_values = np.asarray(b_values, dtype=np.float32)
b_neglogps = np.asarray(b_neglogps, dtype=np.float32)
b_dones = np.asarray(b_dones, dtype=np.bool)
last_values = self.agent.get_value(self.obs, self.dones)

# Generalized advantage estimation (GAE) over the collected batch, walking backwards in time.
b_returns = np.zeros_like(b_rewards)
b_advs = np.zeros_like(b_rewards)
lastgaelam = 0
for t in reversed(range(batch_steps)):
    if t == batch_steps - 1:
        mask = 1.0 - self.dones
        nextvalues = last_values
    else:
        mask = 1.0 - b_dones[t + 1]
        nextvalues = b_values[t + 1]
    delta = b_rewards[t] + args.gamma * nextvalues * mask - b_values[t]
    b_advs[t] = lastgaelam = delta + args.gamma * args.lam * mask * lastgaelam
b_returns = b_advs + b_values
return (*map(sf01, (b_obs, b_returns, b_dones, b_actions, b_values, b_neglogps, b_advs, b_rewards)), ep_infos)


if __name__ == '__main__':
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    logger.configure()
    main()
    print('log path: %s' % logger.get_dir())
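# The return statement above flattens each (nsteps, nenvs, ...) rollout array with sf01 before
# minibatching. sf01 is defined elsewhere in this codebase; in similar PPO implementations it is
# the usual "swap and flatten" helper, roughly as follows (an assumption, shown only for context):
import numpy as np


def sf01(arr):
    """Swap the time and env axes, then merge them: (nsteps, nenvs, ...) -> (nsteps*nenvs, ...)."""
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])


if __name__ == '__main__':
    rewards = np.arange(6).reshape(3, 2)   # 3 steps, 2 envs
    print(sf01(rewards))                   # env-major order: [0 2 4 1 3 5]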
def learn(policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5,
          gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, restore_path=None, deterministic=False):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    time_str = time.strftime("%m_%d_%H_%M_%S")  # month_day_hour_minute_second
    board_logger = tensorboard_logging.Logger(os.path.join(logger.get_dir(), "tf_board", time_str))

    nenvs = 1  # nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                               nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                               vf_coef=vf_coef, max_grad_norm=max_grad_norm,
                               deterministic=deterministic)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if restore_path is not None:
        model.restore(restore_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    assert nupdates > 0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        ids, obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []

        # Do not train or log if in deterministic mode:
        if deterministic:
            continue

        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    mblossvals.append(model.train(lrnow, cliprangenow, ids[mbinds],
                                                  [obs[i] for i in mbinds], returns[mbinds],
                                                  masks[mbinds], actions[mbinds], values[mbinds],
                                                  neglogpacs[mbinds]))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                # np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    # index with the flattened minibatch indices; unlike the nonrecurrent branch,
                    # ids are not passed here.
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  [obs[i] for i in mbflatinds], returns[mbflatinds],
                                                  masks[mbflatinds], actions[mbflatinds],
                                                  values[mbflatinds], neglogpacs[mbflatinds],
                                                  mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            # logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
            board_logger.log_scalar("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]), update)
            board_logger.flush()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            if not os.path.isdir(checkdir):
                os.makedirs(checkdir)
            savepath = osp.join(checkdir,
                                "r" + "{:.2f}".format(safemean([epinfo['r'] for epinfo in epinfobuf])) + '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    print("Done with training. Exiting.")
    env.close()
    return model
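# learn() above accepts either a float or a callable schedule for lr and cliprange and anneals with
# frac = 1 - (update - 1) / nupdates. A minimal sketch (not necessarily the project's own
# definition of constfn) of the constant wrapper and a linear schedule that could be passed instead:
def constfn(val):
    """Wrap a constant so it can be called like a schedule: f(frac) -> val."""
    def f(_):
        return val
    return f


def linear_schedule(initial):
    """Linearly anneal from `initial` down to 0 as frac goes from 1.0 to 0.0."""
    def f(frac):
        return initial * frac
    return f


if __name__ == '__main__':
    lr = linear_schedule(3e-4)
    for update, nupdates in [(1, 10), (5, 10), (10, 10)]:
        frac = 1.0 - (update - 1.0) / nupdates
        print(update, lr(frac))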
def main(): MAX_EPISODES = 2000 LR_A = 0.0005 # learning rate for actor LR_C = 0.0005 # learning rate for critic GAMMA = 0.999 # reward discount REPLACE_ITER_A = 1700 REPLACE_ITER_C = 1500 MEMORY_CAPACITY = 200000 BATCH_SIZE = 32 DISPLAY_THRESHOLD = 400 # display until the running reward > 100 DATA_PATH = './data' LOAD_MODEL = False SAVE_MODEL_ITER = 100000 RENDER = False OUTPUT_GRAPH = False GLOBAL_STEP = tf.Variable(0, trainable=False) INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1)) LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True) LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True) END_POINT = (200 - 10) * (14 / 30) # from game ENV_NAME = 'BipedalWalker-v2' env = gym.make(ENV_NAME) env.seed(1) STATE_DIM = env.observation_space.shape[0] # 24 ACTION_DIM = env.action_space.shape[0] # 4 ACTION_BOUND = env.action_space.high # [1, 1, 1, 1] # all placeholder for tf with tf.name_scope('S'): S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') with tf.name_scope('R'): R = tf.placeholder(tf.float32, [None, 1], name='r') with tf.name_scope('S_'): S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') sess = tf.Session() # Create actor and critic. actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A, S, S_) critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_, S, S_, R, GLOBAL_STEP) actor.add_grad_to_graph(critic.a_grads, GLOBAL_STEP) M = Memory(MEMORY_CAPACITY) saver = tf.train.Saver(max_to_keep=100) if LOAD_MODEL: all_ckpt = tf.train.get_checkpoint_state( './data', 'checkpoint').all_model_checkpoint_paths saver.restore(sess, all_ckpt[-1]) else: if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH) os.mkdir(DATA_PATH) sess.run(tf.global_variables_initializer()) if OUTPUT_GRAPH: tf.summary.FileWriter('logs', graph=sess.graph) var = 3 # control exploration var_min = 0.01 logger.configure() print(logger.get_dir()) logger.info('start training') for i_episode in range(MAX_EPISODES): # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.) s = env.reset() ep_r = 0 while True: if RENDER: env.render() a = actor.choose_action(s) a = np.clip( np.random.normal(a, var), -1, 1) # add randomness to action selection for exploration s_, r, done, _ = env.step( a ) # r = total 300+ points up to the far end. If the robot falls, it gets -100. 
if r == -100: r = -2 ep_r += r transition = np.hstack((s, a, [r], s_)) max_p = np.max(M.tree.tree[-M.tree.capacity:]) M.store(max_p, transition) if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY / 20: var = max([var * 0.9999, var_min]) # decay the action randomness tree_idx, b_M, ISWeights = M.prio_sample( BATCH_SIZE) # for critic update b_s = b_M[:, :STATE_DIM] b_a = b_M[:, STATE_DIM:STATE_DIM + ACTION_DIM] b_r = b_M[:, -STATE_DIM - 1:-STATE_DIM] b_s_ = b_M[:, -STATE_DIM:] abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights) actor.learn(b_s) for i in range(len(tree_idx)): # update priority idx = tree_idx[i] M.update(idx, abs_td[i]) if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0: ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt') save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False) print("Save Model %s\n" % save_path) if done: if "running_r" not in globals(): running_r = ep_r else: running_r = 0.95 * running_r + 0.05 * ep_r # if running_r > DISPLAY_THRESHOLD: RENDER = True # else: RENDER = False # done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----' # print('Episode:', i_episode, # done, # '| Running_r: %i' % int(running_r), # '| Epi_r: %.2f' % ep_r, # '| Exploration: %.3f' % var, # '| Pos: %.i' % int(env.unwrapped.hull.position[0]), # '| LR_A: %.6f' % sess.run(LR_A), # '| LR_C: %.6f' % sess.run(LR_C), # ) logger.record_tabular('episode', i_episode) logger.record_tabular('ep_reward', ep_r) logger.record_tabular('pos', int(env.unwrapped.hull.position[0])) logger.dump_tabular() logger.info('') break s = s_ sess.run(INCREASE_GS)
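# main() above relies on a prioritized replay memory (Memory/SumTree, defined elsewhere): new
# transitions are stored with the current maximum priority, minibatches are sampled proportionally
# to priority with importance-sampling weights, and priorities are refreshed with the critic's
# |TD error|. A small O(N) sketch of the same idea (illustrative only; the real implementation
# uses a sum tree for efficient sampling):
import numpy as np


class SimplePrioritizedReplay:
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity, self.alpha, self.beta, self.eps = capacity, alpha, beta, eps
        self.data, self.priorities, self.pos = [], [], 0

    def store(self, transition, priority=None):
        # New samples get the current max priority so they are seen at least once.
        p = max(self.priorities, default=1.0) if priority is None else priority
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(p)
        else:
            self.data[self.pos], self.priorities[self.pos] = transition, p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        p = np.array(self.priorities) ** self.alpha
        probs = p / p.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        weights = (len(self.data) * probs[idx]) ** (-self.beta)   # importance-sampling weights
        weights /= weights.max()
        return idx, [self.data[i] for i in idx], weights

    def update(self, idx, abs_td):
        for i, e in zip(idx, abs_td):
            self.priorities[i] = float(e) + self.eps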
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
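# agent.update_target_net() in the training loop above performs the DDPG soft target update with
# rate tau. A minimal numpy sketch of that Polyak averaging step (illustrative; the real update
# runs inside the DDPG agent's TensorFlow graph):
import numpy as np


def soft_update(target_params, source_params, tau=0.01):
    """target <- (1 - tau) * target + tau * source, parameter by parameter."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]


if __name__ == '__main__':
    target = [np.zeros(3)]
    source = [np.ones(3)]
    for _ in range(5):
        target = soft_update(target, source, tau=0.5)
    print(target[0])  # moves steadily toward the source parameters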
def testing( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, # **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] # nb_actions = 2*env.grid_size nb_actions = env.grid_size action_shape = np.array(nb_actions * [0]).shape nb_features = (4 + 1) * env.grid_size observation_shape = np.array(nb_features * [0]).shape grid_x = env.grid_x grid_y = env.grid_y x = [] y = [] for i in range(grid_x): x.append(i + 1) for i in range(grid_y): y.append(i + 1) # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
# agent.initialize(sess) # sess.graph.finalize() agent.load(sess, save_path) agent.reset() obs, env_state = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] average_reward = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_state = [] epoch_episodes = 0 #record the car numbers in each step car_num_set = {} t_set = [i for i in range(total_timesteps)] for xx in x: for yy in y: lab = str(xx) + str(yy) car_num_set[lab] = [[0 for i in range(total_timesteps)] for j in range(4)] for epoch in range(nb_epochs): obs, env_state = env.reset() epoch_actions = [] epoch_state = [] average_car_num_set = [] last_action = 1 for cycle in range(nb_epoch_cycles): # Perform rollouts. action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) '''random action''' # if np.random.rand()>0.5: # action=[1] # else: # action=[0] '''cycle light state''' # action=[0] '''cycle action (should cycle state instead of action)''' # if last_action==1: # action=[0] # else: # action=[1] # last_action=action[0] if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): new_obs, r, env_state, done = env.step(action, env_state) epoch_state.append(env_state['11'].light_state) for xx in x: for yy in y: lab = str(xx) + str(yy) for i in range(4): car_num_set[lab][i][t] = ( env_state['11'].car_nums[i]) t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: print('done') # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() epoch_episode_rewards.append(episode_reward) average_reward.append(episode_reward / nb_rollout_steps) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] # for t_train in range(nb_train_steps): # # Adapt param noise, if necessary. # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # # print('Train!') # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 step_set.append(t) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) # plt.figure(figsize=(8,5)) '''plot rewards-steps''' ax1 = plt.subplot(2, 1, 1) plt.sca(ax1) plt.plot(step_set, average_reward, color='b') # plt.xlabel('Steps') plt.ylabel('Mean Reward', fontsize=12) # plt.ylim(-15000,0) '''plot queueing car numbers-steps''' ax2 = plt.subplot(2, 1, 2) plt.sca(ax2) print(np.shape(t_set), np.shape(car_num_set['11'][i])) for i in range(4): if i == 0: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b') elif i == 1: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='orange') elif i == 2: plt.plot(t_set, car_num_set['11'][i], label=i, color='g') else: plt.plot(t_set, car_num_set['11'][i], label=i, color='r') plt.ylim(0, 100) #sum among roads sum_car_num = np.sum(car_num_set['11'], axis=0) #average among time steps average_car_num = np.average(sum_car_num) average_car_num_set.append(average_car_num) plt.xlabel('Steps', fontsize=12) plt.ylabel('Cars Numbers', fontsize=12) # set legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) leg = plt.legend(by_label.values(), by_label.keys(), loc=1) # leg = plt.legend(loc=4) legfm = leg.get_frame() legfm.set_edgecolor('black') # set legend fame color legfm.set_linewidth(0.5) # set legend fame linewidth plt.savefig('ddpg_mean_test.pdf') plt.show() print(epoch_state) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('average queueing car numbers: ', np.average(average_car_num_set)) return agent
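# The plotting code in testing() de-duplicates legend entries via OrderedDict(zip(labels, handles)),
# which keeps a single handle per label even though the same labels are re-plotted every epoch.
# A standalone sketch of that trick (illustrative; figure contents are dummies):
from collections import OrderedDict
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for epoch in range(3):                              # the same labels are plotted each epoch
    ax.plot([0, 1], [epoch, epoch + 1], color='b', label='road 0')
    ax.plot([0, 1], [epoch + 1, epoch], color='r', label='road 1')
handles, labels = ax.get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))        # one handle per unique label
ax.legend(by_label.values(), by_label.keys())
plt.savefig('legend_dedup_example.png')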