class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show): self.env_name = env_name self.env = gym.make(env_name) if env_name == "FetchReach-v0": self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.episodes = 20 self.killer = GracefulKiller() # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount, # lamb=lamb) self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20) # using MC return would be more helpful self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1) if not show: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('observation dimension:', self.obs_dim) print('action dimension:', self.act_dim) # Use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): print('fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) # print(observation_samples.shape) self.scaler.update(observation_samples) def normalize_obs(self, obs): scale, offset = self.scaler.get() obs_scaled = (obs-offset)*scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect data only :param save: :param train_policy: :param train_value_func: :param animate: :return: """ obs = self.env.reset() observes, actions, rewards = [],[],[] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature observes.append(obs) action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) actions.append(action) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array(rewards) def discounted_sum(self, l, factor): discounted = [] sum = 0 for i in reversed(l): discounted.append(factor*sum+i) sum = factor*sum+i return np.array(list(reversed(discounted))) def run_policy(self, episodes): trajectories = [] for e in range(episodes): 
observes, actions, rewards = self.run_one_episode() trajectory = {'observes': observes, 'actions': actions, 'rewards': rewards} # scale rewards if self.discount < 0.999: rewards = rewards*(1-self.discount) trajectory['values'] = self.value_func.predict(observes) trajectory['mc_return'] = self.discounted_sum(rewards, self.discount) trajectory['td_residual'] = rewards + self.discount*np.append(trajectory['values'][1:],0) - trajectory['values'] trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount*self.lamb) trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) i += len(trajectories) observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) mc_returns = np.concatenate([t['mc_return'] for t in trajectories]) advantages = np.concatenate([t['td_residual'] for t in trajectories]) # advantages = np.concatenate([t['gae'] for t in trajectories]) # normalize advantage estimates advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) value_func_loss = self.value_func.update(observes, mc_returns) policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages) avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average([len(t['rewards']) for t in trajectories]) log = {} # compute statistics such as mean and std log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['policy_loss'] = policy_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards'])) for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped manually if self.killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y': break self.killer.kill_now = False # if (i+1)%20 == 0: # print('episode: ', i+1) # print('average steps', np.average(steps)) # print('average rewards', np.average(rewards)) self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12,9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/policy.pl') self.value_func.load(load_from + 'value_func/value_func.pl') def demonstrate_agent(self, load_from): self.load_model(load_from) with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) self.animate = True for i in range(10): observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
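# --- Illustrative sketch (not part of the class above) ----------------------
# A minimal, self-contained check of what Experiment.discounted_sum() and the
# GAE computation in run_policy() produce on a toy trajectory. All numbers
# below (toy_rewards, toy_values, gamma, lamb) are made up for illustration.
import numpy as np


def discounted_sum(l, factor):
    # same backward-accumulation logic as Experiment.discounted_sum()
    discounted = []
    running = 0.0
    for x in reversed(l):
        running = factor * running + x
        discounted.append(running)
    return np.array(list(reversed(discounted)))


gamma, lamb = 0.99, 0.95
toy_rewards = np.array([1.0, 0.0, 2.0])
toy_values = np.array([0.5, 0.4, 0.3])
# TD residuals with V(s_{T+1}) = 0 appended, as in run_policy()
td_residual = toy_rewards + gamma * np.append(toy_values[1:], 0) - toy_values
gae = discounted_sum(td_residual, gamma * lamb)
mc_return = discounted_sum(toy_rewards, gamma)
print('MC returns:', mc_return)   # [1.0 + 0.99*0 + 0.99**2*2, ...]
print('GAE:      ', gae)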
class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs): self.env_name = env_name self.env = gym.make(env_name) if env_name.startswith('Fetch'): # FetchReach env is a little bit different self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) # for reproducibility self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.buffer = Buffer(1000000, self.obs_dim, self.act_dim) # 1000000 is the size they have used in paper self.episodes = 20 # larger episodes can reduce variance self.killer = GracefulKiller() self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20) self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH) self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) if 'show' in kwargs and not kwargs['show']: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('Observation dimension:', self.obs_dim) print('Action dimension:', self.act_dim) # The use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): """ Collection observations from 5 episodes to initialize Scaler. :return: a properly initialized scaler """ print('Fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) if self.env_name.startswith('Fetch'): obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) self.scaler.update(observation_samples) def normalize_obs(self, obs): """ Transform and update the scaler on the fly. 
:param obs: Raw observation :return: normalized observation """ scale, offset = self.scaler.get() obs_scaled = (obs-offset)*scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect a trajectory of (obs, act, reward, obs_next) """ obs = self.env.reset() observes, actions, rewards = [],[],[] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature at normalized observation observes.append(obs) action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) actions.append(action) if self.env_name.startswith('Fetch'): obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array(rewards) def discounted_sum(self, l, factor): """ Discounted sum of return or advantage estimates along a trajectory. :param l: a list containing the values of discounted summed interest. :param factor: discount factor in the disc_sum case or discount*lambda for GAE :return: discounted sum of l with regard to factor """ discounted = [] sum = 0 for i in reversed(l): discounted.append(factor*sum+i) sum = factor*sum+i return np.array(list(reversed(discounted))) def run_policy(self, episodes): """ Gather a batch of trajectory samples. :param episodes: size of batch. :return: a batch of samples """ trajectories = [] for e in range(episodes): observes, actions, rewards = self.run_one_episode() trajectory = {'observes': observes, 'actions': actions, 'rewards': rewards, 'scaled_rewards': rewards*(1-self.discount)} trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) # add to experience replay buffer self.buffer.append(trajectories) print('buffer size:', self.buffer.size()) i += len(trajectories) # for E=20, T=50, the total number of samples would be 1000 # In future needs to account for not uniform time steps per episode. # e.g. in Hopper-v2 environment not every episode has same time steps # E = len(trajectories) # num_samples = np.sum([len(t['rewards']) for t in trajectories]) gradient_steps = np.sum([len(t['rewards']) for t in trajectories]) if self.env_name.startswith('Fetch'): assert (gradient_steps == 20*50) """train critic""" # train all samples in the buffer, to the extreme # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size()) # train some samples minibatches only critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps) """calculation of episodic discounted return only needs rewards""" mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories]) """using current batch of samples to update baseline""" observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) value_func_loss = self.value_func.update(observes, mc_returns) """compute GAE""" for t in trajectories: t['values'] = self.value_func.predict(t['observes']) # IS it really legitimate to insert 0 at the last obs? 
t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values'] t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb) advantages = np.concatenate([t['gae'] for t in trajectories]) """normalize advantage estimates, Crucial step""" advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) """compute control variate""""" cv = self.critic.get_contorl_variate(self.policy, observes, actions) # cv must not be centered # cv = (cv - cv.mean()) / (cv.std() + 1e-6) """conservative control variate""" eta = [1 if i > 0 else 0 for i in advantages*cv] """center learning signal""" # check that advantages and CV should be of size E*T # eta controls the on-off of control variate learning_signal = advantages - eta*cv # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6) """controlled taylor eval term""" ctrl_taylor = np.concatenate([ [eta[i]*act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))]) """policy update""" ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor) avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average([len(t['rewards']) for t in trajectories]) log = {} # save training statistics log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['critic_loss'] = critic_loss_mean log['policy_ppo_loss'] = ppo_loss log['policy_ddpg_loss'] = ddpg_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards'])) for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped early if self.killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break self.killer.kill_now = False self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12,9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): """ Load all Function Approximators plus a Scaler. Replaybuffer is not restored though. :param load_from: Dir containing saved weights. 
""" from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/') self.value_func.load(load_from + 'value_func/') self.critic.load(load_from+'critic/') with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) def demonstrate_agent(self, load_from): """ Simply run the policy without training. :param load_from: :return: """ self.load_model(load_from) while True: observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    if mpi_util.rank == 0:
        now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
        aigym_path = os.path.join('/tmp', env_name, now)
        env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    policy = Policy(obs_dim, act_dim, kl_targ)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)

    if mpi_util.rank == 0:
        # run a few episodes (on node 0) of untrained policy to initialize scaler:
        trajectories = run_policy(env, policy, scaler, episodes=5)
        unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
        scaler.update(unscaled)  # update running statistics for scaling observations

    # broadcast policy weights, scaler, val_func
    (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:", mpi_util.nworkers)
        exit(1)

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])
            episode += len(trajectories)
            add_value(trajectories, val_func)      # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)      # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            # add various stats to training log:
            logger.log({
                '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps': np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
            policy.update(observes, actions, advantages, logger)  # update policy
            val_func.fit(observes, disc_sum_rew, logger)          # update value function
            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)     # update running statistics for scaling observations
            logger.write(display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    if mpi_util.rank == 0:
        logger.close()
    policy.close_sess()
    if mpi_util.rank == 0:
        val_func.close_sess()
class Discriminator(object):
    def __init__(self, obs_dim, act_dim, ent_reg_weight, epochs, input_type,
                 loss_type, logger):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.input_type = input_type
        self.loss_type = loss_type
        if self.input_type == 'states_actions':
            self.input_dim = obs_dim + act_dim
        elif self.input_type == 'states':
            self.input_dim = obs_dim
        self.epochs = epochs
        # only states are normalized for now
        self.scaler = Scaler(self.obs_dim)
        # learning-rate multiplier (the base lr is set in _nn_disc)
        self.lr_mult = 1.0
        self.ent_reg_weight = ent_reg_weight
        self.logger = logger
        # build graph
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._nn_disc()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()
        # session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.49,
                                    allow_growth=True)
        self.sess = tf.Session(graph=self.g,
                               config=tf.ConfigProto(gpu_options=gpu_options,
                                                     allow_soft_placement=True))
        self.sess.run(self.init)

    def _placeholders(self):
        self.input_ph = tf.placeholder(tf.float32, (None, self.input_dim), name='inputs')
        self.labels_ph = tf.placeholder(tf.float32, (None,), name='labels')
        self.weights_ph = tf.placeholder(tf.float32, (None,), name='weights')
        self.lr_ph = tf.placeholder(tf.float32, (), name='learning_rate')

    def _nn_disc(self):
        hid1_size = 300
        hid2_size = 200
        self.lr = 1e-4
        '''
        hid1_size = self.obs_dim * 10
        hid3_size = self.act_dim * 10
        hid2_size = int(np.sqrt(hid1_size * hid3_size))
        self.lr = 9e-4 / np.sqrt(hid2_size)
        '''
        out = tf.layers.dense(self.input_ph, hid1_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / self.obs_dim)), name="h1")
        out = tf.layers.dense(out, hid2_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / hid1_size)), name="h2")
        '''
        out = tf.layers.dense(out, hid3_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / hid2_size)), name="h3")
        '''
        scores = tf.layers.dense(out, 1, tf.identity,
                                 kernel_initializer=tf.random_normal_initializer(
                                     stddev=np.sqrt(1 / hid2_size)), name="scores")
        self.scores = tf.squeeze(scores)
        # rewards could be clipped
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(self.scores))

    def _loss_train_op(self):
        if self.loss_type == 'pure_gail':
            cross_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self.scores, labels=self.labels_ph)
            # this extra entropy penalty is NOT included in the paper;
            # it is taken from the example provided by the authors
            # (the paper instead has an entropy term in the TRPO update)
            ent_loss = (1.0 - tf.nn.sigmoid(self.scores)) * self.scores + \
                tf.nn.softplus(-self.scores)
            self.loss = tf.reduce_mean(
                (cross_loss - self.ent_reg_weight * ent_loss) * self.weights_ph)
            train_op = tf.train.AdamOptimizer(learning_rate=self.lr_ph)
            self.train_min = train_op.minimize(self.loss)
        elif self.loss_type == 'wasserstein':
            # note: labels * scores + (1 - labels) * scores simplifies to scores,
            # so this is a weighted mean of the raw scores
            self.loss = tf.reduce_mean(
                (self.labels_ph * self.scores +
                 (1.0 - self.labels_ph) * self.scores) * self.weights_ph)
            train_op = tf.train.AdamOptimizer(learning_rate=self.lr_ph)
            self.train_min = train_op.minimize(self.loss)

    def normalize_input(self, inpt):
        # update running statistics, then normalize
        self.scaler.update(inpt)
        scale, offset = self.scaler.get()  # note: get() returns (scale, offset)
        inpt = (inpt - offset) * scale
        return inpt

    def get_rewards(self, gen_obs, gen_acts=None):
        # normalize observations with the current scaler statistics
        scale, offset = self.scaler.get()
        gen_obs = (gen_obs - offset) * scale
        gen_input = gen_obs
        if self.input_type == 'states_actions':
            gen_input = np.concatenate([gen_obs, gen_acts], axis=1)
        return self.sess.run(self.reward_op, feed_dict={self.input_ph: gen_input})

    def update(self, exp_obs, gen_obs):
        # shuffle generator observations and actions
        gen_obs = shuffle(gen_obs)
        obs = np.concatenate([gen_obs, exp_obs], axis=0)
        obs = self.normalize_input(obs)
        # number of generator/expert examples
        gen_num = gen_obs.shape[0]
        exp_num = exp_obs.shape[0]
        # create labels: generator samples are 0 (fake), expert samples are 1 (real)
        labels = np.zeros((gen_num + exp_num))
        labels[gen_num:] = 1.0
        # per-sample loss weights
        weights = np.zeros((gen_num + exp_num))
        weights[:gen_num] = gen_num / (gen_num + exp_num)
        weights[gen_num:] = exp_num / (gen_num + exp_num)
        for i in range(self.epochs):
            inpt, labels, weights = shuffle(obs, labels, weights)
            bobs = np.array_split(inpt, self.epochs, axis=0)
            blabs = np.array_split(labels, self.epochs)
            bweg = np.array_split(weights, self.epochs)
            for j in range(self.epochs):
                # index minibatches with the inner loop variable j
                # (indexing with i trained the same minibatch repeatedly)
                loss, _ = self.sess.run(
                    [self.loss, self.train_min],
                    feed_dict={
                        self.input_ph: bobs[j],
                        self.labels_ph: blabs[j],
                        self.weights_ph: bweg[j],
                        self.lr_ph: self.lr * self.lr_mult
                    })
        # evaluate the discriminator
        scores = self.sess.run(self.scores, feed_dict={self.input_ph: obs})

        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        gen_corr = np.sum((sigmoid(scores[:gen_num]) < 0.5))
        exp_corr = np.sum((sigmoid(scores[gen_num:]) > 0.5))
        gen_acc = gen_corr / gen_num
        exp_acc = exp_corr / exp_num
        total_acc = (gen_corr + exp_corr) / (gen_num + exp_num)
        # log necessary info
        # self.logger.log('gen_acc', gen_acc)
        # self.logger.log('exp_acc', exp_acc)
        # self.logger.log('total_acc', total_acc)
        return gen_acc, exp_acc, total_acc

    def close_session(self):
        self.sess.close()
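# --- Illustrative sketch (not part of the class above) ----------------------
# Quick plain-numpy check of the GAIL-style reward defined by reward_op:
# r = -log(1 - sigmoid(score)). Higher discriminator scores (more
# "expert-like") yield larger rewards. Scores below are toy values.
import numpy as np


def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


toy_scores = np.array([-2.0, 0.0, 2.0])
toy_rewards = -np.log(1.0 - _sigmoid(toy_scores))
print(toy_rewards)  # approximately [0.127, 0.693, 2.127]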
class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show): self.env_name = env_name self.env = gym.make(env_name) if env_name == "FetchReach-v0": self.env = gym.wrappers.FlattenDictWrapper( self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) self.obs_dim = self.env.observation_space.shape[ 0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.buffer = Buffer(50000, self.obs_dim, self.act_dim) self.episodes = 20 self.killer = GracefulKiller() self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=5) self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH) # using MC return would be more helpful self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) if not show: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('observation dimension:', self.obs_dim) print('action dimension:', self.act_dim) # Use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): """ 5 episodes empirically determined. :return: """ print('Fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step( action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) self.scaler.update(observation_samples) def normalize_obs(self, obs): """ transform and update on the fly. 
:param obs: :return: """ scale, offset = self.scaler.get() obs_scaled = (obs - offset) * scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect data only :param save: :param train_policy: :param train_value_func: :param animate: :return: """ obs = self.env.reset() observes, actions, rewards = [], [], [] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature observes.append(obs) action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) actions.append(action) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array( rewards) def discounted_sum(self, l, factor): discounted = [] sum = 0 for i in reversed(l): discounted.append(factor * sum + i) sum = factor * sum + i return np.array(list(reversed(discounted))) def run_policy(self, episodes): """ gather a batch of samples. :param episodes: :return: """ trajectories = [] for e in range(episodes): observes, actions, rewards = self.run_one_episode() trajectory = { 'observes': observes, 'actions': actions, 'rewards': rewards } trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) # add to experience replay buffer self.buffer.append(trajectories) i += len(trajectories) # for E=20, T=50, the total number of samples would be 1000 # In future needs to account for not uniform time steps per episode. # e.g. in Hopper-v2 environment not every episode has same time steps E = len(trajectories) T = trajectories[0]['observes'].shape[0] """train critic""" self.critic.fit( self.policy, self.buffer, epochs=1, num_samples=E * T) # take E*T samples, so in total E*T gradient steps """calculation of episodic discounted return only needs rewards""" mc_returns = np.concatenate([ self.discounted_sum(t['rewards'], self.discount) for t in trajectories ]) """using current batch of samples to update baseline""" observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) value_func_loss = self.value_func.update(observes, mc_returns) """compute GAE""" for t in trajectories: t['values'] = self.value_func.predict(t['observes']) # IS it really legitimate to insert 0 at the last obs? 
t['td_residual'] = t['rewards'] + self.discount * np.append( t['values'][1:], 0) - t['values'] t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb) advantages = np.concatenate([t['gae'] for t in trajectories]) """compute control variate""" "" cv = self.critic.get_contorl_variate(self.policy, observes, actions) """conservative control variate""" eta = [1 if i > 0 else 0 for i in advantages * cv] """center learning signal""" # check that advantages and CV should be of size E*T # eta controls the on-off of control variate learning_signal = advantages - eta * cv """controlled taylor eval term""" ctrl_taylor = np.concatenate( [[eta[i] * act] for i, act in enumerate( self.critic.get_taylor_eval(self.policy, observes))]) policy_loss, kl, entropy, beta = self.policy.update( observes, actions, learning_signal, ctrl_taylor) # normalize advantage estimates # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) avg_rewards = np.sum( np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average( [len(t['rewards']) for t in trajectories]) log = {} # compute statistics such as mean and std log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['policy_loss'] = policy_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format( log['steps'], log['rewards'])) for key in [ 'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss' ]: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped manually if self.killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y': break self.killer.kill_now = False # if (i+1)%20 == 0: # print('episode: ', i+1) # print('average steps', np.average(steps)) # print('average rewards', np.average(rewards)) self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12, 9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/policy.pl') self.value_func.load(load_from + 'value_func/value_func.pl') def demonstrate_agent(self, load_from): self.load_model(load_from) with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) self.animate = True for i in range(10): observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format( ep_steps, ep_rewards))
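# --- Illustrative sketch (not part of the class above) ----------------------
# Numeric illustration of the conservative Q-Prop learning signal computed in
# run_expr() above: eta switches the control variate on only where it agrees
# in sign with the advantage estimate. The arrays are toy values, not outputs
# of any environment or critic.
import numpy as np

advantages = np.array([0.8, -0.5, 0.3, -0.1])
cv = np.array([0.6, 0.4, -0.2, -0.3])           # critic-based control variate
eta = (advantages * cv > 0).astype(np.float64)  # conservative gating, as in run_expr()
learning_signal = advantages - eta * cv
print(eta)              # [1. 0. 0. 1.]
print(learning_signal)  # [ 0.2 -0.5  0.3  0.2]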
class Agent:
    def __init__(self, env):
        self.env = env
        self.sess = None
        self._config_initialize()

    def _config_initialize(self):
        """initialize env config"""
        Config.data.action_dim = self.env.action_space.shape[0]
        Config.data.action_bound = self.env.action_space.high
        Config.data.state_dim = self.env.observation_space.shape[0]
        self.scaler = Scaler(Config.data.state_dim)

    def model_fn(self, mode, features, labels):
        self.mode = mode
        self.loss, self.train_op, self.training_hooks, self.evaluation_hooks = None, None, None, None
        self._build_graph()
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=self.loss,
                                          train_op=self.train_op,
                                          training_hooks=self.training_hooks,
                                          evaluation_hooks=self.evaluation_hooks)

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()
        if self.mode == tf.estimator.ModeKeys.TRAIN:
            ave_ep_reward = tf.placeholder(tf.float32, name='ave_ep_reward')
            tf.summary.scalar('ave_ep_reward', ave_ep_reward)
            self.loss = ave_ep_reward
            global_step = tf.train.get_global_step()
            self.train_op = tf.assign_add(global_step, 1)
            self.training_hooks = [TrainingHook(self)]
        else:
            self.loss = tf.constant(1)
            self.evaluation_hooks = [EvalHook(self)]

    def init_scaler(self, init_episode=5):
        for e in range(init_episode):
            self.run_episode()

    def choose_action(self, observation):
        feed_dict = {self.actor.states: [observation]}
        action = self.sess.run(self.actor.sample, feed_dict)
        return action

    def eval(self, animate=False):
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation
            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        return ep_reward

    def run_episode(self, animate=False):
        observation = self.env.reset()
        states, actions, rewards, unscaled_states = [], [], [], []
        done = False
        count = 0
        while not done:
            if animate:
                self.env.render()
            unscaled_states.append(observation)
            observation = self.scaler.normalize(observation)
            states.append(observation)
            action = self.choose_action(observation)
            actions.append(action)
            next_observation, reward, done, info = self.env.step(action)
            rewards.append(reward)
            observation = next_observation
            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        self.scaler.update(np.array(unscaled_states))
        return states, actions, rewards, next_observation, done

    def cal_target_v(self, done, next_observation, rewards):
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value,
                {self.critic.states: [self.scaler.normalize(next_observation)]})[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v

    def update_actor(self, states, actions, target_v):
        feed_dict = {self.critic.states: states, self.actor.states: states}
        value, old_mean, old_stdd = self.sess.run(
            [self.critic.value, self.actor.mean, self.actor.stdd], feed_dict)
        advantage = np.array(target_v) - value
        feed_dict = {
            self.actor.states: states,
            self.actor.advantage: advantage,
            self.actor.old_mean: old_mean,
            self.actor.old_stdd: old_stdd,
            self.actor.actions: actions
        }
        if Config.train.surrogate_clip:
            for e in range(Config.train.actor_train_episode):
                self.sess.run(self.actor.train_op, feed_dict)
        else:
            for e in range(Config.train.actor_train_episode):
                _, kl = self.sess.run([self.actor.train_op, self.actor.kl],
                                      feed_dict)
                if kl > Config.train.kl_target * 4:
                    break
            if kl > Config.train.kl_target * Config.train.kl_target_beta:
                Config.train.kl_loss_lam *= Config.train.kl_lam_alpha
            elif kl < Config.train.kl_target / Config.train.kl_target_beta:
                Config.train.kl_loss_lam /= Config.train.kl_lam_alpha

    def update_critic(self, states, target_v):
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v
        }
        for e in range(Config.train.critic_train_episode):
            self.sess.run(self.critic.train_op, feed_dict)
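# --- Illustrative sketch (not part of the class above) ----------------------
# Standalone version of the adaptive KL-penalty schedule used at the end of
# update_actor(): if the measured KL exceeds target * beta the penalty
# coefficient grows, and if it falls below target / beta it shrinks. The
# default constants here are illustrative, not read from Config.


def adapt_kl_coefficient(kl, lam, kl_target=0.01, beta=2.0, alpha=1.5):
    """Return the updated KL-penalty coefficient."""
    if kl > kl_target * beta:
        lam *= alpha   # policy moved too far -> penalize KL more
    elif kl < kl_target / beta:
        lam /= alpha   # policy barely moved -> relax the penalty
    return lam


print(adapt_kl_coefficient(kl=0.05, lam=1.0))    # 1.5
print(adapt_kl_coefficient(kl=0.001, lam=1.0))   # ~0.667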
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs, policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate, submit): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ # killer = GracefulKiller() env, obs_dim, act_dim = init_osim(animate) env.seed(111 + mpi_util.rank) mpi_util.set_global_seeds(111 + mpi_util.rank) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories if mpi_util.rank == 0: #aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) logger = Logger(logname=env_name, now=now) episode = 0 checkpoint = Checkpoint("saves", now) # restore from checkpoint? if restore_path: (policy, val_func, scaler, episode, obs_dim, act_dim, kl_targ) = checkpoint.restore(restore_path) else: policy = Policy(obs_dim, act_dim, kl_targ) val_func = NNValueFunction(obs_dim) scaler = Scaler(obs_dim) if mpi_util.rank == 0: # run a few episodes (on node 0) of untrained policy to initialize scaler: trajectories = run_policy(env, policy, scaler, episodes=5) unscaled = np.concatenate( [t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val( policy, scaler, val_func) if mpi_util.rank == 0: checkpoint.save(policy, val_func, scaler, episode) if animate: observes, actions, rewards, unscaled_obs = run_episode(env, policy, scaler, animate=animate) exit(0) if submit: # Settings #remote_base = 'http://grader.crowdai.org:1729' remote_base = 'http://grader.crowdai.org:1730' token = 'a83412a94593cae3a491f3ee28ff44e1' client = Client(remote_base) # Create environment observation = client.env_create(token) step = 0.0 observes, actions, rewards, unscaled_obs = [], [], [], [] scale, offset = scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature # Run a single step # # The grader runs 3 simulations of at most 1000 steps each. 
We stop after the last one while True: obs = np.array(observation).astype(np.float32).reshape((1, -1)) print("OBSERVATION TYPE:", type(obs), obs.shape) print(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature unscaled_obs.append(obs) obs = (obs - offset) * scale # center and scale observations observes.append(obs) action = policy.sample(obs).astype(np.float32).reshape((-1, 1)) print("ACTION TYPE:", type(action), action.shape) print(action) actions.append(action) [observation, reward, done, info] = client.env_step(action.tolist()) print("step:", step, "reward:", reward) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) step += 1e-3 # increment time step feature if done: print( "================================== RESTARTING =================================" ) observation = client.env_reset() step = 0.0 observes, actions, rewards, unscaled_obs = [], [], [], [] scale, offset = scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature if not observation: break client.submit() exit(0) ###### worker_batch_size = int(batch_size / mpi_util.nworkers) # HACK if (worker_batch_size * mpi_util.nworkers != batch_size): print("batch_size:", batch_size, " is not divisible by nworkers:", mpi_util.nworkers) exit(1) batch = 0 while episode < num_episodes: if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0: checkpoint.save(policy, val_func, scaler, episode) batch = batch + 1 trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size) trajectories = mpi_util.gather_trajectories(trajectories) if mpi_util.rank == 0: # concatentate trajectories into one list trajectories = list(itertools.chain.from_iterable(trajectories)) print("did a batch of ", len(trajectories), " trajectories") print([t['rewards'].sum() for t in trajectories]) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: logger.log({ '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]), 'Steps': np.sum([t['observes'].shape[0] for t in trajectories]) }) log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function unscaled = np.concatenate( [t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations logger.write( display=True) # write logger results to file and stdout # if mpi_util.rank == 0 and killer.kill_now: # if input('Terminate training (y/[n])? ') == 'y': # break # killer.kill_now = False # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val( policy, scaler, val_func) if mpi_util.rank == 0: logger.close() policy.close_sess() if mpi_util.rank == 0: val_func.close_sess()
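# --- Illustrative sketch (not part of the function above) -------------------
# The submission loop above forces scale[-1] = 1.0 and offset[-1] = 0.0 so
# that the appended time-step feature passes through the scaler unchanged.
# The numbers below are toy values standing in for scaler.get() output.
import numpy as np

scale = np.array([0.5, 2.0, 1.7])
offset = np.array([1.0, -3.0, 0.4])
scale[-1], offset[-1] = 1.0, 0.0      # don't scale/offset the time-step feature

obs = np.array([[2.0, -1.0, 0.003]])  # last column is the time step
print((obs - offset) * scale)         # [[0.5 4.0 0.003]] -- time step untouched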
class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target): self.env = gym.make(env_name) gym.spaces.seed(1234) self.obs_dim = self.env.observation_space.shape[0] + 1 # the use of time steps is beneficial self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.killer = GracefulKiller() self.policy = LinearPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount) self.value_func = LinearValueFunc(self.obs_dim, discount=discount) # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('observation dimension:', self.obs_dim) print('action dimension:', self.act_dim) self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): print('fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) # print(observation_samples.shape) self.scaler.update(observation_samples) def normalize_obs(self, obs): scale, offset = self.scaler.get() obs_scaled = (obs-offset)*scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_epsisode(self, train_policy=True, train_value_func=True, animate=False): obs = self.env.reset() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[0]], axis=1) # add time step feature log = { 'rewards': [], 'policy_loss': [], 'value_func_loss': [], 'entropy': [], 'beta': [], 'kl': [], 'advantage':[] } done = False step = 0 while not done: if animate: self.env.render() action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) step += 1e-3 # print(action) obs_new, reward, done, _ = self.env.step(action) obs_new = obs_new.astype(np.float64).reshape((1, -1)) obs_new = self.normalize_obs(obs_new) obs_new = np.append(obs_new, [[step]], axis=1) # add time step feature if not isinstance(reward, float): reward = np.asscalar(reward) log['rewards'].append(reward) # scale reward if self.discount < 0.999: reward *= (1-self.discount) # TD residual advantage = reward + self.discount * self.value_func.predict(obs_new) - self.value_func.predict(obs) advantage = advantage.astype(np.float64).reshape((1,)) if train_value_func: value_func_loss = self.value_func.update(obs, advantage) if train_policy: policy_loss, kl, entropy, beta = self.policy.update(obs, action, advantage) if train_value_func and train_policy: log['policy_loss'].append(policy_loss) log['kl'].append(kl) log['entropy'].append(entropy) log['beta'].append(beta) log['value_func_loss'].append(value_func_loss) log['advantage'].append(advantage) obs = obs_new return log def run_expr(self): ep_steps = [] ep_rewards = [] for i in range(self.num_iterations): # trace vectors are emptied at the beginning of each episode # get more 
accurate value_func estimator for _ in range(5): self.value_func.init_trace() self.run_one_epsisode(train_value_func=True, train_policy=False, animate=False) self.policy.init_trace() self.value_func.init_trace() # run (and train) one trajectory log = self.run_one_epsisode(animate=self.animate) # compute statistics such as mean and std log['steps'] = len(log['rewards']) log['rewards'] = np.sum(log['rewards']) for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']: log[key + '_mean'] = np.mean(log[key]) log[key + '_std'] = np.std(log[key]) del log[key] # display print('episode: ', i) print('total steps: {0}, episodic rewards: {1}'.format(log['steps'], log['rewards'])) for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']: print('{:s}: {:.2g}({:.2g})'.format(key, log[key + '_mean'], log[key + '_std'])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped manually if self.killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break self.killer.kill_now = False # if (i+1)%20 == 0: # print('episode: ', i+1) # print('average steps', np.average(steps)) # print('average rewards', np.average(rewards)) # save weights self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) plt.savefig(OUTPATH + 'train.png')
class Policy(): def __init__(self, name, obs_dim, act_dim, n_ways, batch_size, log_path, gamma=0.995, lam=0.98, kl_targ=0.003, hid1_mult=10, policy_logvar=1.0): self.name = name self.obs_dim, self.act_dim = obs_dim, act_dim self.n_ways = n_ways self.batch_size = batch_size self.gamma = gamma self.lam = lam self.kl_targ = kl_targ self.hid1_mult = hid1_mult self.policy_logvar = policy_logvar self.logger = Logger(logname=os.path.join(log_path, name), now=datetime.utcnow().strftime("%b_%d_%H_%M_%S")) self.scaler = Scaler(self.obs_dim) self.val_func = NNValueFunction(self.obs_dim, hid1_mult=10) self.trpo_net = TrpoNet(name, self.obs_dim, self.act_dim, n_ways=n_ways, kl_targ=kl_targ, hid1_mult=hid1_mult, policy_logvar=policy_logvar) self.trajectories = [] self.episode = 0 def update_scaler(self, unscaled): self.scaler.update( unscaled) # update running statistics for scaling observations def update(self, unscaled_obs, actions, rewards, env_idx=-1, trainWeight=False): scale, offset = self.scaler.get() scale[-1] = 1.0 offset[-1] = 0.0 observes = (unscaled_obs - offset) * scale trajectory = { 'observes': observes, 'actions': actions, 'rewards': rewards, 'unscaled_obs': unscaled_obs } self.trajectories.append(trajectory) if len(self.trajectories) > self.batch_size: unscaled = np.concatenate( [t['unscaled_obs'] for t in self.trajectories]) self.scaler.update( unscaled) # update running statistics for scaling observations self.logger.log({ '_{}_MeanReward'.format(self.name): np.mean([t['rewards'].sum() for t in self.trajectories]), '_{}_steps'.format(self.name): unscaled.shape[0] / self.batch_size }) trajs = copy.deepcopy(self.trajectories) self.trajectories = [] self.episode += len(trajs) self._add_value(trajs, self.val_func) # add estimated values to episodes self._add_disc_sum_rew( trajs, self.gamma) # calculated discounted sum of Rs self._add_gae(trajs, self.gamma, self.lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = self._build_train_set( trajs) self._log_batch_stats(observes, actions, advantages, disc_sum_rew, self.logger, self.episode) self.trpo_net.update(observes, actions, advantages, env_idx, self.logger, trainWeight=trainWeight) # update policy self.val_func.fit(observes, disc_sum_rew, self.logger) # update value function self.logger.write(display=False) def act(self, unscaled_obs): scale, offset = self.scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature #print(self.name,unscaled_obs.shape,len(offset)) obs = (unscaled_obs - offset) * scale action = self.trpo_net.sample(obs).reshape((1, -1)).astype(np.float32) return action def addway(self): self.n_ways += 1 var_dict = self.trpo_net.get_vars() new_pi = TrpoNet(self.name, self.obs_dim, self.act_dim, self.n_ways, self.kl_targ, self.hid1_mult, self.policy_logvar) new_pi.set_vars(var_dict) self.trpo_net.close_sess() self.trpo_net = new_pi gc.collect() def close_session(self): self.val_func.close_sess() self.trpo_net.close_sess() def _discount(self, x, gamma): """ Calculate discounted forward sum of a sequence at each point """ return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1] def _add_value(self, trajectories, val_func): """ Adds estimated value to all time steps of all trajectories Args: trajectories: as returned by run_policy() val_func: object with predict() method, takes observations and returns predicted state value Returns: None (mutates trajectories dictionary to add 'values') """ for 
trajectory in trajectories: observes = trajectory['observes'] values = val_func.predict(observes) trajectory['values'] = values def _add_disc_sum_rew(self, trajectories, gamma): """ Adds discounted sum of rewards to all time steps of all trajectories Args: trajectories: as returned by run_policy() gamma: discount Returns: None (mutates trajectories dictionary to add 'disc_sum_rew') """ for trajectory in trajectories: if gamma < 0.999: # don't scale for gamma ~= 1 rewards = trajectory['rewards'] * (1 - gamma) else: rewards = trajectory['rewards'] disc_sum_rew = self._discount(rewards, gamma) trajectory['disc_sum_rew'] = disc_sum_rew def _add_gae(self, trajectories, gamma, lam): """ Add generalized advantage estimator. https://arxiv.org/pdf/1506.02438.pdf Args: trajectories: as returned by run_policy(), must include 'values' key from add_value(). gamma: reward discount lam: lambda (see paper). lam=0 : use TD residuals lam=1 : A = Sum Discounted Rewards - V_hat(s) Returns: None (mutates trajectories dictionary to add 'advantages') """ for trajectory in trajectories: if gamma < 0.999: # don't scale for gamma ~= 1 rewards = trajectory['rewards'] * (1 - gamma) else: rewards = trajectory['rewards'] values = trajectory['values'] # temporal differences tds = rewards - values + np.append(values[1:] * gamma, 0) advantages = self._discount(tds, gamma * lam) trajectory['advantages'] = advantages def _build_train_set(self, trajectories): """ Args: trajectories: trajectories after processing by add_disc_sum_rew(), add_value(), and add_gae() Returns: 4-tuple of NumPy arrays observes: shape = (N, obs_dim) actions: shape = (N, act_dim) advantages: shape = (N,) disc_sum_rew: shape = (N,) """ observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) disc_sum_rew = np.concatenate( [t['disc_sum_rew'] for t in trajectories]) advantages = np.concatenate([t['advantages'] for t in trajectories]) # normalize advantages advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) return observes, actions, advantages, disc_sum_rew def _log_batch_stats(self, observes, actions, advantages, disc_sum_rew, logger, episode): """ Log various batch statistics """ logger.log({ '_mean_obs': np.mean(observes), '_min_obs': np.min(observes), '_max_obs': np.max(observes), '_std_obs': np.mean(np.var(observes, axis=0)), '_mean_act': np.mean(actions), '_min_act': np.min(actions), '_max_act': np.max(actions), '_std_act': np.mean(np.var(actions, axis=0)), '_mean_adv': np.mean(advantages), '_min_adv': np.min(advantages), '_max_adv': np.max(advantages), '_std_adv': np.var(advantages), '_mean_discrew': np.mean(disc_sum_rew), '_min_discrew': np.min(disc_sum_rew), '_max_discrew': np.max(disc_sum_rew), '_std_discrew': np.var(disc_sum_rew), '_Episode': episode })
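# --- Illustrative sketch (not part of the class above) ----------------------
# Sanity check that the lfilter expression in Policy._discount() is the usual
# discounted forward sum, by comparing it against an explicit backward loop.
# Toy inputs only.
import numpy as np
import scipy.signal


def discount_lfilter(x, gamma):
    # same expression as Policy._discount()
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def discount_loop(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running
        out[i] = running
    return out


x = np.array([1.0, 0.0, 2.0, 3.0])
print(np.allclose(discount_lfilter(x, 0.99), discount_loop(x, 0.99)))  # True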
class Agent:
    # Warning! policy.py and critic.py are still work in progress and contain many global variables
    # that should be converted to class member variables. Before that is done, all instances of Agent
    # must use the same values for the following:
    # PPOepsilon, nHidden, nUnitsPerLayer, activation, H, entropyLossWeight, sdLowLimit
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.array,
                 actionMax: np.array,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        # Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  # with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  # if True, we use PPO's clipped surrogate loss instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  # policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  # if True, we use PPO's clipped surrogate loss instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt = False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ValueError("Unknown mode {}".format(mode))
        self.policy = Policy(stateDim,
                             actionDim,
                             actionMin,
                             actionMax,
                             entropyLossWeight=PPOentropyLossWeight,
                             networkActivation=activation,
                             networkDepth=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkSkips=False,
                             learningRate=learningRate,
                             minSigma=sdLowLimit,
                             PPOepsilon=PPOepsilon,
                             usePPOLoss=usePPOLoss,
                             separateVarAdapt=separateVarAdapt,
                             nHistory=nHistory,
                             useSigmaSoftClip=useSigmaSoftClip,
                             piEpsilon=piEpsilon,
                             negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)
        # Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited
        # and the value estimates thus depend on simulation time. Thus, we use the time step as an
        # additional feature for the critic.
        # Note that this does not mess up generalization, as the feature is not used for the policy
        # during training or at runtime.
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")
        # Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    # call this after TensorFlow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        # Pretrain the policy to output the initial Gaussian for all states
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)

    # stateObs is an n-by-m tensor, where n = number of observations, m = number of observation variables
    def act(self, sess: tf.Session, stateObs: np.array, deterministic=False, clipActionToLimits=True):
        # Expand a single 1d-observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        # Query the policy for the action, except for the first iteration where we sample directly from
        # the initial exploration Gaussian that covers the whole action space.
        # This is done because we don't know the scale of state observations a priori; thus, we can only
        # init the state scaler in update(), after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(actions,
                                  np.reshape(self.actionMin, [1, self.actionDim]),
                                  np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float, nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 + self.policy.usedSigmaSumCounter)

    # If you call memorize() after each action, you can update the agent with this method.
    # If you handle the experience buffers yourself, e.g., due to a multithreaded implementation,
    # use the update() method instead.
    def updateWithMemorized(self, sess: tf.Session, batchSize: int = 512, nBatches: int = 100,
                            verbose=True, valuesValid=False, timestepsValid=False):
        self.update(sess,
                    experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize,
                    nBatches=nBatches,
                    verbose=verbose,
                    valuesValid=valuesValid,
                    timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    # experienceTrajectories is a list of lists of Experience instances such that each of the contained
    # lists corresponds to an episode simulation trajectory
    def update(self, sess: tf.Session, experienceTrajectories, batchSize: int = 512, nBatches: int = 100,
               verbose=True, valuesValid=False, timestepsValid=False):
        trajectories = experienceTrajectories  # shorthand

        # Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            # propagate values backwards along trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    # value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[i].r + self.gamma * trajectory[i + 1].V
            # update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1

        # Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1

        # Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates)

        # Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)
        self.critic.train(sess,
                          augmentCriticObs(scaledStates, allTimes),
                          allValues,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          verbose=verbose)

        # Policy training needs advantages, which depend on the critic we just trained.
        # We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages for {} trajectories...".format(len(trajectories)))
        for t in trajectories:
            # query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess, augmentCriticObs(states, timeSteps))

            # GAE loop, i.e., take the instantaneous advantage (how much value a single action brings,
            # assuming that the values given by the critic are unbiased), and smooth those along the
            # trajectory using a 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step + 1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[step + 1].advantage

        # Gather the advantages to a linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1
        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd {}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd

        # Train policy. Note that this uses original unscaled states, because the PPO-CMA variance
        # training needs a history of states in the same scale.
        self.policy.train(sess,
                          allStates,
                          allActions,
                          allAdvantages,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          stateOffset=offset,
                          stateScale=scale,
                          verbose=verbose)
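# --- Standalone sketch (not from the original code): the GAE smoothing described in the comments
# above, restated on plain NumPy arrays instead of Experience objects. Here `values` must contain one
# extra bootstrap entry (the critic's estimate for the state after the last step), matching the
# nSteps + 1 array built in update(). Unlike the class code, this version sweeps all T steps and keeps
# the running advantage in a local variable rather than on the previous Experience.
import numpy as np


def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """rewards: shape (T,); values: shape (T + 1,), critic estimates including the bootstrap value."""
    T = len(rewards)
    advantages = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # instantaneous (TD) advantage
        running = delta + gamma * lam * running                 # 1st-order IIR smoothing along the trajectory
        advantages[t] = running
    return advantages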
class GeneratorAgentPure(object):
    def __init__(self, env, policy_function, value_function, discriminator, gamma, lam,
                 init_qpos, init_qvel, logger=None):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]
        self.policy = policy_function
        self.value = value_function
        self.discriminator = discriminator
        self.gamma = gamma
        self.lam = lam
        self.init_qpos = init_qpos
        self.init_qvel = init_qvel
        self.scaler = Scaler(self.obs_dim)
        # logger
        self.logger = logger
        # set the scaler's scale and offset by collecting an initial batch of 2048 timesteps
        self.collect(timesteps=2048)

    def discount(self, x, gamma):
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    def get_random(self):
        idx = np.random.randint(low=0, high=self.init_qpos.shape[1], size=1)
        return np.squeeze(self.init_qpos[:, idx]), np.squeeze(self.init_qvel[:, idx])

    def collect(self, timesteps):
        trajectories = []
        trew_stat = []
        scale, offset = self.scaler.get()
        self.logger.log('scale_offset', [scale, offset])
        buffer_time = 0
        while buffer_time < timesteps:
            unscaled_obs, scaled_obs, actions, rewards = [], [], [], []
            egocentric = []
            done = False
            obs = self.env.reset()
            qpos, qvel = self.get_random()  # we are setting initial qpos and qvel from the expert data
            self.env.set_state(qpos, qvel)
            timestep = 0
            while not done and timestep < 1000:
                obs = obs.astype(np.float32).reshape(1, -1)
                unscaled_obs.append(obs)
                obs = (obs - offset) * scale
                scaled_obs.append(obs)
                acts = self.policy.sample(obs)
                actions.append(acts.astype(np.float32).reshape(1, -1))
                obs, rew, done, _ = self.env.step(acts)
                rewards.append(rew)
                timestep += 1
                buffer_time += 1
            # statistics
            trew_stat.append(np.sum(rewards))
            # episode info
            traj_obs = np.concatenate(scaled_obs)
            traj_unscaled_obs = np.concatenate(unscaled_obs)
            traj_acts = np.concatenate(actions)
            # traj_rews = np.array(rewards, dtype=np.float64)
            traj_rews = np.squeeze(self.discriminator.get_rewards(traj_unscaled_obs, traj_acts))
            # scale rewards using running std of the experiment
            # traj_scaled_rews = traj_rews * np.squeeze(rew_scale)
            traj_scaled_rews = traj_rews
            # calculate discounted sum of rewards
            traj_disc_rews = self.discount(traj_scaled_rews, self.gamma)
            # calculate advantages
            traj_values = self.value.predict(traj_obs)
            deltas = traj_scaled_rews - traj_values + np.append(traj_values[1:] * self.gamma, 0)
            traj_advantages = self.discount(deltas, self.gamma * self.lam)
            trajectory = {
                'observations': traj_obs,
                'actions': traj_acts,
                'tdlam': traj_disc_rews,
                'advantages': traj_advantages,
                'unscaled_obs': traj_unscaled_obs
            }
            trajectories.append(trajectory)
        # update observation scaler
        uns_obs = np.concatenate([t['unscaled_obs'] for t in trajectories])
        self.scaler.update(uns_obs)
        # update rewards scaler
        # uns_rews = np.concatenate([t['unscaled_rews'] for t in trajectories])
        # self.rew_scaler.update(uns_rews)
        observations = np.concatenate([t['observations'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        tdlam = np.concatenate([t['tdlam'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)
        # check stats
        print('mean_trew: %f' % np.mean(trew_stat))
        self.logger.log('trew_stat', np.mean(trew_stat))
        return observations, uns_obs, actions, tdlam, advantages
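# --- Standalone sketch (not from the original code): the agents in this file assume a running-statistics
# Scaler whose get() returns (scale, offset) such that (obs - offset) * scale approximately standardizes
# observations, and whose update() ingests batches of unscaled observations. RunningScaler below is a
# hypothetical, Welford-style stand-in for that interface; the actual Scaler class used here may differ
# in details (e.g., the epsilon added to the standard deviation).
import numpy as np


class RunningScaler:
    def __init__(self, obs_dim):
        self.n = 0
        self.mean = np.zeros(obs_dim)
        self.m2 = np.zeros(obs_dim)

    def update(self, x):
        """x: array of shape (batch, obs_dim) with unscaled observations."""
        for row in np.atleast_2d(x):
            self.n += 1
            delta = row - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (row - self.mean)

    def get(self):
        var = self.m2 / max(self.n - 1, 1)
        scale = 1.0 / (np.sqrt(var) + 0.1)  # small constant keeps the scale finite for near-constant features
        return scale, self.mean             # offset is the running mean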
class Agent:
    def __init__(self, env, name, chief=None):
        assert name == 'chief' or 'worker' in name
        if 'worker' in name:
            assert chief is not None
            self.chief = chief
        else:
            self.scaler = Scaler(Config.data.state_dim)
        self.name = name
        self.env = env
        self.sess = None
        self.coord = None
        with tf.variable_scope(name):
            self._build_graph()

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()
        if 'worker' in self.name:
            self._build_update_op()

    def _build_update_op(self):
        global_step = tf.train.get_global_step()
        tf.assign_add(global_step, 1, name='global_step_add')
        with tf.variable_scope('sync'):
            with tf.variable_scope('pull'):
                pull_a_params_op = [
                    actor_param.assign(chief_param)
                    for actor_param, chief_param in zip(self.actor.params, self.chief.actor.params)
                ]
                pull_c_params_op = [
                    critic_param.assign(chief_param)
                    for critic_param, chief_param in zip(self.critic.params, self.chief.critic.params)
                ]
                self.pull_op = tf.group(pull_a_params_op + pull_c_params_op)
            with tf.variable_scope('push'):
                update_a_op = self.chief.actor.optimizer.apply_gradients(
                    zip(self.actor.grads, self.chief.actor.params))
                update_c_op = self.chief.critic.optimizer.apply_gradients(
                    zip(self.critic.grads, self.chief.critic.params))
                self.update_op = tf.group([update_a_op, update_c_op])

    def init_scaler(self, init_episode=5):
        for e in range(init_episode):
            observation = self.env.reset()
            states = []
            done = False
            count = 0
            while not done:
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                observation = next_observation
                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
            self.scaler.update(np.array(states))

    def update_chief(self, states, actions, target_v):
        feed_dict = {self.critic.states: states}
        value = self.sess.run(self.critic.value, feed_dict)
        td_error = np.array(target_v) - value
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v,
            self.actor.states: states,
            self.actor.actions: actions,
            self.actor.td_error: td_error
        }
        self.sess.run([
            self.critic.loss, self.update_op,
            self.name + '/global_step_add:0'
        ], feed_dict)

    def pull_params(self):
        self.sess.run(self.pull_op)

    def cal_target_v(self, done, next_observation, rewards):
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value,
                {self.critic.states: [self.chief.scaler.normalize(next_observation)]})[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v

    def choose_action(self, observation):
        if Config.data.action_type == 'discrete':
            policy = self.sess.run(self.actor.policy, {self.actor.states: [observation]})[0]
            action = np.random.choice(range(Config.data.action_num), p=policy)
        else:
            action = self.sess.run(self.actor.sample, {self.actor.states: [observation]})
        return action

    def eval(self, animate=False):
        assert self.name == 'chief'
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation
            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        return ep_reward

    def work(self):
        total_step = 0
        states, actions, rewards, unscaled_states = [], [], [], []
        self.pull_params()
        while not self.coord.should_stop():
            observation = self.env.reset()
            ep_reward = 0
            done = False
            count = 0
            while not done:
                unscaled_states.append(observation)
                observation = self.chief.scaler.normalize(observation)
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                total_step += 1
                ep_reward += reward
                actions.append(action)
                rewards.append(reward)
                if total_step % Config.train.update_n_iter == 0 or done:
                    target_v = self.cal_target_v(done, next_observation, rewards)
                    self.update_chief(states, actions, target_v)
                    self.chief.scaler.update(np.array(unscaled_states))
                    states, actions, rewards, unscaled_states = [], [], [], []
                    self.pull_params()
                observation = next_observation
                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
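# --- Standalone sketch (not from the original code): the n-step bootstrapped targets computed by
# cal_target_v() above, with the critic's bootstrap value passed in directly instead of being queried
# from a TensorFlow session. reward_decay plays the role of Config.train.reward_decay.
def n_step_targets(rewards, bootstrap_value, done, reward_decay=0.9):
    next_value = 0.0 if done else bootstrap_value
    targets = []
    for reward in rewards[::-1]:  # sweep the rollout segment backwards
        next_value = reward + reward_decay * next_value
        targets.append([next_value])
    targets.reverse()
    return targets


# Example: n_step_targets([1.0, 2.0, 3.0], bootstrap_value=4.0, done=False, reward_decay=0.5)
# returns [[3.25], [4.5], [5.0]].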