def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 37  # 26 43 53 32 41
    env = Environment({'scene_name': scene_scope, 'terminal_state_id': int(task_scope)})
    env.reset()
    Policy = Policy_net('policy', env)  # build the actor-critic graph / object
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update object / graph
    # pdb.set_trace()
    D = Discriminator(env)  # GAIL-style discriminator
def main(args): #env.seed(0) env = gym.make('MineRLNavigateDense-v0') ob_space = env.observation_space action_space = env.action_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 render = False for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs['pov']]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs['pov']) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step( [int(act / 3) + 1, act - int(act / 3) * 3]) if (episode_length % 2500 == 0): print(sum(rewards)) if render: env.render() if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() print('done') break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= 1: success_num += 1 render = True if success_num >= 10: saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1, 64, 64, 3]) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
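PPO.get_gaes is called in the loop above but not defined in this snippet. Below is a minimal NumPy sketch of generalized advantage estimation with an explicit lambda parameter; the repository's own version may differ (for example, it may fold lambda into gamma).

import numpy as np

def get_gaes_sketch(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    """GAE sketch: A_t = sum_l (gamma*lam)^l * delta_{t+l},
    where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)."""
    deltas = [r + gamma * v_next - v
              for r, v, v_next in zip(rewards, v_preds, v_preds_next)]
    gaes = list(deltas)
    # accumulate discounted deltas backwards through the trajectory
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return np.asarray(gaes, dtype=np.float32)

# example: a 4-step episode with a zero bootstrap value at the terminal state
rewards = [0.0, 0.0, 0.0, 1.0]
v_preds = [0.5, 0.4, 0.3, 0.2]
v_preds_next = v_preds[1:] + [0.0]
print(get_gaes_sketch(rewards, v_preds, v_preds_next))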
def init_train(args): global writer global sess global Policy global Old_Policy global PPO global Disc global max_iteration global iteration global observation_space global action_space global expert_observations global expert_actions print("###### INITIALIZING ######") max_iteration = args.iteration iteration = 0 # PPO Policy = Policy_net('policy', observation_space) Old_Policy = Policy_net('old_policy', observation_space) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # GAIL Disc = Discriminator(observation_space) # read trajectories expert_observations = [] expert_actions = [] #for data balancing cnt_zero_trj = 0 ZERO_LIMIT = 300 #limit zero trajectory size cnt_left_trj = 0 LEFT_LIMIT = 776 cnt_right_trj = 0 #profiles = [] # center_img, left_img, right_img, wheel_angle, acc, break, speed for _dir in os.listdir(args.trjdir): raw_filename = os.path.join(os.getcwd(), args.trjdir, _dir, 'driving_log.csv') with open(raw_filename) as csvfile: reader = csv.reader(csvfile) for row in reader: # each row is a list if float(row[3]) == 0.0: #check zero(go straght) if cnt_zero_trj <= ZERO_LIMIT: cnt_zero_trj += 1 expert_observations.append( np.squeeze(image_to_feature(row[0]))) expert_actions.append(round(float(row[3]), 2)) elif float(row[3]) < 0.0: #check minus(left turn) if cnt_left_trj <= LEFT_LIMIT: cnt_left_trj += 1 expert_observations.append( np.squeeze(image_to_feature(row[0]))) expert_actions.append(round(float(row[3]), 2)) else: #plus(right turn) cnt_right_trj += 1 expert_observations.append( np.squeeze(image_to_feature(row[0]))) expert_actions.append(round(float(row[3]), 2)) print("###### READ TRAJECTORY: {} ######".format(len(expert_actions))) print("center:{}, left:{}, right:{}".format(cnt_zero_trj, cnt_left_trj, cnt_right_trj)) # import matplotlib.pyplot as plt # plt.hist(expert_actions, bins=20) # plt.ylabel('Probability'); # plt.xlabel('Weight') # plt.show() # return # initialize Tensorflow sess = tf.Session() writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) if os.path.isfile(args.savedir + '/model.ckpt.meta') == True: print("###### LOAD SAVED MODEL !!!!! ######") saver = tf.train.Saver() saver.restore(sess, args.savedir + '/model.ckpt') extract_agent_trajectory()
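The straight/left/right limits above cap how many rows of each class enter the expert buffer by keeping the first N seen. A small, self-contained sketch of the same per-class capping idea (hypothetical helper; it shuffles before capping, which the loop above does not):

import random
from collections import defaultdict

def cap_per_class(samples, label_fn, caps, default_cap=None, seed=0):
    """Keep at most caps[label] samples per label; label_fn maps a sample to its label.
    Labels missing from caps fall back to default_cap (None = unlimited)."""
    random.seed(seed)
    random.shuffle(samples)  # so the cap does not just keep the earliest rows
    kept, counts = [], defaultdict(int)
    for s in samples:
        label = label_fn(s)
        cap = caps.get(label, default_cap)
        if cap is None or counts[label] < cap:
            counts[label] += 1
            kept.append(s)
    return kept, dict(counts)

# hypothetical steering angles: 0.0 = straight, <0 = left, >0 = right
angles = [0.0] * 500 + [-0.3] * 900 + [0.25] * 400
label = lambda a: 'zero' if a == 0.0 else ('left' if a < 0 else 'right')
balanced, counts = cap_per_class(angles, label, caps={'zero': 300, 'left': 776})
print(counts)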
class HierNetwork(object): def __init__(self, sess=None, summary_writer=tf.summary.FileWriter("logs/"), rl_training=False, reuse=False, cluster=None, index=0, device='/gpu:0', policy_path=None, ppo_load_path=None, dynamic_load_path=None, ppo_save_path=None, dynamic_save_path=None,): self.system = platform.system() if policy_path is not None: self.policy_model_path_load = policy_path self.policy_model_path_save = policy_path else: self.policy_model_path_load = ppo_load_path + "probe" self.policy_model_path_save = ppo_save_path + "probe" self.dynamic_load_path = dynamic_load_path self.dynamic_save_path = dynamic_save_path self.rl_training = rl_training self.use_norm = True self.reuse = reuse self.sess = sess self.cluster = cluster self.index = index self.device = device self._create_graph() self.rl_saver = tf.train.Saver() self.summary_writer = summary_writer def initialize(self): init_op = tf.global_variables_initializer() self.sess.run(init_op) def reset_old_network(self): self.policy_ppo.assign_policy_parameters() self.policy_ppo.reset_mean_returns() self.sess.run(self.results_sum.assign(0)) self.sess.run(self.game_num.assign(0)) def _create_graph(self): if self.reuse: tf.get_variable_scope().reuse_variables() assert tf.get_variable_scope().reuse worker_device = "/job:worker/task:%d" % self.index + self.device print("worker_device:", worker_device) with tf.device(tf.train.replica_device_setter(worker_device=worker_device, cluster=self.cluster)): self.results_sum = tf.get_variable(trainable=False, name="results_sum", shape=[], initializer=tf.zeros_initializer) self.game_num = tf.get_variable(trainable=False, name="game_num", shape=[], initializer=tf.zeros_initializer) self.global_steps = tf.get_variable(trainable=False, name="global_steps", shape=[], initializer=tf.zeros_initializer) self.mean_win_rate = tf.summary.scalar('mean_win_rate_dis', self.results_sum / self.game_num) self.merged = tf.summary.merge([self.mean_win_rate]) self.dynamic_net = DynamicNetwork('train', self.sess, load_path=self.dynamic_load_path, save_path=self.dynamic_save_path) scope = "PolicyNN" with tf.variable_scope(scope): ob_space = C._SIZE_SIMPLE_INPUT act_space_array = C._SIZE_MAX_ACTIONS self.policy = Policy_net('policy', self.sess, ob_space, act_space_array) self.policy_old = Policy_net('old_policy', self.sess, ob_space, act_space_array) self.policy_ppo = PPOTrain('PPO', self.sess, self.policy, self.policy_old, epoch_num=P.src_epoch_num) var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) self.policy_saver = tf.train.Saver(var_list=var_list) def Update_result(self, result_list): self.sess.run(self.results_sum.assign_add(result_list.count(1))) self.sess.run(self.game_num.assign_add(len(result_list))) def Update_summary(self, counter): print("Update summary........") policy_summary = self.policy_ppo.get_summary_dis() self.summary_writer.add_summary(policy_summary, counter) summary = self.sess.run(self.merged) self.summary_writer.add_summary(summary, counter) print("counter:", counter) self.sess.run(self.global_steps.assign(counter)) print("Update summary finished!") def Update_policy(self, buffer): #print('gobal buffer length:', len(buffer.observations)) self.policy_ppo.ppo_train_dis(buffer.observations, buffer.tech_actions, buffer.rewards, buffer.values, buffer.values_next, buffer.gaes, buffer.returns) def Update_internal_model(self, buffer): self.dynamic_net.model_train_dis(buffer.observations, buffer.tech_actions, buffer.next_observations) def get_global_steps(self): return 
int(self.sess.run(self.global_steps)) def save_policy(self): self.policy_saver.save(self.sess, self.policy_model_path_save) print("policy has been saved in", self.policy_model_path_save) def restore_policy(self): self.policy_saver.restore(self.sess, self.policy_model_path_load) print("Restore policy from", self.policy_model_path_load) def restore_dynamic(self, model_path): self.dynamic_net.restore_sl_model(model_path) print("Restore internal_model")
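The worker_device string in _create_graph is built by %-formatting the task index and then appending the device suffix; % binds tighter than +, so the index is substituted first. A quick standalone check of the composed string (plain Python, no TensorFlow required):

index, device = 0, '/gpu:0'
worker_device = "/job:worker/task:%d" % index + device
print(worker_device)  # -> /job:worker/task:0/gpu:0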
def main(args): # prepare log dir if not os.path.exists(args.logdir): os.makedirs(args.logdir) if not os.path.exists(args.savedir): os.makedirs(args.savedir) # gym環境作成 env = gym.make("CartPole-v0") env.seed(0) ob_space = env.observation_space # policy net Policy = Policy_net("policy", env) Old_Policy = Policy_net("old_policy", env) # ppo学習インスタンス PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # tensorflow saver saver = tf.train.Saver() # session config config = tf.ConfigProto( gpu_options=tf.GPUOptions(visible_device_list=args.gpu_num, allow_growth=True) ) # start session with tf.Session(config=config) as sess: # summary writer writer = tf.summary.FileWriter(args.logdir, sess.graph) # Sessionの初期化 sess.run(tf.global_variables_initializer()) # 状態の初期化 obs = env.reset() # episodeの成功回数 success_num = 0 # episode loop for iteration in tqdm(range(args.iteration)): # episodeのtrajectory配列 # buffer observations = [] actions = [] v_preds = [] rewards = [] # episodeのstep回数 episode_length = 0 # run episode while True: episode_length += 1 # プレースホルダー用に変換 obs = np.stack([obs]).astype(dtype=np.float32) # 行動と状態価値を推定 act, v_pred = Policy.act(obs=obs, stochastic=True) # 要素数が1の配列をスカラーに変換 act = np.asscalar(act) v_pred = np.asscalar(v_pred) # policyによる行動で状態を更新 next_obs, reward, done, info = env.step(act) # episodeの各変数を追加 # (s_t, a_t, v_t, r_t) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) # episode終了判定 # episodeが終了していたら次のepisodeを開始 if done: # v_t+1の配列 v_preds_next = v_preds[1:] + [0] obs = env.reset() reward = -1 break else: obs = next_obs # summary追加 writer.add_summary( tf.Summary( value=[ tf.Summary.Value( tag="episode_length", simple_value=episode_length ) ] ), iteration, ) writer.add_summary( tf.Summary( value=[ tf.Summary.Value( tag="episode_reward", simple_value=sum(rewards) ) ] ), iteration, ) # episode成功判定 if sum(rewards) >= 195: success_num += 1 # 連続で100回成功していればepisode loopを終了 if success_num >= 100: saver.save(sess, args.savedir + "/model.ckpt") print("Clear!! Model saved.") break else: success_num = 0 # policy netによるtrajectryをプレースホルダー用に変換 observations = np.reshape( observations, newshape=[-1] + list(ob_space.shape) ) actions = np.array(actions).astype(dtype=np.int32) # rewardsをプレースホルダー用に変換 rewards = np.array(rewards).astype(dtype=np.float32) # gaesの取得 gaes = PPO.get_gaes( rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next ) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # エージェントのexperience inp = [observations, actions, gaes, rewards, v_preds_next] # Old_Policyにパラメータを代入 PPO.assign_policy_parameters() # PPOの学習 for epoch in range(6): # 学習データサンプル用のインデックスを取得 sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32 ) # PPO学習データをサンプル sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] PPO.train( obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4], ) # summaryの取得 summary = PPO.get_summary( obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4], ) writer.add_summary(summary, iteration) writer.close()
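v_preds_next in this loop is simply v_preds shifted left by one step, with 0 appended because the terminal state's value is defined as zero. A one-line check of that construction:

v_preds = [0.9, 0.7, 0.4, 0.1]
v_preds_next = v_preds[1:] + [0]
assert v_preds_next == [0.7, 0.4, 0.1, 0]
print(v_preds_next)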
def main(args): # init directories if not os.path.isdir(args.logdir): os.mkdir(args.logdir) if not os.path.isdir(args.logdir + '/' + args.env): os.mkdir(args.logdir + '/' + args.env) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer) args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer if not os.path.isdir(args.savedir): os.mkdir(args.savedir) if not os.path.isdir(args.savedir + '/' + args.env): os.mkdir(args.savedir + '/' + args.env) if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer) args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer args.tradir = args.tradir + '/' + args.env + '/' + args.optimizer # init classes env = gym.make(args.env) env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env, args.env) Old_Policy = Policy_net('old_policy', env, args.env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer) D = Discriminator(env, args.env, _optimizer=args.optimizer) expert_observations = np.genfromtxt(args.tradir + '/observations.csv') expert_actions = np.genfromtxt(args.tradir + '/actions.csv', dtype=np.int32) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 # do NOT use rewards to update policy success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 while True: run_policy_steps += 1 obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) rewards.append(reward) v_preds.append(v_pred) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [0] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) print('iteration:', iteration, ',rewards:', sum(rewards)) if iteration == (args.iteration - 1): saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! 
Model saved.') break # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) # train discriminator for i in range(2): D.train(expert_s=expert_observations, expert_a=expert_actions, agent_s=observations, agent_a=actions) # output of this discriminator is reward d_rewards = D.get_rewards(agent_s=observations, agent_a=actions) d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # train policy inp = [observations, actions, gaes, d_rewards, v_preds_next] PPO.assign_policy_parameters() for epoch in range(6): sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) # indices are in [low, high) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
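D.get_rewards is the GAIL-specific piece here, and the Discriminator class itself is not shown. The sketch below only illustrates the two reward transforms commonly applied to discriminator outputs, log(D) and -log(1-D); the actual class may use either, or something else entirely.

import numpy as np

def discriminator_rewards_sketch(d_agent_prob, eps=1e-8):
    """Map discriminator outputs D(s, a) in (0, 1) -- the estimated probability that a
    state-action pair came from the expert -- to surrogate rewards."""
    d = np.clip(np.asarray(d_agent_prob, dtype=np.float32), eps, 1.0 - eps)
    return np.log(d), -np.log(1.0 - d)

agent_probs = [0.1, 0.5, 0.9]          # hypothetical D(s, a) values for agent pairs
log_d, neg_log_1md = discriminator_rewards_sketch(agent_probs)
print(log_d)        # expert-like pairs get rewards near 0, others strongly negative
print(neg_log_1md)  # alternative form: larger positive reward for expert-like pairs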
class MiniNetwork(object): def __init__(self, sess=None, summary_writer=tf.summary.FileWriter("logs/"), rl_training=False, reuse=False, cluster=None, index=0, device='/gpu:0', ppo_load_path=None, ppo_save_path=None): self.policy_model_path_load = ppo_load_path + "mini" self.policy_model_path_save = ppo_save_path + "mini" self.rl_training = rl_training self.use_norm = True self.reuse = reuse self.sess = sess self.cluster = cluster self.index = index self.device = device self._create_graph() self.rl_saver = tf.train.Saver() self.summary_writer = summary_writer def initialize(self): init_op = tf.global_variables_initializer() self.sess.run(init_op) def reset_old_network(self): self.policy_ppo.assign_policy_parameters() self.policy_ppo.reset_mean_returns() self.sess.run(self.results_sum.assign(0)) self.sess.run(self.game_num.assign(0)) def _create_graph(self): if self.reuse: tf.get_variable_scope().reuse_variables() assert tf.get_variable_scope().reuse worker_device = "/job:worker/task:%d" % self.index + self.device with tf.device( tf.train.replica_device_setter(worker_device=worker_device, cluster=self.cluster)): self.results_sum = tf.get_variable( name="results_sum", shape=[], initializer=tf.zeros_initializer) self.game_num = tf.get_variable(name="game_num", shape=[], initializer=tf.zeros_initializer) self.global_steps = tf.get_variable( name="global_steps", shape=[], initializer=tf.zeros_initializer) self.win_rate = self.results_sum / self.game_num self.mean_win_rate = tf.summary.scalar( 'mean_win_rate_dis', self.results_sum / self.game_num) self.merged = tf.summary.merge([self.mean_win_rate]) mini_scope = "MiniPolicyNN" with tf.variable_scope(mini_scope): ob_space = _SIZE_MINI_INPUT act_space_array = _SIZE_MINI_ACTIONS self.policy = Policy_net('policy', self.sess, ob_space, act_space_array) self.policy_old = Policy_net('old_policy', self.sess, ob_space, act_space_array) self.policy_ppo = PPOTrain('PPO', self.sess, self.policy, self.policy_old, lr=P.mini_lr, epoch_num=P.mini_epoch_num) var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) self.policy_saver = tf.train.Saver(var_list=var_list) def Update_result(self, result_list): win = 0 for i in result_list: if i > 0: win += 1 self.sess.run(self.results_sum.assign_add(win)) self.sess.run(self.game_num.assign_add(len(result_list))) def Update_summary(self, counter): print("Update summary........") policy_summary = self.policy_ppo.get_summary_dis() self.summary_writer.add_summary(policy_summary, counter) summary = self.sess.run(self.merged) self.summary_writer.add_summary(summary, counter) self.sess.run(self.global_steps.assign(counter)) print("Update summary finished!") steps = int(self.sess.run(self.global_steps)) win_game = int(self.sess.run(self.results_sum)) all_game = int(self.sess.run(self.game_num)) win_rate = win_game / float(all_game) return steps, win_rate def get_win_rate(self): return float(self.sess.run(self.win_rate)) def Update_policy(self, buffer): self.policy_ppo.ppo_train_dis(buffer.observations, buffer.tech_actions, buffer.rewards, buffer.values, buffer.values_next, buffer.gaes, buffer.returns, verbose=False) def get_global_steps(self): return int(self.sess.run(self.global_steps)) def save_policy(self): self.policy_saver.save(self.sess, self.policy_model_path_save) print("policy has been saved in", self.policy_model_path_save) def restore_policy(self): self.policy_saver.restore(self.sess, self.policy_model_path_load) print("Restore policy from", self.policy_model_path_load)
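Update_result and the win_rate tensor above amount to two scalar counters and a division. A plain-Python analogue of that bookkeeping, handy for sanity checks without a TensorFlow session:

class WinRateCounter(object):
    def __init__(self):
        self.results_sum = 0.0
        self.game_num = 0.0

    def update_result(self, result_list):
        # mirror Update_result: results > 0 count as wins
        self.results_sum += sum(1 for r in result_list if r > 0)
        self.game_num += len(result_list)

    def win_rate(self):
        return self.results_sum / self.game_num if self.game_num else 0.0

c = WinRateCounter()
c.update_result([1, -1, 1, 0])
print(c.win_rate())  # 0.5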
Old_Policy = SNGANPolicy('old_policy', obs_shape=obs_shape, batch_size=args.batch_size, decode=True)

# Build reinforcement agent
if args.algo == 'ppo':
    print('Building PPO Agent')
    Agent = PPOTrain(Policy, Old_Policy, obs_shape=obs_shape, gamma=args.gamma,
                     c_vf=args.c_vf, c_entropy=args.c_entropy, c_l1=args.c_l1,
                     obs_size=args.obs_size, vf_clip=args.vf_clip, optimizer=args.g_optimizer)
elif args.algo == 'trpo':
    print('Building TRPO Agent')
    Agent = TRPOTrain(Policy, Old_Policy, obs_shape=obs_shape, gamma=args.gamma,
                      c_vf=args.c_vf, c_entropy=args.c_entropy, c_l1=args.c_l1,
def main(args): # env = gym.make('CartPole-v0') # env.seed(0) env = CustomEnv() ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=gamma) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(log_path, sess.graph) sess.run(tf.global_variables_initializer()) obs, acs, target_video = env.reset() success_num = 0 for iteration in range(iterations): observations = [] actions = [] pred_actions = [] rewards = [] v_preds = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.array([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs acs = np.array([acs]).astype(dtype=np.float32) pred_act, v_pred = Policy.act(obs=obs, acs=acs, stochastic=True) # act = np.asscalar(act) v_pred = np.asscalar(v_pred) next_obs, reward, done, info = env.step(acs) observations.append(obs) actions.append(acs) pred_actions.append(pred_act) rewards.append(reward) v_preds.append(v_pred) if done: next_obs = np.stack([next_obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs _, v_pred = Policy.act(obs=next_obs, stochastic=True) v_preds_next = v_preds[1:] + [np.asscalar(v_pred)] obs, acs, target_video = env.reset() break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, weight_path + '/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder # observations = np.reshape(observations, newshape=[-1,] + list(ob_space.shape)) observations = np.array(observations).astype(dtype=np.float32) actions = np.array(actions).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
def main(args): print(date) energyPolicy_training_data.append("Energy poilcy training") energyPolicy_training_data.append( "Date: " + str(date)) energyPolicy_training_data.append( "Noise type: " + str(args.noise_type)) energyPolicy_training_data.append( "Policy Training max episodes: " + str(args.iteration)) energyPolicy_training_data.append( "Number of iterations the energy model have ben trained: " + str(args.model)) energyPolicy_training_data.append( "PPO gamma: " + str(args.gamma)) energyPolicy_training_data.append( "Do we add noise to sapair for calculating energy " + str(args.sanoise)) energyPolicy_training_data.append( "The noise we add to sapair " + str(args.noise_sigma)) energyPolicy_training_data.append( "h(energy) " + str(args.reward_function)) energyPolicy_training_data.append(" \n\n") env = gym.make('CartPole-v0') Energy = Energy_net('energy', 'CartPole-v0') energy_saver = tf.train.Saver() sapairs = np.genfromtxt('training_data/sapairs.csv') noise_sapairs = np.genfromtxt('training_data/noise_sapairs.csv') with tf.Session() as sess: # writer = tf.summary.FileWriter(args.logdir+'/'+args.alg, sess.graph) sess.run(tf.global_variables_initializer()) if args.model == '': energy_saver.restore( sess, args.modeldir + '/' + args.alg + '/' + args.noise_type + '/' + 'model.ckpt') else: energy_saver.restore( sess, args.modeldir + '/' + args.alg + '/' + args.noise_type + '/' + 'model.ckpt-' + args.model) print("As for model after ", args.model, "training iterations") print("Energy for expert sapairs looks like:", Energy.get_energy(sapairs)) print( "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:", Energy.get_energy(noise_sapairs)) energyPolicy_training_data.append( ["As for model after ", args.model, "training iterations"]) energyPolicy_training_data.append( "Energy for expert sapairs looks like:\n" + str(Energy.get_energy(sapairs))) energyPolicy_training_data.append( "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:\n" + str(Energy.get_energy(noise_sapairs))) energyPolicy_training_data.append(" \n\n\n\n\n\n\n\n\n") energyPolicy_training_data.append( "Done with reloading Energy. Start RL") # writer.close() open_file_and_save( args.logdir + '/' + args.model + "_iter_" + args.noise_type + '_Policy' + date, energyPolicy_training_data) print("Done with reloading Energy. 
Start RL") # Start RL env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver() # writer = tf.summary.FileWriter(args.logdir+'/'+args.noise_type, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 alter_reward = 0 success_num = 0 render = False #ep_reward = [] # 用于记录每个trajectory的数据最后做总结 Summary_after_max_episodes_training = [] Trajectory_rewards = [] Trajectory_alter_rewards = [] Trajectory_success_num = 0 # 与success_num一样,只不过这个不会清零,这个用于评估这个energy对于训练的效果 plot_rewards = [] plot_alter_rewards = [] plot_iteration = [] for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] alter_rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) alter_rewards.append(alter_reward) rewards.append(reward) next_obs, reward, done, info = env.step(act) # alter reward sapair = np.append(obs, np.array([[act]]), axis=1) # print("sapair:",sapair) energy = Energy.get_energy(sapair)[0][0] print("Energy for this sapair", energy) if args.sanoise == True: # 定义 gauss noise 的均值和方差 mu, sigma = 0, args.noise_sigma # 一维guass # saNumber = sapairs.shape[0] saShape = sapair.shape[1] # sampleNo = saNumber * saShape # 采样sampleNo个gauss noise noise = np.random.normal(mu, sigma, saShape) noise_sapair = sapair + noise print("noise_sapair:", noise_sapair) # noise_sapairs = np.reshape(noise_sapairs, newshape=[saNumber, saShape]) noise_energy = Energy.get_energy(noise_sapair)[0][0] print("Noise Energy for this sapair", noise_energy) energy = noise_energy if args.reward_function == "-energy": alter_reward = -energy elif args.reward_function == "-energy+1": alter_reward = -energy + 1 elif args.reward_function == "exp(-energy-1)": alter_reward = np.exp(-energy - 1) elif args.reward_function == "exp(-energy)": alter_reward = np.exp(-energy) else: print("No such reward_function") #alter_reward = np.exp(-energy-1) #alter_reward = -energy+1 #alter_reward = reward #alter_reward = -energy # if render: # env.render() # pass if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 alter_reward = -1 break else: obs = next_obs # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]), iteration) # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration) # if sum(rewards) >= 195: # success_num += 1 # Trajectory_success_num +=1 # render = True # if success_num >= 100: # saver.save(sess, args.savedir + '/model.ckpt') # print('Clear!! 
Model saved.') # break # else: # success_num = 0 sum_rewards = sum(rewards) sum_alter_rewards = sum(alter_rewards) Trajectory_rewards.append(sum_rewards) Trajectory_alter_rewards.append(sum_alter_rewards) #画图 plot_rewards.append(sum_rewards) plot_alter_rewards.append(sum_alter_rewards) plot_iteration.append(iteration) #ep_reward.append(sum(rewards)) # print("Sample done in one traj.") energyPolicy_training_data_for_this_episode = [] energyPolicy_training_data_for_this_episode.append(" ") energyPolicy_training_data_for_this_episode.append( "Trajectory: " + str(iteration)) energyPolicy_training_data_for_this_episode.append( "episode_len: " + str(episode_length)) energyPolicy_training_data_for_this_episode.append( "True rewards: " + str(sum_rewards)) energyPolicy_training_data_for_this_episode.append( "alter_rewards: " + str(sum_alter_rewards)) open_file_and_save( args.logdir + '/' + args.model + "_iter_" + args.noise_type + '_Policy' + date, energyPolicy_training_data_for_this_episode) print() print("Trajectory", iteration, ":") print("episode_len: ", episode_length) print("rewards: ", sum(rewards)) print("alter_rewards: ", sum(alter_rewards)) # gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = PPO.get_gaes(rewards=alter_rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) alter_rewards = np.array(alter_rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, alter_rewards, v_preds_next] # inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) # writer.add_summary(summary, iteration) # writer.close() #开始画图 plt.title('Noise:' + str(args.sanoise)) plt.plot(plot_iteration, plot_rewards, color='red', label='True_rewards') plt.plot(plot_iteration, plot_alter_rewards, color='green', label='alter_rewards') plt.legend() #显示图例 plt.xlabel('Episodes') plt.ylabel('Rewards') plt.show()
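The if/elif chain that maps args.reward_function to a transform of the energy value can be expressed as a table of callables. A sketch with the same four string keys used above:

import numpy as np

REWARD_FUNCTIONS = {
    '-energy':        lambda e: -e,
    '-energy+1':      lambda e: -e + 1.0,
    'exp(-energy-1)': lambda e: np.exp(-e - 1.0),
    'exp(-energy)':   lambda e: np.exp(-e),
}

def alter_reward_from_energy(energy, name):
    try:
        return REWARD_FUNCTIONS[name](energy)
    except KeyError:
        raise ValueError("No such reward_function: %s" % name)

print(alter_reward_from_energy(0.5, 'exp(-energy)'))  # ~0.6065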
def main(args): writer = SummaryWriter(args.logdir) logger = ResultLogger(writer) env = Environment() # 自定义环境 ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, args=args, logger=logger) saver = tf.train.Saver() if args.continue_train: tf.reset_default_graph() tf.train.import_meta_graph(args.continue_meta) with tf.Session() as sess: if args.continue_train: saver.restore(sess, args.continue_modeldir) sess.run(tf.global_variables_initializer()) reward = 0 winnum = 0 drawnum = 0 for episode in range(args.episode): observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 total_reward = 0 obs = env.reset() while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, sparse_rew, done, info = env.step(act) if reward < -1000: reward = -10 reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0, run_policy_steps) # if episode==1: # print(reward) obs = next_obs if done: total_reward = sum(rewards) total_reward /= run_policy_steps total_reward += reward v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value reward = -1 if info == 3: winnum += 1 if info == 2: drawnum += 1 break if episode % 100 == 0: winnum = 0 drawnum = 0 logger.log_result(total_reward, winnum, drawnum, episode) print(episode, total_reward) if episode % 1000 == 0: saver.save(sess, args.savedir + '/model.ckpt') #### ## GAE #### gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # 把list 转成 适应于tf.placeholder 的numpy array observations = np.reshape(observations, newshape=(-1, ob_space)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) logger.log_gaes(gaes.mean(), episode) PPO.log_parameter(observations, actions, gaes, rewards, v_preds_next) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(2): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4])
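The hard floor above (any reward below -1000 becomes -10) is a crude one-sided form of reward clipping. For comparison, a symmetric np.clip sketch; this is not what utils.get_curriculum_reward does, and that helper is not shown here.

import numpy as np

raw = np.array([-5000.0, -3.0, 0.5, 12.0])
print(np.clip(raw, -10.0, 10.0))  # [-10.  -3.   0.5 10.]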
def main(args): env = Environment() batch_size = args.batchsize writer = SummaryWriter(args.logdir) logger = ResultLogger(writer) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, logger=logger, args=args) D = Discriminator(env, batch_size, logger=logger, args=args) expert_ds = pd.read_csv(args.expertdir) expert_observations = expert_ds[ utils.observation_field].as_matrix() # 筛选obs特征 expert_actions = utils.merge_to_one_action( expert_ds[utils.action_field].as_matrix()) # 映射action空间,与具体环境相关,这里省略 saver = tf.train.Saver() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 # do NOT use rewards to update policy for episode in range(args.episode): observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 while True: run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) rewards.append(reward) v_preds.append(v_pred) next_obs, reward, sparse_rew, done, info = env.step(act) reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0, run_policy_steps) if done: total_reward = sum(rewards) total_reward /= run_policy_steps total_reward += reward print("[episode]: ", episode) print('[Policy Reward]: ', total_reward) v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = 0 break else: obs = next_obs if episode % 100 == 0: winnum = 0 drawnum = 0 logger.log_result(total_reward, winnum, drawnum, episode) if episode % 1000 == 0: saver.save(sess, args.savedir + '/model.ckpt') observations = np.reshape(observations, newshape=(-1, ob_space)) actions = np.array(actions).astype(dtype=np.int32) # 训练 Discriminator d_rewards = train_discriminator(expert_observations, expert_actions, observations, actions, D, batch_size, episode, logger) # 训练 PPO train_PPO(PPO, observations, actions, d_rewards, v_preds, v_preds_next, batch_size, episode, logger)
def main(args):
    env = gym.make('CartPole-v0')
    BCPolicy = Policy_net('bcpolicy', env)
    BC = BehavioralCloning(BCPolicy)
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    # Instantiate a Saver; during training, periodically call saver.save to write a
    # checkpoint containing all trainable variables of the current model,
    # e.g. saver.save(sess, FLAGS.train_dir, global_step=step).
    saver = tf.train.Saver(max_to_keep=args.max_to_keep)
    exp_len = 200  # length of the expert trajectory slice used below
    exp_obs = np.genfromtxt('trajectory/observations.csv')[0:exp_len]
    exp_acts = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)[0:exp_len]
    with tf.Session() as sess:
        # FileWriter stores the graph under args.logdir; its add_summary() method
        # appends training statistics to the same event file.
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        inp = [exp_obs, exp_acts]  # inp[0] is observations, inp[1] is actions
        for iteration in range(args.iteration):  # episode
            # train
            for epoch in range(args.epoch_num):
                # select sample indices in [low, high); np.random.randint draws integers from [low, high)
                sample_indices = np.random.randint(low=0, high=exp_obs.shape[0],
                                                   size=args.minibatch_size)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # sample training data
                BC.train(obs=sampled_inp[0], actions=sampled_inp[1])
            bc_summary = BC.get_summary(obs=inp[0], actions=inp[1])
            if (iteration + 1) % args.interval == 0:
                saver.save(sess, args.savedir + '/model.ckpt', global_step=iteration + 1)
            writer.add_summary(bc_summary, iteration)
        print("Done with BC. Start RL")

        # Start RL
        obs = env.reset()
        ob_space = env.observation_space
        reward = 0
        alter_reward = 0
        success_num = 0
        render = False
        ep_reward = []
        for iteration in range(5 * args.iteration):
            print("iter:{}".format(iteration))
            observations = []
            actions = []
            v_preds = []
            rewards = []
            alter_rewards = []
            episode_length = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                episode_length += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                alter_rewards.append(alter_reward)
                rewards.append(reward)
                next_obs, reward, done, info = env.step(act)
                alter_reward = np.log(1 / (kl_divergence(obs, BCPolicy, Policy) + 0.00001))
                # alter_reward = -kl_divergence(obs, BCPolicy, Policy)
                # alter_reward = kl_divergence(obs, BCPolicy, Policy)
                # print(alter_reward)
                if render:
                    # env.render()
                    pass
                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminate state has 0 state value
                    obs = env.reset()
                    reward = -1
                    alter_reward = -1
                    print("episode_len: ", episode_length)
                    break
                else:
                    obs = next_obs
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                                  simple_value=episode_length)]),
                               iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                                  simple_value=sum(rewards))]),
                               iteration)
            if sum(rewards) >= 195:
                success_num += 1
                render = True
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0
            ep_reward.append(sum(rewards))
            print("rewards: ", sum(rewards))
            print("alter_rewards: ", sum(alter_rewards))
            print("Sample done in one traj.")
            gaes = PPO.get_gaes(rewards=alter_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()
            rewards = np.array(rewards).astype(dtype=np.float32)
            alter_rewards = np.array(alter_rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            PPO.assign_policy_parameters()
            inp = [observations, actions, gaes, alter_rewards, v_preds_next]
            print("Begin Training")
            # train
            for epoch in range(6):
                # sample indices from [low, high)
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2],
                          rewards=sampled_inp[3], v_preds_next=sampled_inp[4])
            # summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2],
            #                           rewards=inp[3], v_preds_next=inp[4])
            # writer.add_summary(summary, iteration)
        writer.close()
    plt.plot(ep_reward)
def main(args): env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space #This is the environment for the gym to observe Policy = Policy_net( 'policy', env) #take the environments #this is normal policy class Old_Policy = Policy_net('old_policy', env) #this is for the old policy PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) #this is for training saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer() ) #Here all the variabls get initialized obs = env.reset( ) # [position of cart, velocity of cart, angle of pole, rotation rate of pole] Initial observation reward = 0 success_num = 0 for iteration in range(args.iteration): observations = [] #to store observations actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length #Starting to run the episode_length += 1 #episode length is something dynamic obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act( obs=obs, stochastic=True ) #get the action and value prediction (actor and critic network output) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step( act) #get the observation from the environments #The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center. That is done if done: #This is a termination stage #this has all the next state eliements of the episode inputs v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value #after the terminal stage there shouldn;t be a value function obs = env.reset() reward = -1 break else: #here your break the episode obs = next_obs #if the system do not get terminated it will run for ever #After a one episode get terminated writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]) #From this we can learn how long the episode went , iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]) # , iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, args.savedir + '/model.ckpt') print( 'Clear!! Model saved.' 
) #this is like after this much sucessfull attempts we are confident about the model break else: success_num = 0 gaes = PPO.get_gaes( rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) #this is the advantage function # convert list to numpy array for feeding tf.placeholder observations = np.reshape( observations, newshape=[-1] + list(ob_space.shape)) #observations from the current policy actions = np.array(actions).astype( dtype=np.int32) #actions taken from current policy gaes = np.array(gaes).astype( dtype=np.float32) #generalized advantage enstimation gaes = (gaes - gaes.mean()) / gaes.std() #Normalize it rewards = np.array(rewards).astype( dtype=np.float32) #Extracted rewrds v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters( ) #before updating the new policy we assign current policy parameters to old policy inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): #starting the optimization # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data Randomly take one sample from the training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
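PPOTrain.train is called throughout these snippets, but its loss is not shown. A NumPy sketch of the clipped surrogate objective that PPO is built around; the real loss almost certainly adds value-function and entropy terms on top of this policy part.

import numpy as np

def clipped_surrogate_sketch(ratios, gaes, clip_value=0.2):
    """Mean over min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t),
    where r_t = pi(a_t|s_t) / pi_old(a_t|s_t) and A_t is the advantage estimate."""
    ratios = np.asarray(ratios, dtype=np.float32)
    gaes = np.asarray(gaes, dtype=np.float32)
    clipped = np.clip(ratios, 1.0 - clip_value, 1.0 + clip_value)
    return np.mean(np.minimum(ratios * gaes, clipped * gaes))

# ratios far from 1 are clipped, so large policy updates stop earning extra objective
print(clipped_surrogate_sketch(ratios=[0.7, 1.0, 1.6], gaes=[1.0, -0.5, 2.0]))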
def main(args): # prepare log dir if not os.path.exists(args.logdir): os.makedirs(args.logdir) if not os.path.exists(args.savedir): os.makedirs(args.savedir) # gym環境作成 env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space # policy net Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) # ppo学習インスタンス PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # discriminator D = Discriminator(env) # エキスパートtrajectory読み込み expert_observations = np.genfromtxt('trajectory/observations.csv') expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32) # tensorflow saver saver = tf.train.Saver() # session config config = tf.ConfigProto( gpu_options=tf.GPUOptions( visible_device_list=args.gpu_num, allow_growth=True )) # start session with tf.Session(config=config) as sess: # summary writer writer = tf.summary.FileWriter(args.logdir, sess.graph) # Sessionの初期化 sess.run(tf.global_variables_initializer()) # 状態の初期化 obs = env.reset() success_num = 0 # episode loop for iteration in tqdm(range(args.iteration)): # buffer observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 # run episode while True: run_policy_steps += 1 # ネットワーク入力用にobsを変換 obs = np.stack([obs]).astype(dtype=np.float32) # 行動と価値を推定 act, v_pred = Policy.act(obs=obs, stochastic=True) # 要素数が1の配列をスカラーに変換 act = np.asscalar(act) v_pred = np.asscalar(v_pred) # policy netの推定行動で状態の更新 next_obs, reward, done, info = env.step(act) # episodeの各変数を追加 observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) # episode終了判定 if done: v_preds_next = v_preds[1:] + [0] obs = env.reset() reward = -1 break else: obs = next_obs # summary追加 writer.add_summary( tf.Summary(value=[tf.Summary.Value( tag='episode_length', simple_value=run_policy_steps)]), iteration) writer.add_summary( tf.Summary(value=[tf.Summary.Value( tag='episode_reward', simple_value=sum(rewards))]), iteration) # episode成功判定 if sum(rewards) >= 195: success_num += 1 # 連続で100回成功していればepisode loopを終了 if success_num >= 100: saver.save(sess, args.savedir+'/model.ckpt') print('Clear!! 
Model saved.') break else: success_num = 0 # policy netによるtrajectryをプレースホルダー用に変換 observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) ########################### # GAILの変更点はここだけ # discriminatorでエキスパートの報酬に近づける # discriminator学習 2回 for i in range(2): D.train(expert_s=expert_observations, expert_a=expert_actions, agent_s=observations, agent_a=actions) # get d_rewards from discrminator d_rewards = D.get_rewards(agent_s=observations, agent_a=actions) # transform d_rewards to numpy for placeholder d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) ########################### # get generalized advantage estimator gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # ppo input data whose rewards is discriminator rewards inp = [observations, actions, gaes, d_rewards, v_preds_next] # assign parameters to old policy PPO.assign_policy_parameters() # train PPO for epoch in range(6): # sample index sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32) # sampling from input data sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # run ppo PPO.train( obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) # get summary summary = PPO.get_summary( obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) # add summary writer.add_summary(summary, iteration) writer.close()
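Every training loop here samples minibatches with np.random.randint plus np.take, i.e. with replacement. A small helper (hypothetical utility, not part of the original code) that reproduces the pattern and shows the without-replacement alternative:

import numpy as np

def sample_minibatch(arrays, batch_size=32, replace=True):
    """Sample the same row indices from parallel arrays sharing axis 0."""
    n = arrays[0].shape[0]
    if replace:
        idx = np.random.randint(low=0, high=n, size=batch_size)
    else:
        idx = np.random.choice(n, size=min(batch_size, n), replace=False)
    return [np.take(a, idx, axis=0) for a in arrays]

obs = np.arange(10).reshape(10, 1).astype(np.float32)   # toy observations
acts = np.arange(10).astype(np.int32)                    # toy actions
sampled_obs, sampled_acts = sample_minibatch([obs, acts], batch_size=4)
print(sampled_obs.ravel(), sampled_acts)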
def main(args): #init directories if not os.path.isdir(args.logdir): os.mkdir(args.logdir) if not os.path.isdir(args.logdir + '/' + args.env): os.mkdir(args.logdir + '/' + args.env) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer) args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer if not os.path.isdir(args.savedir): os.mkdir(args.savedir) if not os.path.isdir(args.savedir + '/' + args.env): os.mkdir(args.savedir + '/' + args.env) if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer) args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer #init classes env = gym.make(args.env) env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env, args.env) Old_Policy = Policy_net('old_policy', env, args.env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer, _lr=args.lr) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if iteration == (args.iteration - 1): saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] print('iteration:', iteration, ',rewards:', sum(rewards)) # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 26  # 26 43 53 32 41
    env = Environment({'scene_name': scene_scope, 'terminal_state_id': int(task_scope)})

    S_Class = SIAMESE()  # create the siamese network object
    Policy = Policy_net('policy', S_Class)  # build the actor-critic graph, passing the siamese embedding
    Old_Policy = Policy_net('old_policy', S_Class)  # same network for the old policy
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update graph
    D = Discriminator(S_Class)  # GAIL discriminator
    '''
    batch_n = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese')
    '''

    # load expert states/targets/actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')  # load expert demonstrations
    expert_targets = np.genfromtxt('trajectory/targets.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    expert_observations = np.reshape(expert_observations, newshape=[-1, 2048, 4])
    expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4])

    saver = tf.train.Saver()  # assign another saver if you want to use BC weights
    if args.restore:  # a separate saver is only needed to restore parameters from the BC-trained model
        saver2 = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy') +
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese'))

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())  # both old and new policy nets get initialized here
        if args.restore:
            if args.model == '':
                saver2.restore(sess, args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt')
                print("Model Restored")
            else:
                saver.restore(sess, args.modeldir + '/' + args.alg + '/' + 'model.ckpt-' + args.model)

        success_num = 0  # counts how many times the agent reached the terminal state
        # var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        for iteration in range(100000):  # args.iteration  # adversarial training starts here
            print("Starting the iteration ---------------------------------------------------- :", iteration)
            observations = []
            actions = []
            # rewards = []
            targets = []  # for the GAIL discriminator
            v_preds = []
            run_policy_steps = 0

            while True:  # sample trajectories from the (still untrained) agent
                run_policy_steps += 1
                obs = np.stack([env.s_t]).astype(dtype=np.float32)  # current observation, shaped to feed Policy.obs
                target = np.stack([env.s_target]).astype(dtype=np.float32)  # ensure the input is [batch_size, 2048, 4]

                act, v_pred, prob = Policy.act(state=obs, target=target, stochastic=True)  # agent's action and value
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)  # save the observations
                targets.append(target)
                actions.append(act)  # save the actions
                v_preds.append(v_pred)

                # next_obs, reward, done, info = env.step(act)  # next observation and reward for the chosen action
                next_obs, is_terminal, is_collided = env.step(act)

                if is_terminal:
                    success_num += 1
                    print("Congrats, the agent just reached the terminal state:", env.terminal_state_id)
                if is_collided:
                    print("Bad luck, the agent collided and could not reach the terminal state:", env.terminal_state_id)

                if is_terminal or is_collided or (run_policy_steps == 100):  # run one episode until termination
                    print("Number of steps explored by the agent:", run_policy_steps)
                    v_preds_next = v_preds[1:] + [0]  # the terminal state's successor has 0 value; used to update the value net
                    print("Environment is resetting after the collision/terminal state")
                    obs = env.reset()
                    # reward = -1
                    break  # this break clears the observation/action lists for the next episode
                # print(sum(rewards))

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if success_num >= 5000:
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break
            # else:
            #     success_num = 0

            # convert lists to numpy arrays for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1, 2048, 4])
            targets = np.reshape(targets, newshape=[-1, 2048, 4])
            actions = np.array(actions).astype(dtype=np.int32)

            # train the discriminator on minibatches of expert and agent pairs
            Dis_input = [expert_observations, expert_targets, expert_actions, observations, targets, actions]
            High = min(observations.shape[0], expert_observations.shape[0])
            for i in range(100):
                sample_indices = np.random.randint(low=0, high=High, size=32)
                sampled_inp_D = [np.take(a=a, indices=sample_indices, axis=0) for a in Dis_input]
                D.train(expert_s=sampled_inp_D[0], expert_t=sampled_inp_D[1], expert_a=sampled_inp_D[2],
                        agent_s=sampled_inp_D[3], agent_t=sampled_inp_D[4], agent_a=sampled_inp_D[5])
            '''
            D.train(expert_s=expert_observations, expert_t=expert_targets, expert_a=expert_actions,
                    agent_s=observations, agent_t=targets, agent_a=actions)
            '''

            # to get per-step rewards we could also use an RNN and read each time step's output
            d_rewards = D.get_rewards(agent_s=observations, agent_t=targets, agent_a=actions)  # how closely the agent matches the expert
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)  # reward for each state-action pair

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)  # advantage estimates for PPO
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)  # next-state value estimates

            # train the policy
            inp = [observations, targets, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()  # copy the current policy weights into the old-policy net
            for epoch in range(100):  # train the actor-critic on the collected rollouts and the trained discriminator
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                PPO.train(state=sampled_inp[0], targets=sampled_inp[1], actions=sampled_inp[2],
                          gaes=sampled_inp[3], rewards=sampled_inp[4], v_preds_next=sampled_inp[5])

            summary = PPO.get_summary(obs=inp[0], target=inp[1], actions=inp[2],
                                      gaes=inp[3], rewards=inp[4], v_preds_next=inp[5])
            writer.add_summary(summary, iteration)
        writer.close()
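# PPO.get_gaes is defined elsewhere in the repository; the sketch below is the
# standard generalized advantage estimation (GAE) recursion it presumably
# implements: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
# gae_t = delta_t + gamma * lam * gae_{t+1}. The gamma/lam defaults are assumptions.
def get_gaes_sketch(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD residuals
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    # discounted, lambda-weighted sum of residuals, accumulated backwards in time
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes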
def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    # Policy = Policy_net('policy', env)
    # Old_Policy = Policy_net('old_policy', env)
    Policy = Policy_net_quantum('policy', env, 32)
    Old_Policy = Policy_net_quantum('old_policy', env, 32)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use rewards to update policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            print("Iteration: " + str(iteration))
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations, expert_a=expert_actions,
                        agent_s=observations, agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            # inp = [observations, actions, gaes, d_rewards, v_preds_next]
            """PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2],
                          rewards=sampled_inp[3], v_preds_next=sampled_inp[4])
            summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4])"""
            # writer.add_summary(summary, iteration)
        writer.close()
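# PPOTrain.train is defined elsewhere; as a rough reference, PPO maximizes the
# clipped surrogate objective L = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)],
# where r_t is the new/old policy probability ratio and A_t the advantage (GAE).
# The numpy sketch below evaluates that objective for one minibatch; clip_value=0.2
# is an assumed default, not a value read from this repository.
import numpy as np

def clipped_surrogate_sketch(new_probs, old_probs, gaes, clip_value=0.2):
    # probability ratio of the taken actions under the new vs. old policy
    ratios = np.exp(np.log(new_probs + 1e-10) - np.log(old_probs + 1e-10))
    clipped = np.clip(ratios, 1.0 - clip_value, 1.0 + clip_value)
    # pessimistic (clipped) policy objective, averaged over the minibatch
    return np.mean(np.minimum(ratios * gaes, clipped * gaes))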
def main(args):
    env = myTGym(episode_type='0', percent_goal_profit=1, percent_stop_loss=1)
    obs = env.reset()
    action_space = np.array([0, 1])

    Policy = Policy_net('policy', env, action_space)
    Old_Policy = Policy_net('old_policy', env, action_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # expert_observations = np.genfromtxt('trajectory/expert_obs/000430.csv', delimiter=',', invalid_raise=False)
    # expert_actions = np.genfromtxt('trajectory/action_list/actions0-000430-20180503.csv', dtype=np.int32)
    expert_observations = pd.read_csv('trajectory/expert_obs/000520.csv', index_col=0)
    expert_actions = pd.read_csv('trajectory/expert_actions/action000520.csv', index_col=0)
    # print('expert_action: ', expert_actions.shape)
    expert_actions = expert_actions.replace(2, 0)['0']

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs  # [1, 111]
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, done, info = env.step(act)
                # print(iteration, ' reward: ', reward)

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            if iteration % 10 == 0:
                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(obs.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations, expert_a=expert_actions,
                        agent_s=observations, agent_a=actions)

            # output of this discriminator is the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2],
                          rewards=sampled_inp[3], v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2],
                                      rewards=inp[3], v_preds_next=inp[4])
            writer.add_summary(summary, iteration)
        writer.close()
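# Discriminator.get_rewards is also defined elsewhere in the repository. In GAIL
# (Ho & Ermon, 2016) a common choice of surrogate reward for the policy is
# -log(1 - D(s, a)), where D(s, a) is the discriminator's probability that the
# pair came from the expert; the exact form depends on how D.train labels expert
# versus agent pairs, so treat the sketch below as an assumption, not this repo's API.
import numpy as np

def gail_reward_sketch(d_agent_probs, eps=1e-8):
    # higher reward when the discriminator mistakes agent pairs for expert pairs
    return -np.log(np.clip(1.0 - d_agent_probs, eps, 1.0))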
def main(args):
    # init directories
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    if not os.path.isdir(args.logdir + '/' + args.env):
        os.mkdir(args.logdir + '/' + args.env)
    if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer):
        os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer)
    if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr)):
        os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr))
    args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr)
    if not os.path.isdir(args.savedir):
        os.mkdir(args.savedir)
    if not os.path.isdir(args.savedir + '/' + args.env):
        os.mkdir(args.savedir + '/' + args.env)
    if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer):
        os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer)
    args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer

    # init classes
    env = gym.make(args.env)
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env, args.env)
    Old_Policy = Policy_net('old_policy', env, args.env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer, _lr=args.lr)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            episode_length = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                episode_length += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]), iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if iteration == (args.iteration - 1):
                saver.save(sess, args.savedir + '/model' + str(args.lr) + '.ckpt')
                print('Clear!! Model saved.')
                break

            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]
            print('iteration:', iteration, ', rewards:', sum(rewards))

            # train
            for epoch in range(6):
                # sample indices from [low, high)
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2],
                          rewards=sampled_inp[3], v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2],
                                      rewards=inp[3], v_preds_next=inp[4])
            writer.add_summary(summary, iteration)
        writer.close()
def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    saver = tf.train.Saver()

    tl = args.train_level
    if tl == 'expert':
        threshold = 195
        savedir = 'trained_models/ppo/expert'
        logdir = 'log/train/ppo/expert/'
    elif tl == 'med':
        threshold = 100
        savedir = 'trained_models/ppo/med'
        logdir = 'log/train/ppo/med/'
    else:
        print("[run_ppo.py] Error: Unrecognized train level: {}".format(tl))
        exit(1)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            episode_length = 0
            while True:  # run policy RUN_POLICY_STEPS which is much less than episode length
                episode_length += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]), iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= threshold:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            print("Iteration: {}, Rewards: {}".format(iteration, sum(rewards)), end='\r')

            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=(-1,) + ob_space.shape)
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            # train
            for epoch in range(6):
                # sample indices from [low, high)
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2],
                          rewards=sampled_inp[3], v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2],
                                      rewards=inp[3], v_preds_next=inp[4])
            writer.add_summary(summary, iteration)
        writer.close()
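# PPO.assign_policy_parameters is not shown in this file. In TF1-style PPO
# implementations it typically copies every trainable variable in the 'policy'
# scope onto the matching variable in the 'old_policy' scope before the update
# epochs. A minimal sketch, assuming those two scope names and identically built
# networks (variables pair up by creation order):
import tensorflow as tf

def build_assign_ops(new_scope='policy', old_scope='old_policy'):
    new_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=new_scope)
    old_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=old_scope)
    # one assign op per variable pair: old_policy <- policy
    return [tf.assign(old, new) for old, new in zip(old_vars, new_vars)]

# Running sess.run(build_assign_ops()) once per iteration would keep the
# old-policy snapshot in sync before PPO.train computes its probability ratios.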