def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 37  # other candidate targets: 26 43 53 32 41
    env = Environment({'scene_name': scene_scope,
                       'terminal_state_id': int(task_scope)})
    env.reset()
    Policy = Policy_net('policy', env)          # build the actor-critic graph / object
    Old_Policy = Policy_net('old_policy', env)  # old policy used by the PPO update
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update object for the graph
    D = Discriminator(env)                      # discriminator of the GAN-like objective
def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    # Policy = Policy_net('policy', env)
    # Old_Policy = Policy_net('old_policy', env)
    Policy = Policy_net_quantum('policy', env, 32)
    Old_Policy = Policy_net_quantum('old_policy', env, 32)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use rewards to update policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            print("Iteration: " + str(iteration))

            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                   simple_value=run_policy_steps)]),
                iteration)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                   simple_value=sum(rewards))]),
                iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy (the PPO update is disabled in this variant)
            # inp = [observations, actions, gaes, d_rewards, v_preds_next]
            """PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])"""
            # writer.add_summary(summary, iteration)

        writer.close()
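# Note: PPO.get_gaes is called throughout these scripts but its body is not shown here.
# The function below is a hypothetical stand-in, assuming it implements the standard
# GAE(gamma, lambda) recursion; the lambda value is an assumption, not taken from the source.
def get_gaes_sketch(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    """Generalized advantage estimation:
    delta[t] = r[t] + gamma * V(s[t+1]) - V(s[t])
    gae[t]   = delta[t] + gamma * lam * gae[t+1]
    """
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):  # accumulate discounted deltas from the episode end
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes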
def main(args):
    # init directories
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    if not os.path.isdir(args.logdir + '/' + args.env):
        os.mkdir(args.logdir + '/' + args.env)
    if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer):
        os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer)
    args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer

    if not os.path.isdir(args.savedir):
        os.mkdir(args.savedir)
    if not os.path.isdir(args.savedir + '/' + args.env):
        os.mkdir(args.savedir + '/' + args.env)
    if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer):
        os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer)
    args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer

    args.tradir = args.tradir + '/' + args.env + '/' + args.optimizer

    # init classes
    env = gym.make(args.env)
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env, args.env)
    Old_Policy = Policy_net('old_policy', env, args.env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer)
    D = Discriminator(env, args.env, _optimizer=args.optimizer)
    expert_observations = np.genfromtxt(args.tradir + '/observations.csv')
    expert_actions = np.genfromtxt(args.tradir + '/actions.csv', dtype=np.int32)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, done, info = env.step(act)

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                   simple_value=run_policy_steps)]),
                iteration)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                   simple_value=sum(rewards))]),
                iteration)

            print('iteration:', iteration, ',rewards:', sum(rewards))

            if iteration == (args.iteration - 1):
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
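# Note: the Discriminator class used above (D.train / D.get_rewards) is defined elsewhere in
# these repositories. The class below is a hypothetical minimal sketch in the same TF1 style,
# assuming the common GAIL choice of surrogate reward -log(1 - D(s, a)); the layer sizes,
# learning rate and clipping constant are assumptions, not taken from the source.
import tensorflow as tf

class DiscriminatorSketch:
    def __init__(self, ob_dim, n_actions, hidden=64):
        with tf.variable_scope('discriminator_sketch'):
            self.obs = tf.placeholder(tf.float32, [None, ob_dim])
            self.acts = tf.placeholder(tf.int32, [None])
            self.labels = tf.placeholder(tf.float32, [None, 1])  # 1 = expert pair, 0 = agent pair

            acts_onehot = tf.one_hot(self.acts, depth=n_actions)
            x = tf.concat([self.obs, acts_onehot], axis=1)
            h = tf.layers.dense(x, hidden, activation=tf.nn.tanh)
            logits = tf.layers.dense(h, 1)

            self.prob = tf.nn.sigmoid(logits)  # D(s, a): probability the pair comes from the expert
            loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=logits))
            self.train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
            # surrogate reward handed to PPO in place of the environment reward
            self.rewards = -tf.log(tf.clip_by_value(1.0 - self.prob, 1e-10, 1.0))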
def main(args):
    env = myTGym(episode_type='0', percent_goal_profit=1, percent_stop_loss=1)
    obs = env.reset()
    action_space = np.array([0, 1])

    Policy = Policy_net('policy', env, action_space)
    Old_Policy = Policy_net('old_policy', env, action_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # expert_observations = np.genfromtxt('trajectory/expert_obs/000430.csv', delimiter=',', invalid_raise=False)
    # expert_actions = np.genfromtxt('trajectory/action_list/actions0-000430-20180503.csv', dtype=np.int32)
    expert_observations = pd.read_csv('trajectory/expert_obs/000520.csv', index_col=0)
    expert_actions = pd.read_csv('trajectory/expert_actions/action000520.csv', index_col=0)
    # print('expert_action: ', expert_actions.shape)
    expert_actions = expert_actions.replace(2, 0)['0']

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs, shape [1, 111]
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, done, info = env.step(act)
                # print(iteration, ' reward: ', reward)

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            if iteration % 10 == 0:
                writer.add_summary(
                    tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                       simple_value=run_policy_steps)]),
                    iteration)
                writer.add_summary(
                    tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                       simple_value=sum(rewards))]),
                    iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(obs.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 26  # other candidate targets: 26 43 53 32 41
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    S_Class = SIAMESE()                              # create a Siamese network object
    Policy = Policy_net('policy', S_Class)           # build the actor-critic graph / object
    Old_Policy = Policy_net('old_policy', S_Class)   # same thing for the old policy used by PPO
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update object for the graph
    D = Discriminator(S_Class)                       # discriminator of the GAN-like objective
    '''
    batch_n = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese')
    '''

    # load expert data: states / targets / actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')  # load expert demonstrations
    expert_targets = np.genfromtxt('trajectory/targets.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    expert_observations = np.reshape(expert_observations, newshape=[-1, 2048, 4])
    expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4])

    saver = tf.train.Saver()  # assign another saver if you want to use BC weights

    if args.restore:
        # a separate saver is needed only for assigning parameters from the BC-trained model
        saver2 = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy') +
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese'))

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())  # both old-policy and new-policy variables get initialized here

        if args.restore:
            if args.model == '':
                saver2.restore(sess, args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt')
                print("Model Restored")
            else:
                saver.restore(sess, args.modeldir + '/' + args.alg + '/' + 'model.ckpt-' + args.model)

        success_num = 0  # used to check whether the agent reached the terminal point
        # var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        for iteration in range(100000):  # args.iteration  # here the adversarial training starts
            print("Starting iteration ---------------------------------------------------- :", iteration)
            observations = []
            actions = []
            # rewards = []
            targets = []  # for the GAIL discriminator
            v_preds = []
            run_policy_steps = 0

            while True:  # sample a trajectory from the (still untrained) agent
                run_policy_steps += 1
                obs = np.stack([env.s_t]).astype(dtype=np.float32)           # prepare to feed placeholder Policy.obs (current observation)
                target = np.stack([env.s_target]).astype(dtype=np.float32)   # make sure the input is [batch_size, 2048, 4]

                act, v_pred, prob = Policy.act(state=obs, target=target, stochastic=True)  # agent's action and value
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)   # save the set of observations
                targets.append(target)
                actions.append(act)        # save the set of actions
                v_preds.append(v_pred)

                # next_obs, reward, done, info = env.step(act)  # get the next observation and reward for the action
                next_obs, is_terminal, is_collided = env.step(act)

                if is_terminal:
                    success_num = success_num + 1
                    print("Congrats, you just reached the terminal state, which is:", env.terminal_state_id)
                if is_collided:
                    print("Bad luck, your agent collided and couldn't make it to the terminal state, which is:", env.terminal_state_id)

                if is_terminal or is_collided or (run_policy_steps == 100):  # run one episode until termination
                    print("Number of exploration steps by the agent:", run_policy_steps)
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value; used to update the value net
                    print("Environment is resetting after the collision/terminal state")
                    obs = env.reset()
                    # reward = -1
                    break  # with this break all observation, action and other lists get re-initialized
            # print(sum(rewards))

            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                   simple_value=run_policy_steps)]),
                iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]),
            #                    iteration)

            if success_num >= 5000:
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break
            # else:
            #     success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1, 2048, 4])  # collect observations
            targets = np.reshape(targets, newshape=[-1, 2048, 4])
            actions = np.array(actions).astype(dtype=np.int32)               # collect the actions

            # train discriminator -- here comes the discriminator
            Dis_input = [
                expert_observations, expert_targets, expert_actions,
                observations, targets, actions
            ]

            if observations.shape[0] < expert_observations.shape[0]:
                High = observations.shape[0]
            else:
                High = expert_observations.shape[0]

            for i in range(100):
                sample_indices = np.random.randint(low=0, high=High, size=32)
                sampled_inp_D = [np.take(a=a, indices=sample_indices, axis=0)
                                 for a in Dis_input]
                D.train(expert_s=sampled_inp_D[0],
                        expert_t=sampled_inp_D[1],
                        expert_a=sampled_inp_D[2],
                        agent_s=sampled_inp_D[3],
                        agent_t=sampled_inp_D[4],
                        agent_a=sampled_inp_D[5])
            '''
            D.train(expert_s=expert_observations,
                    expert_t=expert_targets,
                    expert_a=expert_actions,
                    agent_s=observations,
                    agent_t=targets,
                    agent_a=actions)
            '''

            # to get rewards we could also use an RNN and collect its per-step outputs as the reward signal
            d_rewards = D.get_rewards(agent_s=observations, agent_t=targets, agent_a=actions)  # how well the agent performed with respect to the expert
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)          # reward for each state-action pair

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)  # calculate the advantage function used by PPO
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)  # value of the next state

            # train policy
            inp = [observations, targets, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()  # copy the current policy weights into the old-policy net
            for epoch in range(100):  # train the agent (actor-critic) on the collected rollouts and the trained discriminator
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # minibatch for training the policy network
                PPO.train(state=sampled_inp[0],
                          targets=sampled_inp[1],
                          actions=sampled_inp[2],
                          gaes=sampled_inp[3],
                          rewards=sampled_inp[4],
                          v_preds_next=sampled_inp[5])

            summary = PPO.get_summary(obs=inp[0],
                                      target=inp[1],
                                      actions=inp[2],
                                      gaes=inp[3],
                                      rewards=inp[4],
                                      v_preds_next=inp[5])

            writer.add_summary(summary, iteration)
        writer.close()
def main(args):
    env = Environment()
    batch_size = args.batchsize
    writer = SummaryWriter(args.logdir)
    logger = ResultLogger(writer)
    ob_space = env.observation_space

    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, logger=logger, args=args)
    D = Discriminator(env, batch_size, logger=logger, args=args)

    expert_ds = pd.read_csv(args.expertdir)
    expert_observations = expert_ds[utils.observation_field].as_matrix()  # select the observation features
    expert_actions = utils.merge_to_one_action(
        expert_ds[utils.action_field].as_matrix())  # map to the action space; environment-specific, omitted here

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy

        for episode in range(args.episode):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, sparse_rew, done, info = env.step(act)
                reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0, run_policy_steps)

                if done:
                    total_reward = sum(rewards)
                    total_reward /= run_policy_steps
                    total_reward += reward
                    print("[episode]: ", episode)
                    print('[Policy Reward]: ', total_reward)
                    v_preds_next = v_preds[1:] + [0]  # next state of terminal state has 0 state value
                    obs = env.reset()
                    reward = 0
                    break
                else:
                    obs = next_obs

            if episode % 100 == 0:
                winnum = 0
                drawnum = 0
                logger.log_result(total_reward, winnum, drawnum, episode)
            if episode % 1000 == 0:
                saver.save(sess, args.savedir + '/model.ckpt')

            observations = np.reshape(observations, newshape=(-1, ob_space))
            actions = np.array(actions).astype(dtype=np.int32)

            # train the Discriminator
            d_rewards = train_discriminator(expert_observations, expert_actions,
                                            observations, actions, D, batch_size,
                                            episode, logger)

            # train PPO
            train_PPO(PPO, observations, actions, d_rewards, v_preds, v_preds_next,
                      batch_size, episode, logger)
def main(args):
    # prepare log dirs
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    if not os.path.exists(args.savedir):
        os.makedirs(args.savedir)

    # create the gym environment
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space

    # policy nets
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)

    # PPO training instance
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)

    # discriminator
    D = Discriminator(env)

    # load the expert trajectories
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    # tensorflow saver
    saver = tf.train.Saver()

    # session config
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            visible_device_list=args.gpu_num,
            allow_growth=True
        ))

    # start session
    with tf.Session(config=config) as sess:
        # summary writer
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # initialize the session
        sess.run(tf.global_variables_initializer())
        # initialize the state
        obs = env.reset()
        success_num = 0

        # episode loop
        for iteration in tqdm(range(args.iteration)):
            # buffers
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            # run one episode
            while True:
                run_policy_steps += 1
                # convert obs for network input
                obs = np.stack([obs]).astype(dtype=np.float32)
                # estimate the action and value
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                # convert single-element arrays to scalars
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                # step the environment with the action estimated by the policy net
                next_obs, reward, done, info = env.step(act)

                # append the per-step variables for this episode
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                # check whether the episode has finished
                if done:
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            # add summaries
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(
                    tag='episode_length', simple_value=run_policy_steps)]),
                iteration)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(
                    tag='episode_reward', simple_value=sum(rewards))]),
                iteration)

            # check whether the episode was successful
            if sum(rewards) >= 195:
                success_num += 1
                # if it has succeeded 100 times in a row, end the episode loop
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert the policy net's trajectory for the placeholders
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            ###########################
            # this is the only GAIL-specific change:
            # the discriminator shapes the reward toward the expert's behavior
            # train the discriminator twice
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # get d_rewards from the discriminator
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            # transform d_rewards to numpy for the placeholder
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
            ###########################

            # get generalized advantage estimates
            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # PPO input data whose rewards are the discriminator rewards
            inp = [observations, actions, gaes, d_rewards, v_preds_next]

            # assign parameters to the old policy
            PPO.assign_policy_parameters()

            # train PPO
            for epoch in range(6):
                # sample indices
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0], size=32)
                # sample from the input data
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                # run PPO
                PPO.train(
                    obs=sampled_inp[0],
                    actions=sampled_inp[1],
                    gaes=sampled_inp[2],
                    rewards=sampled_inp[3],
                    v_preds_next=sampled_inp[4])

            # get summary
            summary = PPO.get_summary(
                obs=inp[0],
                actions=inp[1],
                gaes=inp[2],
                rewards=inp[3],
                v_preds_next=inp[4])

            # add summary
            writer.add_summary(summary, iteration)
        writer.close()
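# Note: every main(args) / init_train(args) above expects an argparse namespace; the parsers
# themselves are not included in these listings. Below is a minimal hypothetical sketch of the
# wiring, with flag names mirroring the attributes used above; the default values are assumptions.
import argparse

def argparser_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', default='log/train/gail')
    parser.add_argument('--savedir', default='trained_models/gail')
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--iteration', type=int, default=int(1e4))
    parser.add_argument('--gpu_num', default='0')
    return parser.parse_args()

if __name__ == '__main__':
    main(argparser_sketch())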
def init_train(args):
    global writer
    global sess
    global Policy
    global Old_Policy
    global PPO
    global Disc
    global max_iteration
    global iteration
    global observation_space
    global action_space
    global expert_observations
    global expert_actions

    print("###### INITIALIZING ######")

    max_iteration = args.iteration
    iteration = 0

    # PPO
    Policy = Policy_net('policy', observation_space)
    Old_Policy = Policy_net('old_policy', observation_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)

    # GAIL
    Disc = Discriminator(observation_space)

    # read trajectories
    expert_observations = []
    expert_actions = []

    # for data balancing
    cnt_zero_trj = 0
    ZERO_LIMIT = 300  # limit the number of zero-angle (straight-driving) trajectories
    cnt_left_trj = 0
    LEFT_LIMIT = 776
    cnt_right_trj = 0

    # profiles = []  # center_img, left_img, right_img, wheel_angle, acc, break, speed
    for _dir in os.listdir(args.trjdir):
        raw_filename = os.path.join(os.getcwd(), args.trjdir, _dir, 'driving_log.csv')
        with open(raw_filename) as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:  # each row is a list
                if float(row[3]) == 0.0:  # zero angle (go straight)
                    if cnt_zero_trj <= ZERO_LIMIT:
                        cnt_zero_trj += 1
                        expert_observations.append(np.squeeze(image_to_feature(row[0])))
                        expert_actions.append(round(float(row[3]), 2))
                elif float(row[3]) < 0.0:  # negative angle (left turn)
                    if cnt_left_trj <= LEFT_LIMIT:
                        cnt_left_trj += 1
                        expert_observations.append(np.squeeze(image_to_feature(row[0])))
                        expert_actions.append(round(float(row[3]), 2))
                else:  # positive angle (right turn)
                    cnt_right_trj += 1
                    expert_observations.append(np.squeeze(image_to_feature(row[0])))
                    expert_actions.append(round(float(row[3]), 2))

    print("###### READ TRAJECTORY: {} ######".format(len(expert_actions)))
    print("center:{}, left:{}, right:{}".format(cnt_zero_trj, cnt_left_trj, cnt_right_trj))

    # import matplotlib.pyplot as plt
    # plt.hist(expert_actions, bins=20)
    # plt.ylabel('Probability')
    # plt.xlabel('Weight')
    # plt.show()
    # return

    # initialize TensorFlow
    sess = tf.Session()
    writer = tf.summary.FileWriter(args.logdir, sess.graph)
    sess.run(tf.global_variables_initializer())

    if os.path.isfile(args.savedir + '/model.ckpt.meta') == True:
        print("###### LOAD SAVED MODEL !!!!! ######")
        saver = tf.train.Saver()
        saver.restore(sess, args.savedir + '/model.ckpt')

    extract_agent_trajectory()