def __init__(self, config, env, demo_transitions=None):
    # the configuration object is defined in a separate config file
    self.sess = tf.InteractiveSession()
    self.config = config
    self.generate_memory = Memory(capacity=self.config.generate_memory_size,
                                  permanent_data=0)
    self.expert_memory = Memory(capacity=self.config.expert_memory_size,
                                permanent_data=0)
    self.add_data_to_genarte_memory(source=demo_transitions)
    self.add_data_to_expert_memory(source=demo_transitions)
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.n
    self.ob_space = env.observation_space
    self.gamma = 0.95
    self.Policy = Policy_net('policy', env)
    self.Old_Policy = Policy_net('old_policy', env)
    self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)
    self.D = Discriminator(env)
    self.epsilon = self.config.INITIAL_EPSILON
    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())
    print("GAIL agent initialized")
    self.save_model()
    self.restore_model()
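# The Memory replay buffer used above is defined elsewhere in the repository; the
# constructor arguments (capacity, permanent_data) and the calls made on it in the
# surrounding code (full(), sample(), memory_len()) imply roughly the interface below.
# This is a minimal sketch under those assumptions, not the project's actual class.
import random


class Memory:
    """Minimal sketch of the replay buffer interface assumed above (hypothetical)."""

    def __init__(self, capacity, permanent_data=0):
        self.capacity = capacity
        self.permanent_data = permanent_data  # leading entries never evicted (e.g. demo data)
        self.data = []

    def full(self):
        return len(self.data) >= self.capacity

    def memory_len(self):
        return len(self.data)

    def store(self, transition):
        if self.full():
            # evict the oldest non-permanent entry
            del self.data[self.permanent_data]
        self.data.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.data, min(batch_size, len(self.data)))
        # mirror the 3-tuple unpacking used in run_gail: (indices, transitions, weights)
        return list(range(len(batch))), batch, [1.0] * len(batch)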
def train():
    num_process = 4
    sub = SubprocVecEnv(num_process, False)
    state_space = 2
    action_space = 4
    Policy = Policy_net('policy', state_space, action_space)
    Old_Policy = Policy_net('old_policy', state_space, action_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        tf.set_random_seed(1234)
        # sess.run(tf.global_variables_initializer())
        saver.restore(sess, './synch_ubuntu/model')
        i = 0
        # writer = tf.summary.FileWriter('./board/dqn_per', sess.graph)
        # for i in range(10):
        while True:
            i += 1
            info = sub.reset()
            terminal, each_terminal = False, [False] * num_process
            global_step = 0
            memory = []
            obs_s, state, action, reward, done = trans_data(info, num_process)
            while not terminal:
                time.sleep(0.05)
                global_step += 1
                action, v_pred = get_action(Policy, each_terminal, num_process, state)
                info = sub.step(action, obs_s, [global_step] * num_process)
                obs_s, next_state, a, reward, done = trans_data(info, num_process)
                each_terminal, terminal = check_done(info, num_process)
                memory.append([state, action, reward, v_pred])
                if terminal:
                    state_, action_, reward_, v_preds_next_, gaes_ = memory_stack(
                        memory, num_process, state_space, PPO)
                    PPO.assign_policy_parameters()
                    inp = [state_, action_, reward_, v_preds_next_, gaes_]
                    for epoch in range(3):
                        sample_indices = np.random.randint(
                            low=0, high=state_.shape[0], size=64)
                        sampled_inp = [
                            np.take(a=t, indices=sample_indices, axis=0) for t in inp
                        ]
                        PPO.train(obs=sampled_inp[0],
                                  actions=sampled_inp[1],
                                  rewards=sampled_inp[2],
                                  v_preds_next=sampled_inp[3],
                                  gaes=sampled_inp[4])
                    # summary = sess.run(merged, feed_dict={r: sum(reward_) / num_process})
                    # writer.add_summary(summary, i)
                    # saver.save(sess, './synch_ubuntu/model')
                    if i < 5100:
                        print(sum(reward_) / num_process, i)
                state = next_state
        sub.close()
def train(): FLAGS(sys.argv) with sc2_env.SC2Env( map_name='MoveToBeacon', agent_interface_format=sc2_env.parse_agent_interface_format( feature_screen=64, feature_minimap=64, rgb_screen=None, rgb_minimap=None, action_space=None, use_feature_units=False), step_mul=step_mul, game_steps_per_episode=None, disable_fog=False, visualize=True) as env: with tf.Session() as sess: Policy = Policy_net('policy', 2, 4) Old_Policy = Policy_net('old_policy', 2, 4) PPO = PPOTrain(Policy, Old_Policy) D = Discriminator() saver = tf.train.Saver() saver.restore(sess, './model/gail.cpkt') c = 0 for episodes in range(100000): done = False obs = env.reset() while not 331 in obs[0].observation.available_actions: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = obs2state(obs) observations = [] actions_list = [] rewards = [] v_preds = [] reward = 0 global_step = 0 while not done: global_step += 1 state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) observations.append(state) actions_list.append(act) rewards.append(reward) v_preds.append(v_pred) actions = actAgent2Pysc2(act, obs) obs = env.step(actions=[actions]) next_state = obs2state(obs) distance = obs2distance(obs) if distance < 0.03 or global_step == 100: done = True if done: v_preds_next = v_preds[1:] + [0] break state = next_state
def __init__(self, env, config):
    self.Soulsess = tf.InteractiveSession()
    self.config = config
    self.expert_memory = Memory(capacity=self.config.EXPERT_MEMORY_SIZE,
                                permanent_data=0)
    self.generate_memory = Memory(capacity=self.config.GENERATE_MEMORY_SIZE,
                                  permanent_data=0)
    # self.sess.run(tf.global_variables_initializer())
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.n
    self.ob_space = env.observation_space
    self.gamma = 0.95
    self.Policy = Policy_net('policy', env)
    self.Old_Policy = Policy_net('old_policy', env)
    self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)
    self.D = Discriminator(env)
    self.epsilon = self.config.INITIAL_EPSILON
    self.saver = tf.train.Saver()
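# A hedged usage sketch of the constructor above. The config object with
# EXPERT_MEMORY_SIZE, GENERATE_MEMORY_SIZE and INITIAL_EPSILON attributes is implied
# by the attribute accesses in __init__ but defined elsewhere, so the class name
# "Agent" and the values below are illustrative assumptions only.
import gym
import tensorflow as tf


class DummyConfig:
    # illustrative values only; the real config class lives in another file
    EXPERT_MEMORY_SIZE = 10000
    GENERATE_MEMORY_SIZE = 10000
    INITIAL_EPSILON = 1.0


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    tf.reset_default_graph()
    agent = Agent(env, DummyConfig())  # assuming the __init__ above belongs to a class named Agent
    print(agent.state_dim, agent.action_dim)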
def main(): angle = 0.0 angle_thres_deg = 15 cart = 0.0 t.tic() reward_max = 5 reward_min = -5 reward_disc = 5 pwm_index = 1 pwm_list = [("L", 180), ("L", 170), ("L", 160), ("L", 0), ("R", 160), ("R", 170), ("R", 180)] pwm_list = [("L", 180), ("L", 0), ("R", 180)] pwm_list_size = 3 # Serial port for Arduino if (SERIAL_AVAILABLE): ser = serial.Serial('COM20', 115200) # Initialize serial port print("connected to: " + ser.portstr) # Confirm connection env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA) saver = tf.train.Saver() with tf.Session() as sess: if LOAD: saver.restore( sess, "./model/model_iter_{:d}_rewards_{:d}.ckpt".format( load_iteration, load_rewards)) else: sess.run( tf.global_variables_initializer()) # remove me if loading save writer = tf.summary.FileWriter('./log/train', sess.graph) obs = env.reset() reward = 0 success_num = 0 for iteration in range(ITERATION): # episode observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) # env.render() if (act == 1): if pwm_index < pwm_list_size - 1: pwm_index += 1 else: if pwm_index > 0: pwm_index -= 1 dir = pwm_list[pwm_index][0] pwm = pwm_list[pwm_index][1] print(dir) print(pwm) if (SERIAL_AVAILABLE): PD.writePWM(ser, 180, dir) last_angle = angle angle_deg = PD.getPEncoderPos( ser ) * 360 / 1200 # convert encoder counts (1200) to degrees angle = angle_deg * 2 * math.pi / 360 # convert degrees to radians angle_velocity = (angle - last_angle) / t.tocvalue() last_cart = cart cart = PD.getMEncoderPos(ser) cart_velocity = (cart - last_cart) / t.tocvalue() #print("Angle {:.1f}, Angle_vel (rad/s) {:.1f}, Position (mm) {:.1f}, Velocity (mm/s) {:.1f}".format(angle, angle_velocity, cart,cart_velocity)) t.tic() m = (reward_max - reward_min) / (reward_disc - angle_thres_deg) # reward = min(m*(abs(angle_deg)-reward_disc) + reward_max, reward_max) #reward = 1 reward = ((.9 / 7) * (min( (6 - abs(angle_deg)), (1))) + 6) + ((0.1 / 6) * (min( (5 - abs((cart / 1000))), (1)) + 5)) # next_obs = [angle angle_velocity cart cart_velocoty] # print(next_obs) next_obs = [angle, angle_velocity, cart, cart_velocity] #print("angle = ", angle_deg) print("x: ", PD.getMEncoderPos(ser)) if abs(angle_deg) > angle_thres_deg: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value print("reward: ", sum(rewards)) obs = env.reset() reward = -1 print("Iteration: ", iteration) print('Waiting to reset') PD.writePWM(ser, 0, dir) if iteration % 10 == 0: saver.save( sess, "./model/model_iter_{:d}_rewards_{:d}.ckpt".format( iteration, sum(rewards))) print('Scoot scoot!! 
Model saved.') while (angle_deg > 1.5 or angle_deg < -1.5): time.sleep(0.1) angle_deg = PD.getPEncoderPos(ser) * 360 / 1200 print('Entered iteration {:1f}'.format(iteration + 1)) break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, './model/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) if iteration > 0: gaes = (gaes - gaes.mean()) / gaes.std() PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] # train for epoch in range(4): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], rewards=inp[2], v_preds_next=inp[3], gaes=inp[4])[0] writer.add_summary(summary, iteration) writer.close() if (SERIAL_AVAILABLE): ser.close()
def main(): allrewards = list() env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA) saver = tf.train.Saver() name = 'Model_Noise' filename = "data/{n}_{ts:%H_%M_%S}.csv".format(n=name, ts=datetime.now()) with open(filename, "w", 1) as result: result.write("Iteration, Reward \n") with tf.Session() as sess: writer = tf.summary.FileWriter('./log/train', sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 for iteration in range(ITERATION): # episode observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length if iteration % 500 == 0: env.render() run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32 ) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) result.write("{:d},{:2f}\n".format(iteration, sum(rewards))) print("Rewards: {:2f}, Iterations: {:d}".format( sum(rewards), iteration)) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, './model/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] # train for epoch in range(4): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], rewards=inp[2], v_preds_next=inp[3], gaes=inp[4])[0] writer.add_summary(summary, iteration) writer.close() if iteration % 500 == 0: env.close()
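# PPO.get_gaes is implemented in ppo.py and is not shown in this section. The sketch
# below shows the standard generalized advantage estimation recursion such a method
# typically computes, matching the rewards / v_preds / v_preds_next arguments used
# above. Treat it as a reference implementation under that assumption, not the
# repository's exact code (the gamma/lambda values here are illustrative).
def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    """Generalized Advantage Estimation (Schulman et al., 2016).

    delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    A_t     = delta_t + gamma * lam * A_{t+1}
    """
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):  # accumulate backwards from the end of the episode
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes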
def run_gail(agent, index_gail, env):
    DG_flag = 1
    # env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy_' + str(index_gail), env)
    Old_Policy = Policy_net('old_policy' + str(index_gail), env)
    gamma = 0.95
    PPO = PPOTrain(Policy, Old_Policy, gamma)
    D = Discriminator(env, index_gail)
    if DG_flag:
        # with open(Config.DEMO_DATA_PATH, 'rb') as f:
        #     demo_transitions = pickle.load(f)
        #     demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        #     assert len(demo_transitions) == Config.demo_buffer_size
        expert_data = agent.replay_memory if agent.replay_memory.full() else agent.demo_memory
        _, demo_transitions, _ = expert_data.sample(agent.config.BATCH_SIZE)
        expert_observations = [data[0] for data in demo_transitions]
        expert_actions = [data[1] for data in demo_transitions]
    else:
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # load_path = saver.restore(sess, "trained_models/model.ckpt")
        # sess.run(tf.global_variables_initializer())
        # if index_gail > 1:
        #     saver.restore(sess, 'trained_models/model' + str(index_gail - 1) + '.ckpt')
        obs = env.reset()
        state_for_memory = obs  # handle the different data formats used by the two code bases
        success_num = 0
        iteration = int(2000)  # 0319
        for iteration in range(iteration):
            # print("running policy")
            observations = []
            # states_for_memory = []
            actions = []
            # do NOT use environment rewards to update the policy  # 0319 why?
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            if DG_flag:
                t_q = deque(maxlen=Config.trajectory_n)
                done, score, n_step_reward, state_for_memory = False, 0, None, env.reset()
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                next_state_for_memory = next_obs
                score += reward
                if DG_flag:
                    # record the earliest reward in the window so it can be subtracted out
                    reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]
                    t_q.append([state_for_memory, act, reward, next_state_for_memory, done, 0.0])
                    if len(t_q) == t_q.maxlen:
                        if n_step_reward is None:  # only compute once, when t_q is first filled
                            n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_q)])
                        else:
                            n_step_reward = (n_step_reward - reward_to_sub) / Config.GAMMA
                            n_step_reward += reward * Config.GAMMA**(Config.trajectory_n - 1)
                        t_q[0].extend([n_step_reward, next_state_for_memory, done, t_q.maxlen])  # actual_n is maxlen here
                        # agent.perceive(t_q[0])  # perceive when a transition is completed
                env.render()  # 0313
                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)
                if done:
                    if DG_flag:
                        t_q.popleft()  # first transition's n-step return is already set
                        transitions = set_n_step(t_q, Config.trajectory_n)
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("iteration", iteration, "score", score)
                    break
                else:
                    obs = next_obs
                    state_for_memory = next_state_for_memory
                    # print("state_for_memory", state_for_memory)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)
            # if sum(rewards) >= 100:
            success_num += 1  # todo
            # store this episode as a demonstration when the return is good enough
            if DG_flag:
                for t in transitions:
                    agent.perceive(t)
                agent.replay_memory.memory_len()
            if success_num >= 3:
                # saver.save(sess, 'trained_models/model.ckpt')
                # saver.save(sess, 'trained_models/model' + str(index_gail) + '.ckpt')
                print(success_num)
                print('Clear!! Model saved.')
                env.close()
                break
            else:
                success_num = 0
            # convert lists to numpy arrays for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            # train the discriminator
            for i in range(2):
                # print("training D")
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)
            # the output of this discriminator is used as the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            # train the policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                # print("updating PPO")
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
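# The sliding n-step return bookkeeping inside run_gail (subtract the oldest reward,
# divide by gamma, add the newest reward scaled by gamma**(n-1)) is easier to verify in
# isolation. The helper below reproduces that recursion on a plain list and checks it
# against the direct sum; it is an illustrative sketch, independent of the Config and
# set_n_step helpers used above.
def sliding_n_step_return(rewards, n, gamma):
    """Yield the n-step discounted return of each length-n window of `rewards`,
    updated incrementally the same way run_gail updates n_step_reward."""
    n_step_reward = None
    window = []
    for r in rewards:
        oldest = 0.0 if len(window) < n else window[0]
        window.append(r)
        if len(window) > n:
            window.pop(0)
        if len(window) == n:
            if n_step_reward is None:
                n_step_reward = sum(x * gamma**i for i, x in enumerate(window))
            else:
                n_step_reward = (n_step_reward - oldest) / gamma + r * gamma**(n - 1)
            yield n_step_reward


if __name__ == '__main__':
    rs, n, g = [1.0, 2.0, 3.0, 4.0, 5.0], 3, 0.9
    for i, val in enumerate(sliding_n_step_return(rs, n, g)):
        direct = sum(rs[i + j] * g**j for j in range(n))  # direct computation for comparison
        assert abs(val - direct) < 1e-9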
def train(): FLAGS(sys.argv) with sc2_env.SC2Env( map_name='MoveToBeacon', agent_interface_format=sc2_env.parse_agent_interface_format( feature_screen=64, feature_minimap=64, rgb_screen=None, rgb_minimap=None, action_space=None, use_feature_units=False), step_mul=step_mul, game_steps_per_episode=None, disable_fog=False, visualize=False) as env: r = tf.placeholder(tf.float32) ######## rr = tf.summary.scalar('reward', r) merged = tf.summary.merge_all() ######## expert_observations = np.genfromtxt('trajectory/observations.csv') expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32) with tf.Session() as sess: Policy = Policy_net('policy', 2, 4) Old_Policy = Policy_net('old_policy', 2, 4) PPO = PPOTrain(Policy, Old_Policy) D = Discriminator() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() writer = tf.summary.FileWriter('./board/gail', sess.graph) ######## c = 0 for episodes in range(100000): done = False obs = env.reset() while not 331 in obs[0].observation.available_actions: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = obs2state(obs) observations = [] actions_list = [] rewards = [] v_preds = [] reward = 0 global_step = 0 while not done: global_step += 1 state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) observations.append(state) actions_list.append(act) rewards.append(reward) v_preds.append(v_pred) actions = actAgent2Pysc2(act, obs) obs = env.step(actions=[actions]) next_state = obs2state(obs) distance = obs2distance(obs) if distance < 0.03 or global_step == 100: done = True if done: v_preds_next = v_preds[1:] + [0] break state = next_state observations = np.reshape(observations, newshape=[-1, 2]) actions_list = np.array(actions_list).astype(dtype=np.int32) for i in range(2): sample_indices = (np.random.randint( expert_observations.shape[0], size=observations.shape[0])) inp = [expert_observations, expert_actions] sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data D.train(expert_s=sampled_inp[0], expert_a=sampled_inp[1], agent_s=observations, agent_a=actions_list) d_rewards = D.get_rewards(agent_s=observations, agent_a=actions_list) d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) inp = [ observations, actions_list, gaes, d_rewards, v_preds_next ] PPO.assign_policy_parameters() for epoch in range(15): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = sess.run(merged, feed_dict={r: global_step}) writer.add_summary(summary, episodes) if global_step < 50: c += 1 else: c = 0 if c > 10: saver.save(sess, './model/gail.cpkt') print('save model') break print(episodes, global_step, c)
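# The Discriminator used in the GAIL loops above is defined in its own module; the
# training code only relies on D.train(expert_s, expert_a, agent_s, agent_a) and
# D.get_rewards(agent_s, agent_a). Below is a compact TF1-style sketch of the usual
# GAIL discriminator under that assumed interface: a binary classifier over
# (state, action) pairs whose output is turned into the surrogate reward
# -log(1 - D(s, a)). Layer sizes, names and default dimensions are illustrative, not
# taken from the repository.
import tensorflow as tf


class Discriminator:
    """Sketch of a GAIL discriminator for discrete actions (illustrative only)."""

    def __init__(self, state_dim=2, n_actions=4, lr=1e-4, scope='discriminator'):
        with tf.variable_scope(scope):
            self.expert_s = tf.placeholder(tf.float32, [None, state_dim])
            self.expert_a = tf.placeholder(tf.int32, [None])
            self.agent_s = tf.placeholder(tf.float32, [None, state_dim])
            self.agent_a = tf.placeholder(tf.int32, [None])

            def logits(s, a, reuse):
                with tf.variable_scope('net', reuse=reuse):
                    x = tf.concat([s, tf.one_hot(a, n_actions)], axis=1)
                    h = tf.layers.dense(x, 64, tf.nn.relu, name='h1')
                    return tf.layers.dense(h, 1, name='out')

            expert_logits = logits(self.expert_s, self.expert_a, reuse=False)
            agent_logits = logits(self.agent_s, self.agent_a, reuse=True)
            # expert pairs are labelled 1, agent pairs 0
            loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=expert_logits, labels=tf.ones_like(expert_logits))) + \
                tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=agent_logits, labels=tf.zeros_like(agent_logits)))
            self.train_op = tf.train.AdamOptimizer(lr).minimize(loss)
            # reward = -log(1 - D(s, a)): large when the agent fools the discriminator
            self.rewards = -tf.log(tf.clip_by_value(
                1.0 - tf.sigmoid(agent_logits), 1e-10, 1.0))

    def train(self, expert_s, expert_a, agent_s, agent_a):
        tf.get_default_session().run(self.train_op, feed_dict={
            self.expert_s: expert_s, self.expert_a: expert_a,
            self.agent_s: agent_s, self.agent_a: agent_a})

    def get_rewards(self, agent_s, agent_a):
        return tf.get_default_session().run(self.rewards, feed_dict={
            self.agent_s: agent_s, self.agent_a: agent_a})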
def main():
    env = gym.make(ENV)  # instantiate the CartPole environment
    env.seed(0)
    ob_space = env.observation_space  # describes the format of valid observations
    Policy = Policy_net('policy', env)  # create the policy network
    Old_Policy = Policy_net('old_policy', env)  # create the old policy network
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()
    with tf.Session() as sess:  # session block
        writer = tf.summary.FileWriter('./log/train', sess.graph)  # set the log directory
        sess.run(tf.global_variables_initializer())  # initialize the networks
        obs = env.reset()  # reset the environment and get the first observation
        reward = 0  # stores the reward
        success_num = 0  # success counter
        for episode in range(EPISODES):  # episode loop
            observations = []  # buffer for observations
            actions = []  # buffer for actions
            v_preds = []  # buffer for value predictions
            rewards = []  # buffer for rewards
            run_policy_steps = 0  # step counter for each episode
            env.render()  # render the environment
            while True:
                # run the policy for RUN_POLICY_STEPS, which is much less than the episode length
                run_policy_steps += 1  # increment the step counter for this episode
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)  # run the network to get an action and the predicted value
                act = act.item()  # convert the numpy arrays
                v_pred = v_pred.item()  # into Python scalars
                observations.append(obs)  # add the observation to the observation buffer
                actions.append(act)  # add the action to the action buffer
                v_preds.append(v_pred)  # add v_pred to the v_pred buffer
                rewards.append(reward)  # add the reward to the reward buffer
                next_obs, reward, done, info = env.step(act)  # send the action to the environment; receive the next observation, the reward and the done flag
                if done:
                    # [1:] drops the first element and + [0] appends a zero:
                    # the next state of the terminal state has value 0
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()  # reset the environment
                    reward = -1  # set the reward to -1
                    break  # leave the while loop
                else:
                    obs = next_obs  # store the next observation in obs
            # log for visualization in TensorBoard
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)
                ]), episode)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))
                ]), episode)
            # stopping condition
            if sum(rewards) >= 195:  # if the sum of rewards is at least 195
                success_num += 1  # increment the success counter
                if success_num >= 100:  # after 100 successes
                    saver.save(sess, './model/model.ckpt')  # save the session
                    print('Clear!! Model saved.')
                    break  # leave the loop
            else:
                success_num = 0  # reset the success counter
            print("EP: ", episode, " Rw: ", sum(rewards))  # print the episode number and the reward
            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds,
                                v_preds_next=v_preds_next)  # generalized advantage estimates
            # convert lists to numpy arrays to feed the tf.placeholder
            newshape = [-1] + list(ob_space.shape)  # creates the shape [-1, 4]
            observations = np.reshape(observations, newshape=newshape)  # before, each row of observations was an independent array; after the reshape it is a single array with several rows
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean()) / gaes.std()  # subtract the mean of gaes and divide by its standard deviation
            PPO.assign_policy_parameters()
            inp = [observations, actions, rewards, v_preds_next, gaes]  # list with 5 entries: observations, actions, rewards, next value predictions and advantages
            # train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0], size=64)  # indices are in [low, high)
                sampled_inp = []
                for a in inp:
                    sampled_inp.append(np.take(a=a, indices=sample_indices, axis=0))  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])
            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      rewards=inp[2],
                                      v_preds_next=inp[3],
                                      gaes=inp[4])[0]
            writer.add_summary(summary, episode)
        writer.close()  # end of the episode loop
def main(): env = gym.make('CartPole-v1') env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter('./log/test', sess.graph) sess.run(tf.global_variables_initializer()) saver.restore(sess, 'model/model.ckpt') obs = env.reset() reward = 0 success_num = 0 for iteration in range(ITERATION): # episode observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 env.render() while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=False) act = act.item() v_pred = v_pred.item() observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) # end condition of test if sum(rewards) >= 195: success_num += 1 if success_num >= 100: print('Iteration: ', iteration) print('Clear!!') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() inp = [observations, actions, rewards, v_preds_next, gaes] summary = PPO.get_summary(obs=inp[0], actions=inp[1], rewards=inp[2], v_preds_next=inp[3], gaes=inp[4])[0] writer.add_summary(summary, iteration) writer.close()
import numpy as np
import tensorflow as tf
from ppo import PPOTrain
from policy_net import Policy_net
import threading
import time
import gym
import operator
import itertools

thread_number = 30
score_list = []

sess = tf.InteractiveSession()
Policy = Policy_net(sess, 'policy')
Old_Policy = Policy_net(sess, 'old_policy')
PPO = PPOTrain(sess, Policy, Old_Policy)
sess.run(tf.global_variables_initializer())


def add(number):
    env = gym.make('CartPole-v0')
    done = False
    observations = []
    actions = []
    v_preds = []
    rewards = []
    state = env.reset()
    global_step = 0
    while not done:
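# This module imports threading and sets thread_number = 30, but the part that launches
# the workers is not shown here, and the body of add() above is truncated. A hedged
# sketch of how add() might be driven, assuming each thread runs independent CartPole
# rollouts and appends its episode score to score_list:
threads = [threading.Thread(target=add, args=(i,)) for i in range(thread_number)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print('collected', len(score_list), 'episode scores')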
_SELECT_ALL = [0]

env = sc2_env.SC2Env(map_name='MoveToBeacon',
                     agent_interface_format=sc2_env.parse_agent_interface_format(
                         feature_screen=16,
                         feature_minimap=16,
                         rgb_screen=None,
                         rgb_minimap=None,
                         action_space=None,
                         use_feature_units=False),
                     step_mul=4,
                     game_steps_per_episode=None,
                     disable_fog=False,
                     visualize=True)

with tf.Session() as sess:
    Policy = Policy_net('policy')
    Old_Policy = Policy_net('old_policy')
    PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
    # sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, "4wayBeacon_ppo/tmp/model.ckpt")
    for episodes in range(1000000):
        observations = []
        actions_list = []
        v_preds = []
        rewards = []
        obs = env.reset()
        action = actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="CollectMineralShards", step_mul=step_mul, screen_size_px=(32, 32), minimap_size_px=(32, 32)) as env: Policy = Policy_net('policy', 32*32*2, 4) Old_Policy = Policy_net('old_policy', 32*32*2, 4) PPO = PPOTrain(Policy, Old_Policy, gamma=0.95) saver = tf.train.Saver() with tf.Session() as sess: print('a') saver.restore(sess, './model/model.ckpt') print('a') #sess.run(tf.global_variables_initializer()) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) actions = gather(obs) obs = env.step(actions=[actions]) end_step = 200 global_step = 0 score = 0 reward = 0 for i in range(100): time.sleep(0.01) actions = no_operation(obs) obs = env.step(actions=[actions]) state = obs2state(obs) observations = [] actions_list = [] v_preds = [] rewards = [] print('episode start') while not done: global_step += 1 time.sleep(0.05) state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) actions = actAgent2Pysc2(act, obs) #while not 331 in obs[0].observation["available_actions"]: # actions = actAgent2Pysc2(100, obs) # obs = env.step(actions=[actions]) obs = env.step(actions=[actions]) if global_step == end_step or obs2done(obs) >= 1900 : # 게임 time을 다 사용하거나 미네랄을 다 먹었을 경우 게임이 끝난다. done = True next_state = obs2state(obs) reward = obs[0].reward if reward == 0: reward = -0.1 if done: if obs2done(obs) >= 1900: # 게임이 종료되었는데 미네랄을 다 먹었으면 reward = 3 else: # 게임이 종료되었는데 미네랄을 다 못먹으면 reward = -3 score += reward observations.append(state) actions_list.append(act) v_preds.append(v_pred) rewards.append(reward) if done: # 게임 종료시 v_preds_next = v_preds[1:] + [0] gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) observations = np.reshape(observations, newshape=[-1, 32*32*2]) actions = np.array(actions_list).astype(dtype=np.int32) rewards = np.array(rewards).astype(np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] for epoch in range(5): sample_indices = np.random.randint(low=0, high=observations.shape[0], size=64) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) print(episodes, score) save_path = saver.save(sess, './model/model.ckpt') if episodes == 0: f = open('test2.csv', 'w', encoding='utf-8', newline='') else: f = open('test2.csv', 'a', encoding='utf-8', newline='') wr = csv.writer(f) wr.writerow([episodes, score]) f.close() break state = next_state
def __init__(self): rospy.init_node('runPPO', anonymous=True) Policy = Policy_net('policy', self.n_inputs, self.n_outputs) Old_Policy = Policy_net('old_policy', self.n_inputs, self.n_outputs) PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA, c_2=0.1) saver = tf.train.Saver() rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus) # rospy.Service('/RL/net', net_eval, self.EvalNet) rospy.Service('/RL/start_learning', Empty, self.start_learning) obs_srv = rospy.ServiceProxy('/RL/observation', observation) drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped) move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles) reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty) pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10) gg = Float32MultiArray() gg.data = self.g with tf.Session() as sess: # $ tensorboard --logdir=logs # http://0.0.0.0:6006/ writer = tf.summary.FileWriter( '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/src/PPO/log/train', sess.graph) sess.run(tf.global_variables_initializer()) reward = 0 success_num = 0 episode_count = 0 rate = rospy.Rate(100) # 100hz while not rospy.is_shutdown(): if self.stLearning: ## Start episode ## episode_count += 1 # Reset gripper reset_srv() while not self.gripper_closed: rate.sleep() # Get observation obs = np.array(obs_srv().state) self.prev_dis2goal = np.linalg.norm(self.g - obs[:2]) observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 print( '[RL] Step %d in episode %d, distance to goal: %f.' % (run_policy_steps, episode_count, self.prev_dis2goal)) pub_goal.publish(gg) obs = np.stack([obs]).astype( dtype=np.float32 ) # prepare to feed placeholder Policy.obs while 1: act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) if act < 8: break # Act suc = move_srv(self.A[act]) rospy.sleep(0.05) rate.sleep() if suc: # Get observation next_obs = np.array(obs_srv().state) fail = drop_srv( ).dropped # Check if dropped - end of episode else: # End episode if overload or angle limits reached rospy.logerr( '[RL] Failed to move gripper. Episode declared failed.' ) fail = True reward, done = self.transition_reward(next_obs, fail) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append( reward ) # Weird that this is before the step - this is the reward of the previos action print( '[RL] Action %d yielded reward %f and position (%f,%f).' % (act, reward, obs[0][0], obs[0][1])) if run_policy_steps > self.max_steps: done = True if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value - adds zero in the end of the vector reward = -1 break else: obs = next_obs rate.sleep() print('episode_length', run_policy_steps, 'episode_reward', sum(rewards)) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), episode_count) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), episode_count) if sum(rewards) >= self.stop_bound: success_num += 1 if success_num >= 100: saver.save( sess, '/home/pracsys/catkin_ws/src/rutgers_collab/src/rl_pkg/logs/model_ppo.ckpt' ) print('Clear!! 
Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list( (self.n_inputs, ))) actions = np.array(actions).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype( dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] # train for epoch in range(4): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], rewards=inp[2], v_preds_next=inp[3], gaes=inp[4])[0] writer.add_summary(summary, episode_count) if episode_count > self.max_episodes: break rate.sleep() writer.close()
def main(): listener() # env = gym.make('CartPole-v0') # env.seed(0) ob_space = 4 Policy = Policy_net('policy') Old_Policy = Policy_net('old_policy') PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter('./log/train', sess.graph) sess.run(tf.global_variables_initializer()) reset() obs = robot_state.robot_state reward = 0 success_num = 0 for iteration in range(ITERATION): # episode observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) print('act: ', act, 'v_pred: ', v_pred) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) reward, done = take_action(act) time.sleep(0.25) next_obs = robot_state.robot_state if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value reset() obs = robot_state.robot_state reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= 195: success_num += 1 render = True if success_num >= 100: saver.save(sess, './model/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, [len(observations), 4]) actions = np.array(actions).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) print('gaes', gaes) PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] # train for epoch in range(4): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], rewards=inp[2], v_preds_next=inp[3], gaes=inp[4])[0] writer.add_summary(summary, iteration) writer.close()
env = sc2_env.SC2Env(map_name='CollectMineralShards',
                     agent_interface_format=sc2_env.parse_agent_interface_format(
                         feature_screen=16,
                         feature_minimap=16,
                         rgb_screen=None,
                         rgb_minimap=None,
                         action_space=None,
                         use_feature_units=False),
                     step_mul=4,
                     game_steps_per_episode=None,
                     disable_fog=False,
                     visualize=False)

with tf.Session() as sess:
    Policy = Policy_net('policy')
    Old_Policy = Policy_net('old_policy')
    PPO = PPOTrain(Policy, Old_Policy)
    # sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, "collectMineralShards/tmp/model.ckpt")
    for episodes in range(100000):
        observations = []
        actions_list = []
        v_preds = []
        spatial = []
        rewards = []
        obs = env.reset()
import tensorflow as tf
from ppo import PPOTrain
from policy_net import Policy_net
import gym
import numpy as np

sess = tf.Session()
Policy = Policy_net(sess, 'policy')
Old_Policy = Policy_net(sess, 'old_policy')
PPO = PPOTrain(sess, Policy, Old_Policy)
sess.run(tf.global_variables_initializer())
env = gym.make('CartPole-v0')
sess.run(PPO.assign_ops)

for episodes in range(1000):
    observations = []
    actions = []
    v_preds = []
    rewards = []
    done = False
    state = env.reset()
    global_step = 0
    while not done:
        global_step += 1
        action, value = Policy.act(state)
        next_state, reward, done, _ = env.step(action)
        if done:
            reward = -1
        observations.append(state)
        actions.append(action)
        v_preds.append(value)
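        # --- sketch only: the loop above is cut off after v_preds.append(value). The
        # --- continuation below follows the update pattern used repeatedly in the other
        # --- scripts in this section (terminal bootstrap of 0, get_gaes, then a few
        # --- PPO.train epochs); the exact signatures of this file's Policy.act / PPO.train
        # --- wrappers are assumptions, not shown in the source.
        rewards.append(reward)
        state = next_state

    # terminal state is assigned a value of 0, matching the other scripts
    v_preds_next = v_preds[1:] + [0]
    gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

    observations = np.reshape(observations, newshape=[-1, 4])
    actions = np.array(actions).astype(dtype=np.int32)
    rewards = np.array(rewards).astype(dtype=np.float32)
    v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
    gaes = np.array(gaes).astype(dtype=np.float32)

    PPO.assign_policy_parameters()
    for epoch in range(4):
        PPO.train(obs=observations, actions=actions, rewards=rewards,
                  v_preds_next=v_preds_next, gaes=gaes)
    print(episodes, global_step)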
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul, screen_size_px=(16, 16), minimap_size_px=(16, 16)) as env: Policy = Policy_net('policy', 16 * 16 * 2, 4) Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4) PPO = PPOTrain(Policy, Old_Policy, gamma=0.95) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = np.array(obs2state(obs)) print('episode start') global_step = 0 reward = 0 observations = [] actions_list = [] v_preds = [] rewards = [] while not done: global_step += 1 time.sleep(0.05) state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) actions = actAgent2Pysc2(act, obs) obs = env.step(actions=[actions]) for i in range(1): actions = no_operation(obs) obs = env.step(actions=[actions]) distance = obs2distance(obs) if global_step == 1: pre_distance = distance next_state = np.array(obs2state(obs)) reward = -10 * (distance - pre_distance) #if reward < 0 : # reward = -0.01 #if reward <= 0: # reward = 0 #elif reward > 0: # reward = 0 reward = -0.01 if distance < 0.03 or global_step == 100: # 게임 종료시 if distance < 0.03: reward = 1 if global_step == 200: reward = -1 done = True observations.append(state) actions_list.append(act) v_preds.append(v_pred) rewards.append(reward) if distance < 0.03 or global_step == 100: # 게임 종료시 v_preds_next = v_preds[1:] + [0] gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) observations = np.reshape(observations, newshape=[-1, 16 * 16 * 2]) actions = np.array(actions_list).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype( dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) PPO.assign_policy_parameters() inp = [ observations, actions, rewards, v_preds_next, gaes ] for epoch in range(5): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) print(episodes, global_step) break state = next_state pre_distance = distance
def main(args):
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)
    # expert_observations = np.genfromtxt('trajectory/observations.csv')
    # expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.DEMO_BUFFER_SIZE))
    print("demo_transitions len: ", len(demo_transitions))
    expert_observations = [data[0] for data in demo_transitions]
    expert_actions = [data[1] for data in demo_transitions]
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        success_num = 0
        itera = 0
        scores = []
        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use environment rewards to update the policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                score = reward + score
                env.render()  # 0313
                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)
                if done:
                    itera += 1
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("itera:", itera, "score:", score)
                    scores.append(score)
                    break
                else:
                    obs = next_obs
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0
            # convert lists to numpy arrays for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            # train the discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)
            # the output of this discriminator is used as the reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            # train the policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
            # summary = PPO.get_summary(obs=inp[0],
            #                           actions=inp[1],
            #                           gaes=inp[2],
            #                           rewards=inp[3],
            #                           v_preds_next=inp[4])
            # writer.add_summary(summary, iteration)
        # writer.close()
        plt.plot(scores, 'r')
        plt.show()