# Note: these training routines assume the library imports below plus the
# repository's own modules (Policy_net, PPOTrain, Discriminator, Config,
# actAgent2Pysc2, obs2state, obs2distance, set_n_step, step_mul and the absl
# FLAGS object), which are defined elsewhere in the project.
import sys
import pickle
import itertools
from collections import deque

import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from pysc2.env import sc2_env


def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:

        r = tf.placeholder(tf.float32)
        ########
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()
        ########
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail', sess.graph)
            ########
            c = 0  # count of consecutive "fast" episodes, used as the stop criterion
            for episodes in range(100000):
                done = False
                obs = env.reset()
                # no-op until Move_screen (function id 331) becomes available
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)

                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0

                # run the current policy for one episode
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)

                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)

                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state

                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)

                # train the discriminator on expert vs. agent samples
                for i in range(2):
                    sample_indices = np.random.randint(
                        expert_observations.shape[0], size=observations.shape[0])
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0) for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)

                # the discriminator output is used as the reward for PPO
                d_rewards = D.get_rewards(agent_s=observations, agent_a=actions_list)
                d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                inp = [observations, actions_list, gaes, d_rewards, v_preds_next]
                PPO.assign_policy_parameters()
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0) for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])

                # note: the episode length is what gets logged under the 'reward' tag
                summary = sess.run(merged, feed_dict={r: global_step})
                writer.add_summary(summary, episodes)

                # stop and save once the agent reaches the beacon in fewer than
                # 50 steps for more than 10 consecutive episodes
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.ckpt')
                    print('save model')
                    break
                print(episodes, global_step, c)
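# A minimal sketch (not part of the original source) of how the expert
# trajectory CSVs consumed by train() could be produced. Assumptions: each
# observation row has 2 features (train() reshapes observations to [-1, 2] and
# builds Policy_net('policy', 2, 4)), actions are integer ids, and
# np.genfromtxt reads back the whitespace-delimited output of np.savetxt.
def save_expert_trajectory(expert_obs, expert_acts,
                           obs_path='trajectory/observations.csv',
                           act_path='trajectory/actions.csv'):
    np.savetxt(obs_path, np.asarray(expert_obs, dtype=np.float32))
    np.savetxt(act_path, np.asarray(expert_acts, dtype=np.int32), fmt='%d')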
def run_gail(agent, index_gail, env):
    DG_flag = 1
    # env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy_' + str(index_gail), env)
    Old_Policy = Policy_net('old_policy' + str(index_gail), env)
    gamma = 0.95
    PPO = PPOTrain(Policy, Old_Policy, gamma)
    D = Discriminator(env, index_gail)

    if DG_flag:
        # with open(Config.DEMO_DATA_PATH, 'rb') as f:
        #     demo_transitions = pickle.load(f)
        #     demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        #     assert len(demo_transitions) == Config.demo_buffer_size
        expert_data = agent.replay_memory if agent.replay_memory.full() else agent.demo_memory
        _, demo_transitions, _ = expert_data.sample(agent.config.BATCH_SIZE)
        expert_observations = [data[0] for data in demo_transitions]
        expert_actions = [data[1] for data in demo_transitions]
    else:
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # load_path = saver.restore(sess, "trained_models/model.ckpt")
        # sess.run(tf.global_variables_initializer())
        # if index_gail > 1:
        #     saver.restore(sess, 'trained_models/model' + str(index_gail - 1) + '.ckpt')
        obs = env.reset()
        state_for_memory = obs  # to bridge the different data formats used by the two programs
        success_num = 0
        n_iterations = int(2000)  # 0319
        for iteration in range(n_iterations):
            # print("running policy")
            observations = []
            # states_for_memory = []
            actions = []
            # do NOT use rewards to update policy  # 0319 why?
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            if DG_flag:
                t_q = deque(maxlen=Config.trajectory_n)
                done, score, n_step_reward, state_for_memory = False, 0, None, env.reset()

            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)
                next_state_for_memory = next_obs
                score += reward

                if DG_flag:
                    # maintain a sliding n-step return over the last trajectory_n
                    # transitions (see the standalone sketch after this function)
                    reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]  # record the earliest reward for the sub
                    t_q.append([state_for_memory, act, reward, next_state_for_memory, done, 0.0])
                    if len(t_q) == t_q.maxlen:
                        if n_step_reward is None:  # only compute once when t_q first filled
                            n_step_reward = sum([t[2] * Config.GAMMA**i for i, t in enumerate(t_q)])
                        else:
                            n_step_reward = (n_step_reward - reward_to_sub) / Config.GAMMA
                            n_step_reward += reward * Config.GAMMA**(Config.trajectory_n - 1)
                        t_q[0].extend([n_step_reward, next_state_for_memory, done, t_q.maxlen])  # actual_n is maxlen here
                        # agent.perceive(t_q[0])  # perceive when a transition is completed

                env.render()  # 0313
                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    if DG_flag:
                        t_q.popleft()  # first transition's n-step is already set
                        transitions = set_n_step(t_q, Config.trajectory_n)
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("iteration", iteration, "score", score)
                    break
                else:
                    obs = next_obs
                    state_for_memory = next_state_for_memory
                    # print("state_for_memory", state_for_memory)

            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            # if sum(rewards) >= 100:
            success_num += 1  # todo
            # store this episode as demo data when the return is good enough
            if DG_flag:
                for t in transitions:
                    agent.perceive(t)
                agent.replay_memory.memory_len()
            if success_num >= 3:
                # saver.save(sess, 'trained_models/model.ckpt')
                # saver.save(sess, 'trained_models/model' + str(index_gail) + '.ckpt')
                print(success_num)
                print('Clear!! Model saved.')
                env.close()
                break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                # print("training D")
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                # print("updating PPO")
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=32)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
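# Standalone sketch (not in the original repo) of the sliding n-step return
# update used inside run_gail: once the window of the last n rewards is full,
# R' = (R - r_oldest) / gamma + r_new * gamma**(n - 1) keeps R equal to
# sum_i gamma**i * r_i over the window without re-summing every step.
# The names here (rewards, gamma, n) are illustrative; run_gail uses
# Config.GAMMA and Config.trajectory_n.
def sliding_n_step_return_demo(rewards, gamma=0.99, n=4):
    window = deque(maxlen=n)
    n_step_reward = None
    for new_reward in rewards:
        oldest = 0. if len(window) < n else window[0]  # reward about to drop out
        window.append(new_reward)
        if len(window) == n:
            if n_step_reward is None:  # first time the window fills: full sum
                n_step_reward = sum(r * gamma**i for i, r in enumerate(window))
            else:  # incremental update, equivalent to re-summing the window
                n_step_reward = (n_step_reward - oldest) / gamma + new_reward * gamma**(n - 1)
    return n_step_reward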
def main(args):
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # expert_observations = np.genfromtxt('trajectory/observations.csv')
    # expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.DEMO_BUFFER_SIZE))
    print("demo_transitions len: ", len(demo_transitions))
    expert_observations = [data[0] for data in demo_transitions]
    expert_actions = [data[1] for data in demo_transitions]

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        success_num = 0
        itera = 0
        scores = []

        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use rewards to update policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)
                score += reward
                env.render()  # 0313

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    itera += 1
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("itera:", itera, "score:", score)
                    scores.append(score)
                    break
                else:
                    obs = next_obs

            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=32)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            # summary = PPO.get_summary(obs=inp[0],
            #                           actions=inp[1],
            #                           gaes=inp[2],
            #                           rewards=inp[3],
            #                           v_preds_next=inp[4])
            # writer.add_summary(summary, iteration)
        # writer.close()

        plt.plot(scores, 'r')
        plt.show()
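# Hypothetical entry point (the original argument parser is not shown in this
# file): main() only reads args.gamma, args.iteration and args.savedir, with
# args.logdir appearing only in the commented-out summary writer, so a minimal
# parser could look like this; the default values are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--iteration', type=int, default=1000)
    parser.add_argument('--savedir', type=str, default='trained_models')
    parser.add_argument('--logdir', type=str, default='log/train')
    main(parser.parse_args())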