Code example #1
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:
        r = tf.placeholder(tf.float32)  # scalar fed with the episode length each episode
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()  # merged summary written to TensorBoard
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)  # observation dim 2, action dim 4
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail',
                                           sess.graph)  # TensorBoard log writer
            c = 0  # counts consecutive episodes shorter than 50 steps
            for episodes in range(100000):
                done = False
                obs = env.reset()
                # issue setup actions until the move action (pysc2 function id 331) is available
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True  # near the beacon or at the 100-step limit
                    if done:
                        v_preds_next = v_preds[1:] + [0]  # value of the terminal next state is 0
                        break
                    state = next_state
                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)
                for i in range(2):  # two discriminator updates per episode
                    sample_indices = (np.random.randint(
                        expert_observations.shape[0],
                        size=observations.shape[0]))
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)
                d_rewards = D.get_rewards(agent_s=observations,
                                          agent_a=actions_list)
                d_rewards = np.reshape(d_rewards,
                                       newshape=[-1]).astype(dtype=np.float32)

                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                inp = [
                    observations, actions_list, gaes, d_rewards, v_preds_next
                ]
                PPO.assign_policy_parameters()  # sync old_policy with the current policy before updating
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])
                summary = sess.run(merged, feed_dict={r: global_step})  # log the episode length under the 'reward' tag
                writer.add_summary(summary, episodes)
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.cpkt')
                    print('save model')
                    break
                print(episodes, global_step, c)
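
All three examples on this page pass the discriminator rewards and the value predictions to `PPO.get_gaes`, whose implementation is not shown. The sketch below is a minimal, hypothetical version of generalized advantage estimation under the simplification lambda = 1, written only to match the call signature used above (`rewards`, `v_preds`, `v_preds_next`); the project's actual helper may differ.

import numpy as np

def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95):
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    # accumulate the residuals backwards through the episode (lambda = 1)
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * gaes[t + 1]
    return np.asarray(gaes, dtype=np.float32)
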
Code example #2
File: my_main.py  Project: kangyongxin/Backup
def run_gail(agent, index_gail, env):
    DG_flag = 1  # 1: take expert data from the agent's demo/replay memory, 0: load it from CSV files
    #env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy_' + str(index_gail), env)
    Old_Policy = Policy_net('old_policy' + str(index_gail), env)
    gamma = 0.95
    PPO = PPOTrain(Policy, Old_Policy, gamma)
    D = Discriminator(env, index_gail)

    if DG_flag:
        # with open(Config.DEMO_DATA_PATH, 'rb') as f:
        #     demo_transitions = pickle.load(f)
        #     demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.demo_buffer_size))
        #     assert len(demo_transitions) == Config.demo_buffer_size
        expert_data = agent.replay_memory if agent.replay_memory.full(
        ) else agent.demo_memory
        _, demo_transitions, _ = expert_data.sample(agent.config.BATCH_SIZE)
        expert_observations = [data[0] for data in demo_transitions]
        expert_actions = [data[1] for data in demo_transitions]
    else:
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        #load_path=saver.restore(sess,"trained_models/model.ckpt")
        #sess.run(tf.global_variables_initializer())
        #if index_gail>1:
        #   saver.restore(sess, 'trained_models/model' + str(index_gail-1) + '.ckpt')

        obs = env.reset()
    state_for_memory = obs  # separate copy, since the two code bases use different data formats
        success_num = 0
        n_iterations = 2000  # 0319
        for iteration in range(n_iterations):
            #print("running policy ")
            observations = []
            #states_for_memory=[]
            actions = []
            # do NOT use environment rewards to update the policy; the discriminator supplies the reward signal
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0
            if DG_flag:
                t_q = deque(maxlen=Config.trajectory_n)
                done, score, n_step_reward = False, 0, None
                state_for_memory = env.reset()
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(
                    dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                next_state_for_memory = next_obs
                score += reward
                if DG_flag:
                    reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0][2]  # earliest reward in the window, to be subtracted from the running n-step return
                    t_q.append([
                        state_for_memory, act, reward, next_state_for_memory,
                        done, 0.0
                    ])
                    if len(t_q) == t_q.maxlen:
                        if n_step_reward is None:  # only compute once when t_q first filled
                            n_step_reward = sum([
                                t[2] * Config.GAMMA**i
                                for i, t in enumerate(t_q)
                            ])
                        else:
                            n_step_reward = (n_step_reward -
                                             reward_to_sub) / Config.GAMMA
                            n_step_reward += reward * Config.GAMMA**(
                                Config.trajectory_n - 1)
                        t_q[0].extend([
                            n_step_reward, next_state_for_memory, done,
                            t_q.maxlen
                        ])  # actual_n is max_len here
                        #agent.perceive(t_q[0])  # perceive when a transition is completed

                env.render()  # 0313
                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    if DG_flag:
                        t_q.popleft()  # the first transition's n-step return is already set
                        transitions = set_n_step(t_q, Config.trajectory_n)
                    next_obs = np.stack([next_obs]).astype(
                        dtype=np.float32
                    )  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("iteration", iteration, "score", score)
                    break
                else:
                    obs = next_obs
                    state_for_memory = next_state_for_memory
                #print("state_for memory",state_for_memory)
            #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            #
            if sum(rewards) >= 100:

                success_num += 1
                # TODO: store this episode as demonstrations when the return is good enough
                if DG_flag:
                    for t in transitions:
                        agent.perceive(t)
                        agent.replay_memory.memory_len()

                if success_num >= 3:
                    #saver.save(sess, 'trained_models/model.ckpt')
                    #saver.save(sess, 'trained_models/model' + str(index_gail) + '.ckpt')
                    print(success_num)
                    print('Clear!! Model saved.')
                    env.close()
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                #print("training D")
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards,
                                   newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                #print("updating PPO ")
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=32)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
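
Example #2 above keeps a running n-step return in the fixed-length deque `t_q`: once the window is full, the oldest reward is divided out and the newest reward is added at the highest discount power. The standalone sketch below isolates that bookkeeping so the update is easier to follow; `n` and `gamma` stand in for `Config.trajectory_n` and `Config.GAMMA`, and `set_n_step`, which finalizes the transitions left in the window at episode end, is project code that is not reproduced here.

from collections import deque

def incremental_n_step_returns(rewards, n=4, gamma=0.99):
    # Sketch of the sliding-window n-step return used in example #2.
    t_q = deque(maxlen=n)
    n_step_reward = None
    returns = []
    for r in rewards:
        # earliest reward in the window; it is evicted by the append below
        reward_to_sub = 0. if len(t_q) < t_q.maxlen else t_q[0]
        t_q.append(r)
        if len(t_q) == t_q.maxlen:
            if n_step_reward is None:
                # first full window: plain discounted sum
                n_step_reward = sum(t * gamma ** i for i, t in enumerate(t_q))
            else:
                # slide the window: drop the oldest reward, shift the discount
                # down one power, add the newest reward at the highest power
                n_step_reward = (n_step_reward - reward_to_sub) / gamma
                n_step_reward += r * gamma ** (n - 1)
            returns.append(n_step_reward)
    return returns
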
Code example #3
File: run_gail.py  Project: kangyongxin/Backup
def main(args):
    env = gym.make('CartPole-v1')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    #expert_observations = np.genfromtxt('trajectory/observations.csv')
    #expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    with open(Config.DEMO_DATA_PATH, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(itertools.islice(demo_transitions, 0, Config.DEMO_BUFFER_SIZE))
        print("demo_transitions len: ", len(demo_transitions))
    expert_observations = [data[0] for data in demo_transitions]
    expert_actions = [data[1] for data in demo_transitions]

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0
        itera = 0
        scores = []
        for iteration in range(args.iteration):
            observations = []
            actions = []
            # do NOT use rewards to update policy
            rewards = []
            v_preds = []
            run_policy_steps = 0
            score = 0

            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                next_obs, reward, done, info = env.step(act)
                score += reward
                env.render()  # 0313

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    itera += 1
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    print("itera :", itera, "score:{}", score)
                    scores.append(score)
                    break
                else:
                    obs = next_obs

            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]), iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

        #     summary = PPO.get_summary(obs=inp[0],
        #                               actions=inp[1],
        #                               gaes=inp[2],
        #                               rewards=inp[3],
        #                               v_preds_next=inp[4])
        #
        #     writer.add_summary(summary, iteration)
        # writer.close()

    plt.plot(scores, 'r')
    plt.show()
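
The `Discriminator` class used by all three examples comes from the project and is not shown on this page. For orientation only, the sketch below is a hypothetical TF1-style discriminator with the same `train(expert_s, expert_a, agent_s, agent_a)` / `get_rewards(agent_s, agent_a)` interface. It assumes the usual GAIL setup: a binary classifier over (state, action) pairs trained with expert pairs labelled 1 and agent pairs labelled 0, and a surrogate reward log D(s, a) that grows as the agent's behaviour looks more expert-like. The real class may use a different architecture or reward convention.

import tensorflow as tf

class DiscriminatorSketch:
    # Hypothetical GAIL discriminator; D(s, a) = P(expert | s, a).
    def __init__(self, ob_dim, n_actions, hidden=64, lr=1e-4):
        with tf.variable_scope('discriminator_sketch'):
            self.expert_s = tf.placeholder(tf.float32, [None, ob_dim])
            self.expert_a = tf.placeholder(tf.int32, [None])
            self.agent_s = tf.placeholder(tf.float32, [None, ob_dim])
            self.agent_a = tf.placeholder(tf.int32, [None])

            def logits(s, a, reuse):
                with tf.variable_scope('net', reuse=reuse):
                    x = tf.concat([s, tf.one_hot(a, n_actions)], axis=1)
                    h = tf.layers.dense(x, hidden, activation=tf.nn.tanh, name='h1')
                    return tf.layers.dense(h, 1, name='out')

            expert_logits = logits(self.expert_s, self.expert_a, reuse=False)
            agent_logits = logits(self.agent_s, self.agent_a, reuse=True)

            # expert pairs are labelled 1, agent pairs 0
            loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=expert_logits, labels=tf.ones_like(expert_logits)) +
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=agent_logits, labels=tf.zeros_like(agent_logits)))
            self.train_op = tf.train.AdamOptimizer(lr).minimize(loss)
            # surrogate reward handed to PPO in place of the environment reward
            self.rewards = tf.log(tf.clip_by_value(tf.sigmoid(agent_logits), 1e-10, 1.0))

    def train(self, expert_s, expert_a, agent_s, agent_a):
        tf.get_default_session().run(self.train_op, feed_dict={
            self.expert_s: expert_s, self.expert_a: expert_a,
            self.agent_s: agent_s, self.agent_a: agent_a})

    def get_rewards(self, agent_s, agent_a):
        return tf.get_default_session().run(self.rewards, feed_dict={
            self.agent_s: agent_s, self.agent_a: agent_a})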