Example #1
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the marine first
                actions = actAgent2Pysc2(100,obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            #print('episode start')
            global_step = 0
            reward = 0
            while not done: 
                global_step += 1
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action,obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400
                
                if distance < 0.03 or global_step == 200:   # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True
                
                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:   # episode end
                    break
                state = next_state
                pre_distance = distance
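
These examples rely on several helpers (actAgent2Pysc2, no_operation, obs2state, obs2distance) that are never shown. The sketch below is a hypothetical reconstruction for the MoveToBeacon case only, not the original code: the unit-selection logic, the 2-feature (dx, dy) state, and the fixed move offsets are all assumptions (the mineral examples instead use a flattened feature-map state).

import numpy as np
from pysc2.lib import actions, features

_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_PLAYER_SELF = 1      # the marine
_PLAYER_NEUTRAL = 3   # the beacon

def _mean_position(obs, player_value):
    # mean (x, y) screen position of all pixels belonging to player_value
    screen = np.array(obs[0].observation.feature_screen[_PLAYER_RELATIVE])
    ys, xs = (screen == player_value).nonzero()
    return np.array([xs.mean(), ys.mean()]) if len(xs) else np.zeros(2)

def obs2state(obs):
    # 2-feature state: normalized (dx, dy) from the marine to the beacon
    screen_size = np.array(obs[0].observation.feature_screen).shape[-1]
    return (_mean_position(obs, _PLAYER_NEUTRAL) - _mean_position(obs, _PLAYER_SELF)) / screen_size

def obs2distance(obs):
    # normalized Euclidean distance between the marine and the beacon
    return float(np.linalg.norm(obs2state(obs)))

def no_operation(obs):
    return actions.FUNCTIONS.no_op()

def actAgent2Pysc2(action, obs):
    # 100 -> (re)select the marine; 0..3 -> move a fixed offset up/right/down/left
    if action == 100:
        return actions.FUNCTIONS.select_army('select')
    screen_size = np.array(obs[0].observation.feature_screen).shape[-1]
    dx, dy = {0: (0, -8), 1: (8, 0), 2: (0, 8), 3: (-8, 0)}[action]
    target = np.clip(_mean_position(obs, _PLAYER_SELF) + [dx, dy], 0, screen_size - 1)
    return actions.FUNCTIONS.Move_screen('now', target.astype(int))  # function id 331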
Example #2
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='CollectMineralShards',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    done = False
    pre_num_mineral = 20
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 400
        if cmd == 'step':
            if action != 'done':
                available_action, state = obs
                a = actAgent2Pysc2(action, state)
                obs = env.step(actions=[a])
                state = obs2state(obs)
                obs = (obs[0].observation.available_actions, state)

                #reward shaping
                mineral_map = state.reshape(32, 32, 2)
                num_mineral = np.sum(mineral_map[:, :, 1])

                reward = -0.1
                if num_mineral != pre_num_mineral:  # a mineral shard was collected
                    reward = -num_mineral + pre_num_mineral

                if num_mineral <= 2 or global_step == end_step - 1:  # all shards collected, or out of time
                    done = True

                pre_num_mineral = num_mineral
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            pre_num_mineral = 20
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation.available_actions:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = (obs[0].observation.available_actions, state)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
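
For reference, a minimal sketch of how a parent process might drive this worker over a multiprocessing Pipe. The reset/step/close protocol matches the code above, but the policy callable and the episode loop are illustrative assumptions, not part of the original project.

import multiprocessing as mp

def run_episode(policy, end_step=400):
    parent_conn, child_conn = mp.Pipe()
    proc = mp.Process(target=worker, args=(child_conn, False))
    proc.start()

    parent_conn.send(('reset', 0, 0, 0))
    obs, state, _, _, done = parent_conn.recv()

    for global_step in range(end_step):
        action = policy(state)                            # e.g. an index in 0..3
        parent_conn.send(('step', action, obs, global_step))
        obs, state, action, reward, done = parent_conn.recv()
        if done:
            break

    parent_conn.send(('close', 0, 0, 0))
    proc.join()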
Example #3
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            saver = tf.train.Saver()
            saver.restore(sess, './model/gail.cpkt')
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
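
Policy_net, PPOTrain, and Discriminator are defined elsewhere in the project. Below is a minimal TF1 sketch of the Policy_net interface assumed by this and the later PPO/GAIL examples (constructor (name, obs_dim, n_actions); act() returning a sampled action and a value estimate). The layer sizes and network body are illustrative, not the original model.

import numpy as np
import tensorflow as tf

class Policy_net:
    def __init__(self, name, obs_dim, n_actions):
        with tf.variable_scope(name):
            self.obs = tf.placeholder(tf.float32, [None, obs_dim], name='obs')
            hidden = tf.layers.dense(self.obs, 64, tf.nn.relu)
            self.act_probs = tf.layers.dense(hidden, n_actions, tf.nn.softmax)  # policy head
            self.v_preds = tf.layers.dense(hidden, 1)                           # value head
            self.sampled_act = tf.multinomial(tf.log(self.act_probs), num_samples=1)

    def act(self, obs, stochastic=True):
        sess = tf.get_default_session()
        if stochastic:
            act, v = sess.run([self.sampled_act, self.v_preds], feed_dict={self.obs: obs})
        else:
            probs, v = sess.run([self.act_probs, self.v_preds], feed_dict={self.obs: obs})
            act = np.argmax(probs, axis=1)
        return np.ravel(act), np.ravel(v)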
Example #4
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False)
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 400
        if cmd == 'step':
            if action != 'done':
                available_action, state = obs
                a = actAgent2Pysc2(action, state)
                obs = env.step(actions=[a])
                distance = obs2distance(obs)
                state = obs2state(obs)
                obs = (obs[0].observation.available_actions, state)

                #reward shaping

                reward = -0.1
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            pre_num_mineral = 20  # unused in this worker
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation.available_actions:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = (obs[0].observation.available_actions, state)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #5
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=64,
            feature_minimap=64,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if action != 'done':
                #while 331 not in obs[0].observation['available_actions']:   # select the marine
                #    actions = actAgent2Pysc2(100, obs)
                #    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.1
                obs = obs[0].observation.feature_screen.base[4]
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()          # reset the environment
            while 331 not in obs[0].observation['available_actions']:   # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = obs[0].observation.feature_screen.base[4]
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #6
def test():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        #sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, './Move2Beacon(DQN)/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                time.sleep(0.13)
                global_step += 1

                action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                if distance < 0.015 or global_step == 200:  # episode end
                    done = True

                if distance < 0.015 or global_step == 200:  # episode end
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
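
get_copy_var_ops, used here and in the later DQN training example, is not shown. The standard TF1 target-network copy looks like the sketch below, although the original helper may differ in detail.

import tensorflow as tf

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # build ops that assign every trainable variable under src_scope_name
    # to the corresponding variable under dest_scope_name
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder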
Example #7
def worker(remote, visualize):
    env = sc2_env.SC2Env(map_name='MoveToBeacon',
                         step_mul=4,
                         visualize=visualize,
                         screen_size_px=(64, 64),
                         minimap_size_px=(64, 64))
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if action != 'done':
                while 331 not in obs[0].observation['available_actions']:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.01
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation['available_actions']:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #8
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            print('episode start')
            global_step = 0
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.2)
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400

                if distance < 0.03 or global_step == 200:  # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:  # episode end
                    break
                state = next_state
                pre_distance = distance
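
The Actor and Critic classes used here and in Example #1 are not shown; they appear to implement one-step TD actor-critic. The snippet below only illustrates the TD error they exchange (critic.learn returns it, actor.learn scales the policy gradient by it); the discount factor is an assumption.

GAMMA = 0.9  # assumed discount factor

def td_error(reward, value_s, value_s_next, done=False):
    # delta = r + gamma * V(s') - V(s)
    # the critic regresses V(s) toward the target; the actor weights
    # grad log pi(a|s) by delta
    target = reward + (0.0 if done else GAMMA * value_s_next)
    return target - value_s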
Example #9
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon",
                        step_mul=step_mul,
                        screen_size_px=(16, 16),
                        minimap_size_px=(16, 16)) as env:
        Policy = Policy_net('policy', 16 * 16 * 2, 4)
        Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = np.array(obs2state(obs))
                print('episode start')
                global_step = 0
                reward = 0

                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                while not done:
                    global_step += 1
                    time.sleep(0.05)

                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])

                    for i in range(1):
                        actions = no_operation(obs)
                        obs = env.step(actions=[actions])
                    distance = obs2distance(obs)
                    if global_step == 1:
                        pre_distance = distance
                    next_state = np.array(obs2state(obs))
                    reward = -10 * (distance - pre_distance)
                    #if reward < 0 :
                    #    reward = -0.01
                    #if reward <= 0:
                    #    reward = 0
                    #elif reward > 0:
                    #    reward = 0
                    reward = -0.01  # constant step penalty (overrides the shaped reward above)
                    if distance < 0.03 or global_step == 100:  # episode end
                        if distance < 0.03:
                            reward = 1
                        if global_step == 100:
                            reward = -1
                        done = True

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if distance < 0.03 or global_step == 100:  # episode end
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards,
                                            v_preds=v_preds,
                                            v_preds_next=v_preds_next)
                        observations = np.reshape(observations,
                                                  newshape=[-1, 16 * 16 * 2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(dtype=np.float32)
                        v_preds_next = np.array(v_preds_next).astype(
                            dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())

                        PPO.assign_policy_parameters()

                        inp = [
                            observations, actions, rewards, v_preds_next, gaes
                        ]
                        for epoch in range(5):
                            sample_indices = np.random.randint(
                                low=0, high=observations.shape[0],
                                size=64)  # indices are in [low, high)
                            sampled_inp = [
                                np.take(a=a, indices=sample_indices, axis=0)
                                for a in inp
                            ]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])
                        print(episodes, global_step)
                        break
                    state = next_state
                    pre_distance = distance
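
PPO.get_gaes is not shown. A minimal sketch of generalized advantage estimation consistent with how it is called above (rewards, v_preds, v_preds_next) follows; the gamma and lambda values are assumptions.

def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD errors
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    # discounted cumulative sum of the TD errors, computed backwards
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes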
Example #10
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:
        r = tf.placeholder(tf.float32)  ########
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()  ########
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail',
                                           sess.graph)  ########
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)
                for i in range(2):
                    sample_indices = (np.random.randint(
                        expert_observations.shape[0],
                        size=observations.shape[0]))
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)
                d_rewards = D.get_rewards(agent_s=observations,
                                          agent_a=actions_list)
                d_rewards = np.reshape(d_rewards,
                                       newshape=[-1]).astype(dtype=np.float32)

                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                inp = [
                    observations, actions_list, gaes, d_rewards, v_preds_next
                ]
                PPO.assign_policy_parameters()
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])
                summary = sess.run(merged, feed_dict={r: global_step})
                writer.add_summary(summary, episodes)
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.cpkt')
                    print('save model')
                    break
                print(episodes, global_step, c)
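
D.get_rewards is not shown. In standard GAIL the discriminator output is turned into a surrogate reward that is high when a state-action pair looks expert-like, along the lines of the sketch below; the exact form used by this Discriminator class is an assumption.

import numpy as np

def gail_reward(d_expert_prob):
    # d_expert_prob: discriminator's probability that (s, a) came from the
    # expert trajectories; clipped so the log never sees exactly 0
    return np.log(np.clip(d_expert_prob, 1e-10, 1.0))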
Example #11
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        replay_buffer = deque(maxlen=1000)
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        #saver.restore(sess, './Move2Beacon/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.05)
                if np.random.rand() < e:
                    random_rate += 1
                    action = random.randrange(4)
                else:
                    action = np.argmax(mainDQN.predict(state))
                #action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                #print(reward)
                if distance < 0.03 or global_step == 200:  # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                #print(next_state, reward)
                replay_buffer.append((state, action, reward, next_state, done))

                if distance < 0.03 or global_step == 200:  # episode end
                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                        sess.run(copy_ops)
                        print('model trained')
                        saver.save(sess, './Move2Beacon/model.cpkt')
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
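
replay_train is not shown. Below is a typical implementation consistent with how it is used above: a minibatch of (state, action, reward, next_state, done) tuples, returning (loss, _). It assumes the DQN class exposes batched predict() and update() methods, and the discount factor is an assumption.

import numpy as np

DISCOUNT_RATE = 0.99  # assumed

def replay_train(mainDQN, targetDQN, train_batch):
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    dones = np.array([x[4] for x in train_batch])

    # bootstrap from the target network, zeroing the value on terminal steps
    targets = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states), axis=1) * ~dones

    y = mainDQN.predict(states)
    y[np.arange(len(states)), actions] = targets
    return mainDQN.update(states, y)  # assumed to return (loss, train op result)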
Example #12
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="CollectMineralShards", step_mul=step_mul,
                        screen_size_px=(32, 32), minimap_size_px=(32, 32)) as env:
        Policy = Policy_net('policy', 32*32*2, 4)
        Old_Policy = Policy_net('old_policy', 32*32*2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            print('a')
            saver.restore(sess, './model/model.ckpt')
            print('a')
            #sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                actions = gather(obs)
                obs = env.step(actions=[actions])
                end_step = 200
                global_step = 0
                score = 0
                reward = 0
                for i in range(100):
                    time.sleep(0.01)
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])       
                state = obs2state(obs)
                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                print('episode start')
                while not done:
                    global_step += 1
                    time.sleep(0.05)
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    #while not 331 in obs[0].observation["available_actions"]:
                    #    actions = actAgent2Pysc2(100, obs)
                    #    obs = env.step(actions=[actions])
                    obs = env.step(actions=[actions])
                    
                    if global_step == end_step or obs2done(obs) >= 1900:    # the episode ends when time runs out or all the minerals have been collected
                        done = True
                    
                    next_state = obs2state(obs)
                    reward = obs[0].reward

                    if reward == 0:
                        reward = -0.1

                    if done:
                        if obs2done(obs) >= 1900:   # episode ended with all minerals collected
                            reward = 3
                        else:                       # episode ended without collecting all minerals
                            reward = -3

                    score += reward

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if done:   # episode end
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)
                        observations = np.reshape(observations, newshape=[-1, 32*32*2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(np.float32)
                        v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())
                        PPO.assign_policy_parameters()
                        inp = [observations, actions, rewards, v_preds_next, gaes]
                        for epoch in range(5):
                            sample_indices = np.random.randint(low=0, high=observations.shape[0], size=64)
                            sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                    actions=sampled_inp[1],
                                    rewards=sampled_inp[2],
                                    v_preds_next=sampled_inp[3],
                                    gaes=sampled_inp[4])
                        print(episodes, score)
                        save_path = saver.save(sess, './model/model.ckpt')
                        if episodes == 0:
                            f = open('test2.csv', 'w', encoding='utf-8', newline='')
                        else:
                            f = open('test2.csv', 'a', encoding='utf-8', newline='')
                        wr = csv.writer(f)
                        wr.writerow([episodes, score])
                        f.close()
                        break
                    state = next_state