Example #1
def test():
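    # roll out the trained policy for 1000 steps and plot the critic's value estimate at each visited state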
    env = gym.make(GAME).unwrapped

    agent = PPO(state_space=S_DIM, action_space=A_DIM, max_episode_num=EP_MAX, episode_lens=EP_LEN,
                discount_factor=GAMMA, actor_learning_rate=A_LR, critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE, epochs=EPOCHS)

    agent.load_weights(SAVE_INDEX)
    # reset the environment and grab the initial observation
    state = env.reset()
    print(state)
    steps = 0
    episode_r = 0
    all_value = []

    while steps < 1000:
        # env.render()
        # get action
        action = agent.choose_action(state)
        # execute one action
        state_after_action, reward, done, _ = env.step(action)
        steps += 1
        episode_r += reward
        state = state_after_action
        state_value = agent.get_value(state)
        all_value.append(state_value)
        if done:
            # start a new episode so the rollout keeps going for the full 1000 steps
            state = env.reset()

    plt.plot(np.arange(len(all_value)), all_value)
    plt.xlabel('step')
    plt.ylabel('state value')

    # save the figure before plt.show(), otherwise it may already be cleared
    create_path('weights/'+SAVE_INDEX+'/figure')
    plt.savefig('weights/'+SAVE_INDEX+'/figure/fig.png')
    plt.show()

    print("test 1000 steps, got reward: %i" % episode_r)
Example #2
from ppo import PPO
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from gym_unity.envs import UnityEnv

GameDir = 'test.app'
multi_thread = True
if multi_thread:
    modelPath = "multi_thread_Model/"
else:
    modelPath = "single_thread_Model/"

MAXEP = 1000
if __name__ == "__main__":
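    # launch the Unity build as a gym-style environment; use_visual=True returns image observations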
    env = UnityEnv(GameDir, 0, use_visual=True)
    ppo = PPO(env, load=True, testing=True, ModelPath=modelPath)
    for ep in range(MAXEP):
        s = env.reset()
        ep_r = 0
        done = False
        while not done:
            env.render()
            a, v = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            s = s_
            ep_r += r
        print("episode = {}, ep_r = {}".format(ep, ep_r))
Example #3
class Agent:
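    # Controls a single SUMO traffic light via traci: four green phases plus their yellow transition phases.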
    def __init__(self, traci, is_not_train=False):
        self.controlTLIds = traci.trafficlight.getIDList()  # tuple ('0',)
        self.controlTLIds = self.controlTLIds[0]  # string '0'
        self.phaseDefs = ['GrrrGrrr', 'rGrrrGrr', 'rrGrrrGr', 'rrrGrrrG']
        self.yelloPhases = ['yrrryrrr', 'ryrrryrr', 'rryrrryr', 'rrryrrry']
        action_mask = [1, 1, 1, 1]
        self.detectorIDs = traci.inductionloop.getIDList()
        self.controlLanes = get_laneID(self.detectorIDs)
        self.reset()
        state_size = len(self.state)
        self.learner = Learner(state_size, action_mask, is_not_train)
        return

    # reset before each epoch
    def reset(self):
        self.prev_avg_wait_time = 0
        self.totalReward = 0
        self.totalCost = 0
        self.state = self._get_state()
        self.action = 1

    def _get_state(self):
        # build the state vector for the network: queue state plus head-vehicle waiting times of the controlled lanes
        state = get_queue_state(self.controlLanes) + get_head_waiting_time(
            self.controlLanes)
        state = np.array(state)
        return state

    def setRYG(self, index, is_yello):
        light = self.controlTLIds
        if is_yello:
            ryg = self.yelloPhases[index]
        else:
            ryg = self.phaseDefs[index]
        traci.trafficlight.setRedYellowGreenState(light, ryg)

    def getPhase(self):
        light = self.controlTLIds
        phase_state = traci.trafficlight.getRedYellowGreenState(light)
        phase = 0  # fall back to phase 0 if the current state matches none of the known phases
        for i in range(len(self.phaseDefs)):
            if phase_state == self.phaseDefs[i] or phase_state == self.yelloPhases[i]:
                phase = i
                break
        return phase

    def agentAct(self):
        self.currentPhase = self.getPhase()
        self.action = self.learner.choose_action(self.state)
        return self.action, self.currentPhase

    def agentCulReward(self, is_sim=False):
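        # the reward is the drop in average waiting time since the previous decision step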
        next_state = self._get_state()

        avg_wait_time = get_avarage_waiting_time(self.controlLanes)
        reward = self.prev_avg_wait_time - avg_wait_time
        self.prev_avg_wait_time = avg_wait_time

        self.totalReward += reward
        self.totalCost += avg_wait_time

        if not is_sim:
            self.learner.learn(self.state, self.action, reward, next_state)

        self.state = next_state
        return reward, avg_wait_time

    def printLog(self, outfile, simulation):
        print("************agent:simulation output*********")
        print("Simulation {} group {}: totalCost {}, totalReward {}\n".format(
            simulation, self.controlTLIds, self.totalCost, self.totalReward))
        outfile.write(
            "Simulation {} group {}: totalCost {}, totalReward {}\n".format(
                simulation, self.controlTLIds, self.totalCost,
                self.totalReward))
        outfile.flush()
Example #4

    defender_rewards = []
    attacker_rewards = []
    state = field.reset()  # (1, s_dim)
    defender.init_ac(np.array([state]))
    attacker.init_ac(np.array([state]))
    running_mean_a = []
    running_mean_d = []
    all_rewards_a = []
    all_rewards_d = []

    for i in range(100):
        # fix defender, train attacker
        print("Training attacker ... iter", i + 1)
        for ep in range(MAX_EP_A):
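            # the defender moves first, then the attacker; the defender is additionally penalized by whatever reward the attacker collects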
            state = field.get_state()
            action_d, prob_d = defender.choose_action(np.array([state]))
            pun_d = field.defense(action_d)
            r_d = -pun_d

            action_a, prob_a = attacker.choose_action(np.array([state]))
            reward_a, pun_a = field.attack(action_a)
            r_a = reward_a - pun_a
            r_d -= reward_a

            attacker_memory.memorize(state, action_a, r_a, prob_a, True, None)
            running_mean_d.append(r_d)
            running_mean_a.append(r_a)

            state = field.next()

            if (ep + 1) % BATCH_SIZE == 0 or ep == MAX_EP_A - 1:
Example #5
from ppo import PPO
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
multi_thread = True
if multi_thread:
    modelPath = "multi_thread_Model/"
else:
    modelPath = "single_thread_Model/"

MAXEP = 1000
if __name__ == "__main__":
    ppo = PPO(None, load=True, testing=True, ModelPath=modelPath)
    for ep in range(MAXEP):
        s = 1  # placeholder for the first screen image
        ep_r = 0
        done = False
        while not done:
            # a == 1 press D, a == 2 press F, a == 0 do nothing
            a, v = ppo.choose_action(s)
            s_ = 2  # placeholder: press the button and read back the next screen image
            s = s_
            # in the real loop the game would also supply the reward and the done flag here
Example #6
def train(nameIndx):
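    # train the PPO agent for one side (nameIndx: 0 = right, 1 = left), synchronizing with the other side via the global l_run/r_run flags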
    global l_run, r_run
    T_REWARD = []
    MU_REWARD = 0
    BEST_R = 0
    env = Test(nameIndx)  #0 = right

    # agent = DDPG(a_dim, s_dim, a_bound, SIDE[nameIndx])
    agent = PPO(act_dim=7,
                obs_dim=39,
                lr_actor=0.0001,
                lr_value=0.0002,
                gamma=0.9,
                clip_range=0.2,
                name=SIDE[nameIndx])

    var = 0.8  # control exploration
    rar = 0.3
    cnt = 0

    if nameIndx == 0:
        r_run = False
    elif nameIndx == 1:
        l_run = False
    while r_run or l_run:
        time.sleep(0)
    time.sleep(0.5)

    t1 = time.time()
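    # busy-wait until a 5-second wall-clock boundary, presumably so both sides start training at the same moment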
    while int(time.time()) % 5 != 0:
        time.sleep(0)
    for i in range(MAX_EPISODES):
        if nameIndx == 0:
            r_run = False
        elif nameIndx == 1:
            l_run = False
        while r_run or l_run:
            time.sleep(0)

        s = env.reset()

        time.sleep(0.1)
        if nameIndx == 0:
            r_run = True
        elif nameIndx == 1:
            l_run = True

        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            a, neglogp, _ = agent.choose_action(s)
            # a = np.clip(np.random.normal(a, var), -1, 1)    # add randomness to action selection for exploration
            s_, r, done, info = env.step(a)
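            # (r + 8) / 8 below rescales the reward, a normalization commonly seen in Pendulum-style examples where rewards lie in roughly [-16, 0]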

            agent.memory.store_transition(s, a, (r + 8) / 8, neglogp)
            if (j + 1) % 32 == 0 or j == MAX_EP_STEPS - 1:
                _, _, last_value = agent.choose_action(s_)
                agent.learn(last_value, done, cnt)
            s = s_
            ep_reward += r
            cnt += 1

        # running mean over the last (up to) 100 episodes
        if len(T_REWARD) >= 100:
            T_REWARD.pop(0)
        T_REWARD.append(ep_reward)
        MU_REWARD = sum(T_REWARD) / len(T_REWARD)
        BEST_R = max(BEST_R, MU_REWARD)
        print('Episode:', i, 'Reward: %i' % int(ep_reward), 'MU_REWARD:',
              int(MU_REWARD), 'BEST_R:', int(BEST_R), 'steps:', j)
        if MU_REWARD > GOAL_REWARD:
            break

    if os.path.isdir(agent.path): shutil.rmtree(agent.path)
    os.mkdir(agent.path)
    ckpt_path = os.path.join(agent.path, 'DDPG.ckpt')
    save_path = agent.saver.save(agent.sess, ckpt_path, write_meta_graph=False)
    print("\nSave Model %s\n" % save_path)
    print('Running time: ', time.time() - t1)