Example 1
    def reset(self):
        if self.gif:
            self.save_gif()

        # If pixel input, we reset the image buffer with random states
        if self.pixel_input:
            self.env = game.GameState(1, False)
            for i in range(4):
                frame, r, d = self.env.frame_step([1, 0], render=self.render)
                self.frame_buffer.append(self.process(frame))
            return np.transpose(self.frame_buffer, (1, 2, 0))
        else:
            self.env = game.GameState(1, False)
            s, r, d = self.env.frame_step([1, 0], render=self.render)
            return s
Example 2
    def train(self, episode, batch_size=64, freq=100):
        self.batch_size = batch_size
        tqdm_e = tqdm(range(episode))
        env = game.GameState()

        for i in tqdm_e:
            state = env.reset()
            cum_r = 0
            done = False
            while not done:
                STATUS = "explore"
                state_newaxis = state[np.newaxis, :]
                action = self.agent.e_greedy_action(state_newaxis)
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_state, reward, done = env.step(action_array)
                action_onehot = to_categorical(action, self.n_action)
                ob = (state, reward, done, action_onehot, next_state)
                self.sampling_pool.add_to_buffer(ob)
                state = next_state
                cum_r += reward

                if (self.sampling_pool.get_size() > self.batch_size):
                    self.train_agent()
                    STATUS = "train"
                    if i % freq == 0:
                        self.agent.transfer_weights()
                        STATUS = "transfer weights"
            self.cum_r.append(cum_r)
            if (i > 10000) & (not (i % 10000)):
                self.save_model(f"{i}-eps-.h5")
            tqdm_e.set_description("Score: " + str(cum_r) + "\n Status: " +
                                   STATUS)
            tqdm_e.refresh()
        self.save_model(f"final-{i}-eps-.h5")
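Note on Example 2: the sampling_pool object used above is not shown; only add_to_buffer and get_size are called, and train_agent() presumably draws minibatches from it. A minimal sketch of such a buffer, assuming a hypothetical sample(batch_size) method for the training step:

import random
from collections import deque


class SamplingPool:
    """Minimal replay-buffer sketch matching the calls used in Example 2 (assumed interface)."""

    def __init__(self, maxlen=50000):
        # transitions are stored as (state, reward, done, action_onehot, next_state)
        self.buffer = deque(maxlen=maxlen)

    def add_to_buffer(self, ob):
        self.buffer.append(ob)

    def get_size(self):
        return len(self.buffer)

    def sample(self, batch_size):
        # hypothetical helper: uniform random minibatch for train_agent()
        return random.sample(list(self.buffer), batch_size)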
Example 3
 def test_Network(self):
     # open up a game state to communicate with the emulator
     game_state = game.GameState()
     # get the first state and preprocess the image
     do_nothing = np.zeros(ACTIONS)
     do_nothing[0] = 1
     # interact with the game once
     x_t, r_0, terminal = game_state.frame_step(do_nothing)
     x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
     ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
     s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
     # start training
     epsilon = 0
     t = 0
     while "flappy bird" != "angry bird":
         a_t = self.epsilon_greedy(s_t, 0.0)
         # execute the action and interact with the game environment once
         x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
         x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                             cv2.COLOR_BGR2GRAY)
         ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
         x_t1 = np.reshape(x_t1, (80, 80, 1))
         s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
         # advance one step
         s_t = s_t1
Example 4
    def play_game(self):
        """
            This method trains the model to play Flappy Bird.
            
            TODO: Insert more docs
        """

        #1. open up a game state to communicate with emulator
        flappy_bird = game.GameState()

        # get the first state by doing nothing.
        do_nothing = np.zeros(ACTIONS)
        do_nothing[0] = 1

        x_t, r_0, terminal = flappy_bird.frame_step(do_nothing)
        #run the selected action and observed next state and reward
        self.current_state = self.pre_process_state(x_t)

        while True:
            action, action_index = self.get_action()
            next_state, reward, terminal = flappy_bird.frame_step(action)
            next_state = self.scale_down_image(next_state)
            next_state = next_state.reshape(1, next_state.shape[0],
                                            next_state.shape[1], 1)  #1x84x84x1
            #print(type(next_state))
            self.experience_env(next_state, action_index, reward, terminal)
Example 5
    def train(self, episode, sampling_pool=sampling_pool):

        with graph.as_default():
            tqdm_e = tqdm(range(episode))
            for i in tqdm_e:
                env = game.GameState()
                state = env.reset()
                cum_r = 0
                done = False
                while not done:

                    state = im_processor(state)
                    state_newaxis = state[np.newaxis, :]
                    action = self.actor.explore(state_newaxis)
                    action_array = np.array([0, 0])
                    action_array[action] = 1
                    next_state, reward, done = env.step(action_array)
                    action_onehot = to_categorical(action, self.n_action)
                    ob = (state, reward, done, action_onehot, next_state)
                    sampling_pool.add_to_buffer(ob)
                    state = next_state
                    cum_r += reward

                self.update(sampling_pool)
                self.cum_r.append(cum_r)
                tqdm_e.set_description("Score: " + str(cum_r))
                tqdm_e.refresh()
                if (i > 10000) & (not (i % 10000)):
                    self.save_model(f"{i}-eps-.h5")
                del env
            self.save_model(f"final-{i}-eps-.h5")
Example 6
def init_flappybird():
    env = game.GameState()
    x_t, r_0, terminal = env.step(0)
    x_t = x_t.reshape(x_t.shape[1], x_t.shape[2])
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
    return env, s_t
Example 7
    def __init__(self, model, path):
        """
        self.net            the instantiated model
        self.path           path where the model is saved
        self.game_state     the game state
        self.batch_s_t      batch of image pixels at time t
        self.batch_s_t1     batch of image pixels at time t+1
        self.batch_a_t      batch of actions at time t
        self.batch_r        batch of rewards at time t
        self.y_batch        reward at time t plus the maximum model output times the discount factor, i.e. the target q-value
        self.s_t            image pixels at time t
        self.loss_data      value of the loss
        self.readout_t      model output at time t
        self.r_t            reward at time t
        self.s_t1           image pixels at time t+1
        self.action_index   index of the chosen action
        self.t              step counter, records how many steps have been run
        self.loss_function  loss function
        self.optimizers     optimizer
        self.epsilon        exploration coefficient (epsilon)
        self.D              replay data queue
        self.load(path)     load the model
        self.observe        number of observation steps
        self.explore        number of exploration steps
        self.cuda           whether to use CUDA

        :param model:
        """
        self.net = model()
        self.path = path
        self.game_state = game.GameState()
        self.batch_s_t = np.zeros([Batch, Channel, Width, High])
        self.batch_s_t1 = np.zeros([Batch, Channel, Width, High])
        self.batch_a_t = np.zeros([Batch, Actions])
        self.batch_r = np.zeros([Batch])

        self.s_t = self.get_state()
        self.loss_data = 0
        self.readout_t = None
        self.r_t = None
        self.s_t1 = None
        self.y_batch = np.zeros([Batch])
        self.action_index = 0
        self.t = 0
        # self.death = 0
        self.loss_function = nn.MSELoss()
        self.optimizers = optim.Adam(params=self.net.parameters(), lr=1e-8)
        self.epsilon = Initial_epsilon
        self.D = deque()
        self.load(path)
        self.observe = self.t + Observe
        self.explore = self.t + Explore
        self.cuda = False
        if torch.cuda.is_available():
            self.cuda = True
            self.net = self.net.cuda()
        print("begin time", time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime()))
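Note on Example 7: self.get_state() and self.load(path) are defined elsewhere in that project. A plausible sketch of get_state, assuming the same 4-frame stacking used in the other examples and an assumed preprocess helper; the constant names follow the ones used above, but the details are assumptions:

import numpy as np


def get_state(self):
    # take one "do nothing" step and build the initial channel-first state
    do_nothing = np.zeros(Actions)
    do_nothing[0] = 1
    frame, _, _ = self.game_state.frame_step(do_nothing)
    frame = preprocess(frame)  # assumed helper returning a (Width, High) frame
    # stack the same frame Channel times along axis 0: shape (Channel, Width, High)
    return np.stack([frame] * Channel, axis=0)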
Example 8
    def train_Network(self, experience_buffer):
        # open up a game state to communicate with the emulator
        game_state = game.GameState()
        # get the first state and preprocess the image
        do_nothing = np.zeros(ACTIONS)
        do_nothing[0] = 1
        # interact with the game once
        x_t, r_0, terminal = game_state.frame_step(do_nothing)
        x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        # start training
        epsilon = INITIAL_EPSILON
        t = 0
        while "flappy bird" != "angry bird":
            a_t = self.epsilon_greedy(s_t, epsilon=epsilon)
            # anneal epsilon
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
            # execute the action and interact with the game environment once
            x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
            # store the transition in the experience pool
            experience = np.reshape(np.array([s_t, a_t, r_t, s_t1, terminal]), [1, 5])
            print("experience", r_t, terminal)
            experience_buffer.add_experience(experience)
            # train once the observation phase is over
            if t > OBSERVE:
                # sample a minibatch
                train_s, train_a, train_r, train_s_, train_terminal = experience_buffer.sample(BATCH)
                target_q = []
                read_target_Q = self.sess.run(self.Q_, {self.obs_: train_s_})
                for i in range(len(train_r)):
                    if train_terminal[i]:
                        target_q.append(train_r[i])
                    else:
                        target_q.append(train_r[i] + GAMMA * np.max(read_target_Q[i]))
                print(target_q)
                # one training step
                self.sess.run(self.q_train_op, feed_dict={self.obs: train_s, self.action: train_a, self.Q_target: target_q})
                # update the old target network
                # if t % 1000 == 0:
                self.sess.run(self.update_oldq_op)
            # advance one step
            s_t = s_t1
            t += 1
            # save every 10000 iterations
            if t % 10000 == 0:
                self.save_model('saved_networks/', global_step=t)

            if t <= OBSERVE:
                print("OBSERVE", t)
            else:
                if t % 1 == 0:
                    print("train, steps", t, "/epsilon", epsilon, "/action_index", a_t, "/reward", r_t)
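Note on Examples 3 and 8: both call self.epsilon_greedy(s_t, epsilon), which is not shown. A minimal sketch of what such a method might look like, assuming an online Q output self.Q fed through self.obs (as the feed_dict in Example 8 suggests) and returning a one-hot action vector:

import random

import numpy as np


def epsilon_greedy(self, s_t, epsilon):
    # sketch only: self.Q and self.obs are assumed from the feed_dict used in train_Network
    a_t = np.zeros(ACTIONS)
    if random.random() <= epsilon:
        # explore: pick a random action
        action_index = random.randrange(ACTIONS)
    else:
        # exploit: pick the action with the highest predicted Q value
        q_values = self.sess.run(self.Q, {self.obs: s_t[np.newaxis, :]})
        action_index = np.argmax(q_values[0])
    a_t[action_index] = 1
    return a_t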
Example 9
def play_game(options):
    """Play flappy bird with pretrained dqn model

       weight -- model file name containing weight of dqn
       best -- if the model is best or not
    """
    model = QNetwork()
    if options.ckpt_path is None:
        print('you should give a weight file name.')
        return
    print('load previous model weight: {}'.format(options.ckpt_path))
    episode, epsilon = load_checkpoint(options.ckpt_path, model)

    if options.cuda:
        model = model.cuda()

    algorithm = DQN(model, optim, epsilon, options)

    algorithm.set_eval()
    bird_game = game.GameState()
    bird_game.FPS = 480

    action = [1, 0]
    o, r, terminal = bird_game.frame_step(action)
    o = preprocess(o)

    rpm = ReplayMemory(1, options)
    rpm.append(o, action, r, terminal)

    start = time.time()
    fc = 0
    score = 0
    while True:
        prev_o, a, r, o, terminal = rpm.sample(1)

        # q = algorithm(o).cpu().detach().numpy()[0]

        score = max(score, bird_game.score)
        action = algorithm.get_optim_action(o)
        o, r, terminal = bird_game.frame_step(action)
        
        o = preprocess(o)

        # img = Image.fromarray((o*255).astype(np.uint8)).convert(mode='L')
        # img.save(f'{fc}-{r}-{q.argmax()}.png')
        # fc += 1
        if terminal or score > options.max_score*2:
            break

        rpm.append(o, action, r, terminal)

    ela = time.time() - start
    print(f'Final Score {score}, FPS {bird_game.FPS}, {int(ela // 60)}m{int(ela % 60)}s')
    

# if __name__ == "__main__":
#     main()
Example 10
    def reset(self):
        self.env = game.GameState(1, False)

        # Reset the frame buffer with FRAME_BUFFER_SIZE frames
        for _ in range(FRAME_BUFFER_SIZE):
            frame, r, done = self.env.frame_step(onehot(0))
            self.frame_buffer.append(frame)

        return self._convert_process_buffer()
Example 11
def init():
    flappyBird = game.GameState()

    init_action = torch.IntTensor([1, 0])
    init_observation, _, _ = flappyBird.frame_step(init_action)

    init_observation = preprocess(init_observation)
    brain = RL_Brain(init_observation, INITIAL_EPSILON, TRAIN)
    return brain, flappyBird
Example 12
def play1(rl, score_graph_path, IMAGE_WIDTH, IMAGE_HEIGHT, finish_episode):
    from game import wrapped_flappy_bird as fb
    import numpy as np
    env = fb.GameState()

    # first action [1,0], choose do nothing
    do_nothing = np.zeros(rl.action_cnt)
    do_nothing[0] = 1

    img, r_0, terminal = env.frame_step(do_nothing)

    # image preprocessing
    img = resize_gray_binary(img, IMAGE_WIDTH, IMAGE_HEIGHT)
    s_t = np.stack((img, img, img, img), axis=2)

    episode = 0
    score_hist = []
    while True:
        # rl choose action based on current state
        a_t = rl.choose_action(s_t)

        # rl take action and get next image and reward
        img, r_t, terminal = env.frame_step(a_t)

        if r_t == 1:
            rl.score_per_episode += 1
            print(rl.score_per_episode)
        if terminal:
            episode += 1
            rl.score_per_episode = round(rl.score_per_episode, 3)
            summary, summary_score = rl.sess.run(
                [rl.summary_score, rl.score],
                feed_dict={rl.score: rl.score_per_episode})
            rl.writer.add_summary(summary, episode)
            score_hist.append(rl.score_per_episode)
            rl.score_per_episode = 0.0
            if episode >= finish_episode:
                break

        img = resize_gray_binary(img, IMAGE_WIDTH, IMAGE_HEIGHT)
        img = np.reshape(img, (IMAGE_WIDTH, IMAGE_HEIGHT, 1))
        s_t1 = np.append(img, s_t[:, :, :3], axis=2)

        # swap observation
        s_t = s_t1

    max_score = max(score_hist)
    min_score = min(score_hist)
    aver_score = np.average(score_hist)
    std_deviation = np.std(score_hist)
    with open(score_graph_path + 'result.txt', 'w') as f:
        f.write('%s\n' % score_hist)
        f.write('max: %d\n' % max_score)
        f.write('min: %d\n' % min_score)
        f.write('average: %f\n' % aver_score)
        f.write('std deviation: %f\n' % std_deviation)
Example 13
    def train(self, episode):

        with graph.as_default():
            tqdm_e = tqdm(range(episode))
            env = game.GameState()
            s = deque()
            a = deque()
            r = deque()
            d = deque()
            next_s = deque()
            for i in tqdm_e:
                state = env.reset()
                cum_r = 0
                done = False
                # state = np.squeeze(im_processor(state))
                state_stack = np.stack([state for i in range(STACK_NUM)],
                                       axis=2)

                while not done:
                    state_newaxis = state_stack[np.newaxis, :]
                    action = self.actor.explore(state_newaxis)
                    action_array = np.array([0, 0])
                    action_array[action] = 1
                    next_im, reward, done = env.step(action_array)
                    # next_im = im_processor(next_im)
                    next_state_stack = np.append(next_im,
                                                 state_stack[..., :-1],
                                                 axis=2)
                    action_onehot = to_categorical(action, self.n_action)

                    s.append(state_stack)
                    a.append(action_onehot)
                    r.append(reward)
                    d.append(done)
                    next_s.append(next_state_stack)
                    state_stack = next_state_stack
                    cum_r += reward

                self.cum_r.append(cum_r)
                tqdm_e.set_description("Score: " + str(cum_r))
                tqdm_e.refresh()

                # train

                self.update(s, r, d, a, next_s)
                s = deque()
                a = deque()
                r = deque()
                d = deque()
                next_s = deque()

                if (i > 10000) & (not (i % 50000)):
                    self.save_model(f"{i}-eps-.h5")

            self.save_model(f"final-{i}-eps-.h5")
Example 14
    def __init__(self):

        self.env = game.GameState(1, False)
        self.pixel_input = hasattr(Settings, 'CONV_LAYERS')

        self.frame_buffer = deque(maxlen=4)

        self.render = False
        self.gif = False
        self.name_gif = 'save_'
        self.n_gif = {}
        self.images = []
Example 15
def q_learning(mode, filename=None):

    if mode == 'test':
        TOTAL_OBSERVATION = 1_000
    else:
        TOTAL_OBSERVATION = 3_200

    observe = TOTAL_OBSERVATION
    epsilon = INITIAL_EPSILON

    # init network
    network = init_network(observe, epsilon, mode, filename)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    queue = deque(maxlen=REPLAY_MEMORY)

    s_t0 = get_init_stack(game_state)

    t = 0
    time0 = time.time()
    total_loss = 0
    while (True):
        action_index, r_t = 0, 0
        a_t = np.zeros([ACTIONS])
        action_index = chose_action(network, s_t0, a_t, t, epsilon)
        a_t[action_index] = 1

        # We reduced the epsilon gradually
        if epsilon > FINAL_EPSILON and t > observe:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / TOTAL_EXPLORE

        s_t1, r_t, terminal = get_next_stack(game_state, a_t, s_t0)

        queue.append((s_t0, action_index, r_t, s_t1, terminal))

        if t > observe:
            # only train if done observing
            loss, q_sa = train_network(queue, network)
        else:
            loss, q_sa = 0, 0

        total_loss += loss
        s_t0, t = s_t1, t + 1

        logging(mode, t, time0, network, observe, epsilon, action_index, r_t, q_sa, loss, total_loss, TOTAL_EXPLORE)

    print("Episode finished!")
    print("************************")
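Note on Example 15: get_init_stack, get_next_stack, chose_action, train_network and logging are helpers defined elsewhere. As one illustration, a sketch of what get_next_stack plausibly does, reusing the frame-stacking pattern from Examples 3 and 8 (the preprocessing details are assumptions):

import cv2
import numpy as np


def get_next_stack(game_state, a_t, s_t0):
    # take one environment step
    x_t1, r_t, terminal = game_state.frame_step(a_t)
    # assumed preprocessing: resize to 80x80, grayscale, binary threshold
    x_t1 = cv2.cvtColor(cv2.resize(x_t1, (80, 80)), cv2.COLOR_BGR2GRAY)
    _, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
    x_t1 = np.reshape(x_t1, (80, 80, 1))
    # push the new frame onto the 4-frame stack, dropping the oldest frame
    s_t1 = np.append(x_t1, s_t0[:, :, :3], axis=2)
    return s_t1, r_t, terminal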
Example 16
def main(train=False, eval=False):
    game_state = game.GameState()

    bot = gamebot(get_model(), train)

    next_state, reward, terminal = game_state.frame_step(bot.NOTHING)

    next_state = bot.image_preprocessing(next_state)
    state = np.stack((next_state, next_state, next_state, next_state), axis=2)
    state = np.reshape(state, (1, *state.shape))

    if eval:
        results = []
        local_count = 0

    while True:
        action, action_index = bot.make_action(state)

        next_state, reward, terminal = game_state.frame_step(action)

        next_state = bot.image_preprocessing(next_state)
        next_state = next_state.reshape(1, *next_state.shape, 1)
        next_state = np.append(next_state, state[:, :, :, :3], axis=3)

        if train:
            bot.make_buffer(state, action_index, reward, next_state, terminal)
            train_index, loss = bot.make_train()
            print('Epoch: {} - loss: {}'.format(train_index, loss))
            if train_index == bot.EXPLORE:
                return

        if eval:
            if reward == 1:
                local_count += 1

            if reward == -1:
                results.append(local_count)
                print('{}: {} steps'.format(len(results), local_count))

                if len(results) == 100:
                    print('Min: {}'.format(np.min(results)))
                    print('Mean: {}'.format(np.mean(results)))
                    print('Max: {}'.format(np.max(results)))
                    return

                local_count = 0

        state = next_state
Example 17
def get_game_state():
    game_state = game.GameState()
    a_file = open('logs_' + GAME + "/readout.txt", 'w')
    h_file = open('logs_' + GAME + "/hidden.txt", 'w')
    # initialization
    # convert the image into an 80*80*4 matrix
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # resize the image to 80*80 and convert it to grayscale
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_RGBA2GRAY)
    # binarize the image
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    # stack the image into 4 channels
    s_current = np.stack((x_t, x_t, x_t, x_t), axis=2)
    return s_current, game_state
Example 18
def playgame():
    dqn = Dqn()
    flappy_bird = game_interface.GameState()
    initial_action = np.array([1, 0])
    initial_frame, reward, terminal = flappy_bird.frame_step(initial_action)

    # initial
    initial_frame = cv2.cvtColor(cv2.resize(initial_frame, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, initial_frame = cv2.threshold(initial_frame, 1, 255, cv2.THRESH_BINARY)

    dqn.set_initial_state(initial_frame)
    while True:
        action = dqn.get_action()
        frame, reward, terminal = flappy_bird.frame_step(action)

        sample = preprocess(frame)
        dqn.save_transition(sample, action, reward, terminal)
Example 19
def dummy_play():
    def random_action():
        action = np.zeros(2)
        action[np.random.randint(2)] = 1
        return action

    # dummy play using random action to see what happens
    game_state = game.GameState()
    action_t = np.zeros(2)
    action_t[0] = 1
    while True:
        a = random_action()
        print('get random action: ', a)
        frame, r, dead = game_state.frame_step(a)
        # original frame image shape is (288, 512, 3) => (72, 128, 1) resize and gray scale
        print(f'frame: {frame.shape}, reward: {r}, dead: {dead}')
        if dead:
            print('game over.')
            break
Example 20
def playFlappyBird():
    action = 2
    brain = DeepQNetworks(action)
    flappyBird = game.GameState()
    action0 = np.array([1, 0])
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 1, cv2.THRESH_BINARY)
    brain.setInitState(observation0)

    while True:
        action = brain.getAction()
        score = flappyBird.score
        next_observation, reward, terminal = flappyBird.frame_step(action)
        next_observation = preprocess(next_observation)
        brain.setPerception(next_observation, action, reward, terminal)
        if terminal:
            brain.log_score(score)
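Note on the snippets above (e.g. Examples 18, 20 and 21): several of them call a preprocess helper that is not included. A minimal sketch consistent with the inline OpenCV pipeline used elsewhere in these examples (80x80 grayscale, binary threshold, single-channel reshape); the threshold value and output shape are assumptions:

import cv2
import numpy as np


def preprocess(frame):
    # resize the raw frame to 80x80 and convert it to grayscale
    gray = cv2.cvtColor(cv2.resize(frame, (80, 80)), cv2.COLOR_BGR2GRAY)
    # binarize: pixels above 1 become 255, the rest become 0
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    # add a channel axis so the frame can be appended to an (80, 80, 4) state stack
    return np.reshape(binary, (80, 80, 1))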
Example 21
def playFlappyBird():
    # Step 1: init BrainDQN
    actions = 2
    brain = BrainDQN(actions)
    # Step 2: init Flappy Bird Game
    flappyBird = game.GameState()
    # Step 3: play game
    # Step 3.1: obtain init state
    action0 = np.array([1, 0])  # do nothing
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(observation0)

    # Step 3.2: run the game
    while 1 != 0:
        action = brain.getAction()
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
Example 22
def init_or_restore_training_obj(savefile, x, sess, prediction):
    if os.path.exists(savefile):
        print("restore the game from savefile")
        save_obj = load_from_pickle(savefile)
        game_state = save_obj[0]
        replay = save_obj[1]
        curr_state = save_obj[2]
        esplion = save_obj[3]
        print(len(replay))
    else:
        print("init the game")
        esplion = config.ESPLION
        game_state = wrapped_flappy_bird.GameState()
        curr_state, replay = observation(game_state,
                                         esplion,
                                         x,
                                         sess,
                                         prediction,
                                         step=config.OBSERVATION_STEP)

    return game_state, replay, curr_state, esplion
Example 23
def playFlappyBird():
    flappyBird = game.GameState()

    action0 = np.array([1, 0])  # do nothing
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = preprocess(observation0, shape=(80, 80))

    try:
        with open('replayMemory.pkl', 'rb') as f:
            brain = pickle.load(f)
            print('load saved brain')
    except FileNotFoundError:
        print('cannot find saved brain, create a new brain')
        brain = Brain()
        brain.setInitState(observation0)

    while True:
        action = brain.getAction()
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
Example 24
    def __init__(self):
        self.flappyBird = wrapped_flappy_bird.GameState()
        self.experience_pool = []

        # get the first state by doing nothing and preprocess the image to 80x80x4
        do_nothing = np.zeros(ACTIONS)
        do_nothing[0] = 1  # one_hot: 0
        obser, reward, done = self.flappyBird.frame_step(do_nothing)
        obser = cv2.cvtColor(cv2.resize(obser, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, obser = cv2.threshold(obser, 1, 255, cv2.THRESH_BINARY)
        observation = np.stack((obser, obser, obser, obser),
                               axis=2)  # shape(80, 80, 4)

        # plt.ion()
        for i in range(MAX_MEMERY):
            if i % 5 == 0:
                index = np.random.randint(0, 2)
                action = np.zeros([2])
                action[index] = 1
            else:
                action = np.array([1, 0])

            next_obser, reward, done = self.flappyBird.frame_step(action)
            next_obser = self.preprocess(next_obser)
            next_observation = np.append(observation[:, :, 1:],
                                         next_obser,
                                         axis=2)
            # plt.clf()
            # plt.subplot(221)
            # plt.imshow(next_observation[:, :, 0])
            # plt.subplot(222)
            # plt.imshow(next_observation[:, :, 1])
            # plt.subplot(223)
            # plt.imshow(next_observation[:, :, 2])
            # plt.subplot(224)
            # plt.imshow(next_observation[:, :, 3])
            # plt.pause(0.01)
            self.experience_pool.append(
                [observation, reward, action, next_observation, done])
            observation = next_observation
Example 25
    def explore(self, act_police, episode=100):
        """增加explore的目的是 使用一个人为策略,收集一些靠谱的数据"""
        tqdm_e = tqdm(range(episode))
        env = game.GameState()
        print("explore")

        for i in tqdm_e:
            done = 0
            state = env.reset()
            act_police.reset()
            # state = np.squeeze(im_processor(state))
            state_stack = np.stack([state for i in range(STACK_NUM)], axis=2)
            s = deque()
            a = deque()
            r = deque()
            d = deque()
            next_s = deque()

            while not done:
                action = act_police.step()
                state_newaxis = state_stack[np.newaxis, :]
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_im, reward, done = env.step(action_array)
                # next_im = im_processor(next_im)

                next_state_stack = np.append(next_im,
                                             state_stack[..., :-1],
                                             axis=2)
                action_onehot = to_categorical(action, self.n_action)

                s.append(state_stack)
                a.append(action_onehot)
                r.append(reward)
                d.append(done)
                next_s.append(next_state_stack)
                state_stack = next_state_stack

            self.update(s, r, d, a, next_s)
Example 26
def main():
    begin_time = datetime.datetime.now()

    env = game.GameState()
    brain = DeepQNetwork(n_actions=N_ACTIONS,
                         memory_size=MEMORY_SIZE,
                         minibatch_size=MINIBATCH_SIZE,
                         gamma=GAMMA,
                         epsilon=INITIAL_EPSILON)

    step = 0
    for episode in range(MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            observation_ = preprocess(observation_, True)
            brain.store_transition(observation, action, reward, done,
                                   observation_)
            # start learning once there is some memory
            if step > 200:
                brain.learn()

            if done:
                break

            observation = observation_
            step += 1

        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{}".format(
            episode, end_time - begin_time, step))

    env.exit("game over")
Example 27
def trainNetwork(model, args):
    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

    x_t = x_t / 255.0

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    # print (s_t.shape)

    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*80*80*4

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        model.restore(sess)

        if args['mode'] == 'Run':
            OBSERVE = 999999999  # We keep observe, never train
            epsilon = 0
        else:  # We go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON

        t = 0
        while (True):
            episode_length = 0
            episode_reward = 0
            for iter_i in itertools.count():
                loss = 0
                Q_sa = 0
                action_index = 0
                r_t = 0
                a_t = np.zeros([ACTIONS])
                # choose an action epsilon greedy
                if t % FRAME_PER_ACTION == 0:
                    if random.random() <= epsilon:
                        # print("----------Random Action----------")
                        action_index = random.randrange(ACTIONS)
                        a_t[action_index] = 1
                    else:
                        q = model.predict(sess, s_t)  # input a stack of 4 images, get the prediction
                        max_Q = np.argmax(q)
                        action_index = max_Q
                        a_t[max_Q] = 1

                # We reduced the epsilon gradually
                if epsilon > FINAL_EPSILON and t > OBSERVE:
                    epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

                # run the selected action and observed next state and reward
                x_t1_colored, r_t, terminal = game_state.frame_step(a_t)

                episode_length += 1
                episode_reward += r_t

                x_t1 = skimage.color.rgb2gray(x_t1_colored)
                x_t1 = skimage.transform.resize(x_t1, (80, 80))
                x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

                x_t1 = x_t1 / 255.0

                x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
                s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

                # store the transition in D
                D.append((s_t, action_index, r_t, s_t1, terminal))
                if len(D) > REPLAY_MEMORY:
                    D.popleft()

                # only train if done observing
                if t > OBSERVE:
                    # sample a minibatch to train on
                    minibatch = random.sample(D, BATCH)

                    # Now we do the experience replay
                    state_t, action_t, reward_t, state_t1, terminal_batch = zip(*minibatch)
                    state_t = np.concatenate(state_t)
                    state_t1 = np.concatenate(state_t1)
                    targets = model.predict(sess, state_t)
                    Q_sa = model.predict(sess, state_t1)
                    targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(Q_sa, axis=1) * np.invert(
                        terminal_batch)

                    loss += model.update(sess, state_t, targets)

                    # save progress every 1000 iterations
                    if t % 1000 == 0:
                        print("Now we save model")
                        model.save(sess)

                s_t = s_t1
                t = t + 1

                # print info
                state = ""
                if t <= OBSERVE:
                    state = "observe"
                elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                    state = "explore"
                else:
                    state = "train"

                if t % 100 == 0:
                    print("TIMESTEP", t, "/ EPISODE_LENGTH", episode_length, "/ STATE", state, \
                          "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
                          "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss)

                if terminal:
                    break

            # Add summaries to tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=episode_reward, node_name="episode_reward",
                                      tag="episode_reward")
            episode_summary.value.add(simple_value=episode_length, node_name="episode_length",
                                      tag="episode_length")
            model.train_writer.add_summary(episode_summary, sess.run(tf.train.get_global_step()))
            model.train_writer.flush()

        model.train_writer.close()
        model.validation_writer.close()
        print("Episode finished!")
        print("************************")
Example 28
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open(os.path.normpath(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), "logs_" + GAME + "/readout.txt")), 'w')
    h_file = open(os.path.normpath(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), "logs_" + GAME + "/hidden.txt")), 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while "flappy bird" != "angry bird":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1 # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        #s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, only equals reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch}
            )

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
        '''
Example 29
def train():
    game = flappy.GameState()
    game.frame_step([1, 0])  # frame_step expects an action vector; [1, 0] means "do nothing"
Example 30
def trainNetwork(s, out, sess, istrain):
    # define the loss function
    x = tf.placeholder(float, [None, ACTIONS])
    y = tf.placeholder(float, [None])
    # tf.reduce_sum computes the sum of a tensor's elements across the given dimensions
    # api: reduce_sum(input_tensor, axis=None, keep_dims=False, name=None, reduction_indices=None)
    out_action = tf.reduce_sum(tf.multiply(out, x), reduction_indices=1)  # estimated Q
    # mean squared error between the target and the prediction
    loss = tf.reduce_mean(tf.square(y - out_action))  # target Q - estimated Q
    # define the training (backpropagation) step
    # learning rate (1e-6): controls how much the parameters change per update
    train_step = tf.train.AdamOptimizer(1e-6).minimize(loss)  # Adam optimizer

    # initialize the game environment
    game_state = game.GameState()

    # use a deque to store the transitions observed at each step
    # each observation is stored in D; training samples random batches from D to break the
    # correlation between consecutive frames and give the network the randomness it needs
    D = deque()

    # initialize the state and preprocess the image; 4 consecutive frames form one input
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # initial action: do not flap
    # feed the initial action into the game and get the feedback: the game image x_t,
    # the action reward r_0, and the terminal flag
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    # use cv2's resize, cvtColor and threshold to turn the frame into an 80*80 binary image
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    # threshold: fixed-threshold binarization
    # binarization sets every pixel's gray value to 0 or 255, giving a clear black-and-white image
    # this greatly reduces the amount of data and highlights the object contours
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)

    # use 4 consecutive frames as the network input
    # np.stack stacks numpy arrays along a new axis
    s_t = np.stack((x_t, x_t, x_t, x_t),
                   axis=2)  # new axis at index 2: the 4 frames become one 3-D array

    # load the saved network parameters
    saver = tf.train.Saver()  # instantiate a Saver object
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    # initialize the epsilon for the greedy policy
    epsilon = EPSILON
    t = 0  # initialize the step counter
    while True:
        # choose an action a_t based on the input s_t
        out_t = out.eval(feed_dict={s: [s_t]})[0]  # feed the current state s_t into the network
        a_t = np.zeros([ACTIONS])  # the chosen action
        action_index = 0

        # the bird chooses a new action once every FRAME_PER_ACTION frames
        if t % FRAME_PER_ACTION == 0:
            # epsilon-greedy: with probability epsilon explore with a random action,
            # otherwise take the action with the largest Q value
            if random.random() <= epsilon:  # explore with a random action
                print("----------Random Action----------")  # random choice
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:  # otherwise take the action with the largest Q value
                action_index = np.argmax(out_t)  # index of the maximum value
                a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # feed the chosen action into the game and get the next frame x_t1_colored,
        # the reward r_t, and the terminal flag
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store this transition (input image s_t, executed action a_t, reward r_t,
        # next image s_t1, and the terminal flag) in the queue D
        D.append((s_t, a_t, r_t, s_t1, terminal))

        # if D is full, drop the oldest entry
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # once past the observation phase and istrain is True, start training on the data
        if t > OBSERVE and istrain:
            # randomly sample a minibatch of transitions for training
            # draw BATCH transitions from the replay memory
            minibatch = random.sample(D, BATCH)

            # unpack the BATCH variables
            s_j_batch = [d[0] for d in minibatch]  # images
            a_batch = [d[1] for d in minibatch]  # actions
            r_batch = [d[2] for d in minibatch]  # rewards
            s_j1_batch = [d[3] for d in minibatch]  # next images

            # build the target rewards
            y_batch = []
            out_j1_batch = out.eval(feed_dict={s: s_j1_batch})

            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal is True the game ended, so the target is just r_batch[i];
                # otherwise the game continues and the target is r_batch[i] plus GAMMA
                # times the maximum Q value (the Q-learning update)
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(out_j1_batch[i]))

            # feed the target rewards y_batch, the actions a_batch and the images s_j_batch
            # into train_step for one training step
            # sess.run(train_step, feed_dict={y: y_batch, x: a_batch, s: s_j_batch})
            train_step.run(feed_dict={
                y: y_batch,  # target rewards
                x: a_batch,  # actions
                s: s_j_batch
            })

        # update the state
        s_t = s_t1
        t += 1

        # save the network every 1000 steps
        if t % 1000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info: the step count, the executed action, the reward r_t and the max Q value
        print("TIMESTEP ", t, "| ACTION ", ACTION_NAME[action_index],
              " | REWARD ", r_t, " | Q_MAX %e" % np.max(out_t))