Example No. 1
def playPlane():
    # Step 1: init BrainDQN
    actions = 3
    brain = BrainDQN(actions)
    # Step 2: init Plane Game
    plane = game.GameState()
    # Step 3: play game
    # Step 3.1: obtain init state
    # [1, 0, 0]: do nothing, [0, 1, 0]: left, [0, 0, 1]: right
    action0 = np.array([1, 0, 0])
    observation0, reward0, terminal = plane.frame_step(action0)

    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(observation0)

    # Step 3.2: run the game
    while True:
        action = brain.getAction()
        nextObservation, reward, terminal = plane.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
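
The examples call a preprocess helper that is not included in this excerpt. Below is a minimal sketch only, assuming it mirrors the inline preprocessing shown in Example No. 1 (and again in Example No. 4): resize to 80x80, convert to grayscale, binary-threshold. The reshape flag is an assumption introduced here to cover both call signatures seen in Examples No. 1-3; it is not confirmed by the source.

import cv2
import numpy as np


def preprocess(observation, reshape=True):
    # Resize the raw game frame to 80x80 and convert it to grayscale.
    observation = cv2.cvtColor(cv2.resize(observation, (80, 80)),
                               cv2.COLOR_BGR2GRAY)
    # Binarize the frame, as in the inline preprocessing above.
    ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
    # Assumed meaning of the second argument in Examples No. 2 and 3:
    # reshape to (80, 80, 1) when True, keep the flat 80x80 frame otherwise.
    if reshape:
        observation = np.reshape(observation, (80, 80, 1))
    return observation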
Example No. 2
def main():
    begin_time = datetime.datetime.now()

    env = game.GameState()
    #env.display = ~TRAINING
    brain = rl_brain_pytorch.DeepQNetwork()

    step = 0
    for episode in range(rl_brain_pytorch.MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        score = 0.0
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            if reward == 1:
                score += 1
            observation_ = preprocess(observation_, True)
            if TRAINING:
                brain.store_transition(observation, action, reward, done,
                                       observation_)
            # start learning once enough transitions have been stored
            if step > 200:
                if TRAINING:
                    brain.learn()

            if done:
                break

            observation = observation_
            step += 1

        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{} score:{}".format(
            episode, end_time - begin_time, step, score))
    brain.saveNet()
    env.exit("game over")
Example No. 3
def main():
    begin_time = datetime.datetime.now()

    env = game.GameState()
    brain = DeepQNetwork(n_actions=N_ACTIONS,
                         memory_size=MEMORY_SIZE,
                         minibatch_size=MINIBATCH_SIZE,
                         gamma=GAMMA,
                         epsilon=INITIAL_EPSILON)

    step = 0
    for episode in range(MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            observation_ = preprocess(observation_, True)
            brain.store_transition(observation, action, reward, done,
                                   observation_)
            # start learning once enough transitions have been stored
            if step > 200:
                brain.learn()

            if done:
                break

            observation = observation_
            step += 1

        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{}".format(
            episode, end_time - begin_time, step))

    env.exit("game over")
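
Examples No. 2 and 3 rely on a DeepQNetwork agent whose implementation is not part of this excerpt. The stub below only records the interface those examples assume (the constructor parameters and the reset / choose_action / store_transition / learn / saveNet methods); the method bodies are placeholders, not the real rl_brain_pytorch implementation.

import random
from collections import deque

import numpy as np


class DeepQNetworkInterface:
    """Sketch of the agent interface assumed by Examples No. 2 and 3."""

    def __init__(self, n_actions=3, memory_size=50000, minibatch_size=32,
                 gamma=0.99, epsilon=0.1):
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.minibatch_size = minibatch_size
        self.memory = deque(maxlen=memory_size)

    def reset(self, observation):
        # Called once per episode with the initial preprocessed frame.
        self.state = observation

    def choose_action(self, observation):
        # Placeholder: the real class would query its Q-network epsilon-greedily.
        action = np.zeros(self.n_actions)
        action[random.randrange(self.n_actions)] = 1
        return action

    def store_transition(self, s, a, r, done, s_):
        # Replay-memory insert, matching the argument order used in the examples.
        self.memory.append((s, a, r, done, s_))

    def learn(self):
        # Placeholder for one gradient step on a sampled minibatch.
        if len(self.memory) < self.minibatch_size:
            return
        random.sample(list(self.memory), self.minibatch_size)

    def saveNet(self):
        # Placeholder for checkpointing the network weights.
        pass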
Example No. 4
def trainNetwork(s, readout, W_fc1, W_fc2, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS], name='action')
    y = tf.placeholder("float", [None], name='q_next')
    tf.summary.histogram('q_next', y)
    with tf.name_scope('q_eval'):
        readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                       reduction_indices=1)
        tf.summary.histogram('fc2/output', readout_action)
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(tf.square(y - readout_action))
        tf.summary.scalar('loss', cost)
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(cost)

    # network difference
    regularize_lambda = 1.0
    regularizer = tf.contrib.layers.l2_regularizer(
        regularize_lambda)  # equal to tf.nn.l2_loss

    with tf.name_scope('weight'):
        last_W_fc1 = tf.Variable(tf.constant(0.0, shape=W_fc1.get_shape()))
        diff_W_fc1 = tf.contrib.layers.apply_regularization(
            regularizer, [W_fc1 - last_W_fc1])
        tf.summary.scalar('diff_W_fc1', diff_W_fc1)
        last_W_fc1_update = tf.assign(last_W_fc1, W_fc1)

        last_W_fc2 = tf.Variable(tf.constant(0.0, shape=W_fc2.get_shape()))
        diff_W_fc2 = tf.contrib.layers.apply_regularization(
            regularizer, [W_fc2 - last_W_fc2])
        tf.summary.scalar('diff_W_fc2', diff_W_fc2)
        last_W_fc2_update = tf.assign(last_W_fc2, W_fc2)

    # tensorboard output
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(r"result/Exp14Graph/", sess.graph)

    # record the reward
    with tf.name_scope('reward_per_life'):
        reward = tf.Variable(0.0, name='reward')
        reward_sum = tf.summary.scalar('reward_per_life', reward)

    # record the reward every 10000 time steps
    with tf.name_scope('reward_per_10000_steps'):
        reward_step = tf.Variable(0.0, name='reward_step')
        reward_sum_step = tf.summary.scalar('reward_per_10000_steps',
                                            reward_step)

    # placeholder to record reward
    reward_count = tf.placeholder('float')
    zero = tf.Variable(0.0, name='zero')
    re_count = 0.0
    life_count = 1
    reward_update = tf.assign(reward, reward + reward_count)
    reward_fresh = tf.assign(reward, zero)

    # placeholder to record reward_step
    reward_count_step = tf.placeholder('float')
    re_count_step = 0.0
    reward_update_step = tf.assign(reward_step,
                                   reward_step + reward_count_step)
    reward_fresh_step = tf.assign(reward_step, zero)

    # record average max q value
    with tf.name_scope('50kper_average_qMax'):
        q_max = tf.Variable(0.0, name='q_max_average')
        q_max_sum = tf.summary.scalar('50kper_average_qMax', q_max)
    # placeholder to record q value
    q_max_count = tf.placeholder('float')
    q_count = 0.0
    batch_count = 0
    q_max_update = tf.assign(q_max, q_max_count)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t_colored, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # variable to save game frame
    game_frame = x_t_colored

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state(r"result/Exp14_saved_networks")

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    omega = INITIAL_OMEGA
    t = 0
    episode_reward = 0
    while True:
        # choose an action epsilon greedily
        # Q values predicted for the current stacked input frames
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        ruleAction, safeAction = RuleAction3.rule_action(game_frame)
        if t % FRAME_PER_ACTION == 0:
            # with probability omega, follow the rule-based action
            if safeAction == "safe":
                if random.random() <= omega:
                    if ruleAction == "left":
                        action_index = 1
                        a_t[action_index] = 1
                    elif ruleAction == "right":
                        action_index = 2
                        a_t[action_index] = 1
                    print("----------Rule Action----------")
                elif random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(ACTIONS)
                    a_t[action_index] = 1
                else:
                    action_index = np.argmax(readout_t)
                    a_t[action_index] = 1
            else:
                print("----------safe Action----------")
                if safeAction == "left":
                    action_index = 1
                    a_t[action_index] = 1
                elif safeAction == "right":
                    action_index = 2
                    a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # scale down omega
        # if omega - epsilon > 0 and t > OBSERVE:
        #     omega -= (INITIAL_OMEGA - FINAL_OMEGA) / EXPLORE
        if t > OBSERVE:
            omega = INITIAL_OMEGA * (DECAY_RATE**(t / DECAY_STEPS))

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update game frame
        game_frame = x_t1_colored
        episode_reward += r_t

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)  # experience replay.

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(
                feed_dict={s: s_j1_batch})  # readout_j1_batch: Q value?

            # average max q value
            batch_count += 1
            q_count += np.max(readout_t)
            BATCH_N = 50000
            if batch_count % BATCH_N == 0:
                sess.run(q_max_update,
                         feed_dict={q_max_count: float(q_count / BATCH_N)})
                qm = sess.run(q_max_sum)
                writer.add_summary(qm, batch_count)
                q_count = 0.0

            # total reward
            re_count += r_t
            re_count_step += r_t

            if terminal:
                sess.run(reward_update,
                         feed_dict={reward_count: float(re_count)})
                re = sess.run(reward_sum)
                writer.add_summary(re, life_count)
                sess.run(reward_fresh)
                re_count = 0
                life_count += 1

            if t % 10000 == 0:
                sess.run(reward_update_step,
                         feed_dict={reward_count_step: float(re_count_step)})
                re_step = sess.run(reward_sum_step)
                writer.add_summary(re_step, t // 10000)  # one summary point per 10000 steps
                sess.run(reward_fresh_step)
                re_count_step = 0

            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, only equals reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(
                feed_dict={  # feed back to update network.
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch
                })

            # update tensorboard data every 1000 steps
            if t % 1000 == 0:
                result = sess.run(merged,
                                  feed_dict={
                                      y: y_batch,
                                      a: a_batch,
                                      s: s_j_batch
                                  })
                writer.add_summary(result, t)

            # record network weight
            if t % 1000 == 0:
                sess.run([last_W_fc1_update, last_W_fc2_update])

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess,
                       'result/Exp14_saved_networks/' + GAME + '-dqn',
                       global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif OBSERVE < t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/OMEGA",
              omega, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ EPISODE_REWARD", episode_reward,
              "/ Q_MAX %e" % np.max(readout_t), "/ TERMINAL", terminal)
        if terminal:
            episode_reward = 0
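
Example No. 4 only contains trainNetwork; the code that builds the TensorFlow 1.x graph and opens the session is not shown. A minimal hypothetical driver might look like the sketch below, where createNetwork is an assumed helper name (not in the source) standing in for whatever builds the input placeholder, the Q-value output, and the two fully connected weight tensors that trainNetwork expects.

def playGame():
    sess = tf.InteractiveSession()
    # createNetwork is an assumed helper: it would build the conv + FC layers
    # and return the input placeholder s, the Q-value output readout, and the
    # fully connected weights W_fc1 / W_fc2 used for the weight-difference summaries.
    s, readout, W_fc1, W_fc2 = createNetwork()
    trainNetwork(s, readout, W_fc1, W_fc2, sess)


if __name__ == "__main__":
    playGame()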