def trainNetwork(s, readout, h_fc1, sess):
    # define the loss function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                   reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # launch the game emulator; this opens a window showing the game in real time
    game_state = game.GameState()

    # create a deque to hold the replay memory
    D = deque()

    # get the initial state by doing nothing, and preprocess it into an 80x80x4 stack
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # for saving and loading the network parameters
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    # this condition is always true, so training runs until the process is stopped
    while "flappy bird" != "angry bird":
        # choose an action with an epsilon-greedy policy
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            # take a random action
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            # otherwise take the action with the highest Q(s, a) computed by the network
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing (no flap)

        # anneal epsilon as the game progresses to reduce the amount of random actions
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # execute the chosen action and observe the next state and the reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
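        # s_t1 now holds the new frame in channel 0 plus the three most recent frames from s_t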

        # store the transition in D so it can be sampled when updating the parameters
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # the network parameters are only updated once the observation phase is over
        if t > OBSERVE:
            # sample a random minibatch from D for the parameter update
            minibatch = random.sample(D, BATCH)

            # split out the current states, actions, rewards and next states
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # compute the new target values for Q(s, a)
            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if the episode ended, the target is just the reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # update the network parameters with a gradient-descent step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # advance the state for the next iteration
        s_t = s_t1
        t += 1

        # save the network parameters every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print progress information
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, \
              "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
Example 2
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    #train_step = tf.train.RMSPropOptimizer(0.00025, 0.95, 0.95, 0.01).minimize(cost)
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = []

    # printing
    '''
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')
    '''

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    epsilon = INITIAL_EPSILON
    t = 0
    while "pigs" != "fly":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.pop(0)

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(
            readout_t)

        # write info to files
        '''
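Each variant of trainNetwork receives the input placeholder s, the Q-value output readout, the last fully connected layer h_fc1 and a TensorFlow session. A rough sketch of how such a function is usually driven; createNetwork and playGame are assumed helper names, not taken from the listings on this page:

def playGame():
    # createNetwork is assumed to build the convolutional Q-network and return
    # the input placeholder, the Q-value output layer and the last hidden layer
    sess = tf.InteractiveSession()
    s, readout, h_fc1 = createNetwork()
    trainNetwork(s, readout, h_fc1, sess)

if __name__ == "__main__":
    playGame()

An InteractiveSession installs itself as the default session, which is why readout.eval(...) and train_step.run(...) inside trainNetwork can be called without passing the session explicitly.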
Example 3
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    # a: placeholder for the one-hot actions taken
    a = tf.placeholder("float", [None, ACTIONS])
    # y: placeholder for the target Q values
    y = tf.placeholder("float", [None])
    # readout is the output of the network: the Q value of every action.
    # Multiplying by the one-hot actions and summing keeps only Q(s, a) of the chosen action.
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    # the cost is the squared difference between the target value y and the
    # Q value of the chosen action; the network is trained so readout matches y
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # the first action is "do nothing", the first entry of the vector
    # stepping the game with an action returns the next frame, the reward and the terminal flag
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # x_t is the frame resized to 80x80 and converted to grayscale
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    # s_t is the stack of 4 frames used as input; at the start there is only
    # one frame, so it is repeated 4 times
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    tf.global_variables_initializer().run()

    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    # start the timer here so it is defined even when a checkpoint was loaded
    start = time.time()
    epsilon = INITIAL_EPSILON
    t = 0  # one timestep per frame
    while "pigs" != "fly":

        # choose an action epsilon greedily
        # evaluate the network on the current input to get the expected Q values
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])  # one-hot action to take
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:  # exploration
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:  # exploitation
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # a_t now holds the chosen one-hot action, e.g. [0, 0, 1, 0, 0, 0],
        # picked by exploration or exploitation depending on epsilon

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            # once past the observation phase, keep decreasing epsilon
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # apply the selected action K times (arguably this should be a single step)
        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            # s_t1 is the new frame stack for the next state
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D, which will later be sampled at random
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing, i.e. during the explore or train phases
        if t > OBSERVE:
            # sample a minibatch from the replay memory to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]   # current states s_t
            a_batch = [d[1] for d in minibatch]     # actions a_t
            r_batch = [d[2] for d in minibatch]     # rewards r_t
            s_j1_batch = [d[3] for d in minibatch]  # next states s_t1

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):  # compute the target value y
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            # y: target values (the reward if the state is terminal, otherwise
            #    the reward plus GAMMA times the max Q value of the next state)
            # a: one-hot actions taken, as sampled from the minibatch
            # s: input frame stacks, as sampled from the minibatch
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        if t % 1000 == 0:
            print(time.time() - start)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(
            readout_t)

        # write info to files
        '''
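The target computation shared by all three examples is the standard one-step Q-learning update: y_i equals r_i for terminal transitions and r_i + GAMMA * max_a' Q(s'_i, a') otherwise. A self-contained sketch with dummy data; build_targets is a hypothetical helper, not part of the code above:

import numpy as np

GAMMA = 0.99  # illustrative discount factor

def build_targets(r_batch, q_next_batch, terminal_batch):
    # one-step Q-learning targets for a minibatch of transitions
    y_batch = []
    for r, q_next, terminal in zip(r_batch, q_next_batch, terminal_batch):
        if terminal:
            y_batch.append(r)
        else:
            y_batch.append(r + GAMMA * np.max(q_next))
    return y_batch

# two transitions with two actions each: prints approximately [1.495, -1.0]
print(build_targets([1.0, -1.0],
                    np.array([[0.2, 0.5], [0.1, 0.3]]),
                    [False, True]))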