Example #1
def main():

    max_episodes = 1500
    for episode in range(max_episodes):
        done = False
        step_count = 0
        env.reset()

        # roll out one episode with randomly sampled actions
        while not done:
            state, reward, done = env.step(env.get_random_actions()[0])

            step_count += 1
Example #2
import cv2  # needed for the imshow/waitKey calls below; env is assumed to be provided by the surrounding module


def main():
    max_episodes = 1500
    for episode in range(max_episodes):
        done = False
        step_count = 0
        env.reset()
        obs, _, _, _, _, _, _, _, _ = env.step(0)
        # display the first observation and wait for a key press before continuing
        cv2.imshow('mario', obs)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        while not done:
            state, reward, done, state_clear, max_x, time_out, now_x, r_d, s_d = env.step(11)
            # 0 next_state:{ndarray} shape (90,90)
            # 1 reward:{int}
            # 2 done:{bool}
            # 3 state_clear:{bool}
            # 4 max_x:{int}
            # 5 time_out:{bool}
            # 6 now_x:{int}
            step_count += 1
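
The inline comment above documents the first seven values returned by env.step; the two trailing values are left undocumented. A small helper that unpacks a step result into named fields is sketched below (the StepResult/unpack_step names are assumptions, and the undocumented trailing values are simply dropped):

from typing import NamedTuple

import numpy as np


class StepResult(NamedTuple):
    next_state: np.ndarray  # shape (90, 90)
    reward: int
    done: bool
    state_clear: bool
    max_x: int
    time_out: bool
    now_x: int


def unpack_step(step_return):
    # keep only the seven documented fields; any extra trailing values are ignored
    return StepResult(*step_return[:7])


# usage: result = unpack_step(env.step(11)); loop while not result.done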
Example #3
import math
import os
import random

import numpy as np
import torch
from torch import Tensor

# dqn_net, replay_memory, ob_process and plot_graph are assumed to come from the surrounding project


def learn(env, MAX_EPISODE, EPS_START, EPS_END, EPS_DECAY, ACTION_NUM,
          REPLAY_MEMORY_CAPACITY, BATCH_SIZE, LOSS_FUNCTION, OPTIM_METHOD,
          LEARNING_RATE, GAMMA, NET_COPY_STEP, OBSERVE, TRAIN_FREQ, PATH):
    ### initialization ###
    action_space = [(0, Tensor([1, 0, 0, 0, 0, 0])),
                    (3, Tensor([0, 1, 0, 0, 0, 0])),
                    (7, Tensor([0, 0, 1, 0, 0, 0])),
                    (11, Tensor([0, 0, 0, 1, 0, 0])),
                    (4, Tensor([0, 0, 0, 0, 1, 0])),
                    (8, Tensor([0, 0, 0, 0, 0, 1]))]
    # (action_button, action_onehot)
    # the six actions above: no-op, move left, move right, jump, jump left, jump right
    value_net = dqn_net(ACTION_NUM)
    target_net = dqn_net(ACTION_NUM)
    if torch.cuda.is_available():
        value_net.cuda()
        target_net.cuda()
    if os.path.isfile(PATH):
        value_net.load_state_dict(torch.load(PATH))
    buffer = replay_memory(REPLAY_MEMORY_CAPACITY)
    env.reset()
    obs, _, _, _, _, _, _ = env.step(0)
    obs = ob_process(obs)
    obs4 = torch.cat(([obs, obs, obs, obs]),
                     dim=0)  # {Tensor} of shape torch.Size([4,84,84])
    judge_distance = 0
    episode_total_reward = 0
    epi_total_reward_list = []
    mean_reward_list = []
    # counters #
    time_step = 0
    update_times = 0
    episode_num = 0
    history_distance = 200
    while episode_num <= MAX_EPISODE:
        ### choose an action with epsilon-greedy ###
        prob = random.random()
        threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
            -1 * episode_num / EPS_DECAY)
        if prob <= threshold:
            action_index = np.random.randint(6)
            action_button = action_space[action_index][0]  # {int}
            action_onehot = action_space[action_index][1]  # {Tensor}
        else:
            action_button, action_onehot = value_net.select_action(obs4)
        ### do one step ###
        obs_next, reward, done, _, max_distance, _, now_distance = env.step(
            action_button)
        obs_next = ob_process(obs_next)
        obs4_next = torch.cat(([obs4[1:, :, :], obs_next]), dim=0)
        buffer.add(obs4.unsqueeze(0), action_onehot.unsqueeze(0),
                   obs4_next.unsqueeze(0),
                   Tensor([reward]).unsqueeze(0), done)
        episode_total_reward += reward
        # count consecutive steps without improving on the best distance so far
        if now_distance <= history_distance:
            judge_distance += 1
        else:
            judge_distance = 0
            history_distance = max_distance
        '''the transition added to the buffer
        obs4:      {Tensor} shape (1,4,84,84)
        action:    {Tensor} shape (1,6), one-hot
        obs4_next: {Tensor} shape (1,4,84,84)
        reward:    {Tensor} shape (1,1)
        done:      {bool}
        '''
        ### go to the next state ###
        if not done and judge_distance <= 50:
            obs4 = obs4_next
            time_step += 1
        else:  # episode ended, or no forward progress for more than 50 steps: reset
            env.reset()
            obs, _, _, _, _, _, _ = env.step(0)
            obs = ob_process(obs)
            obs4 = torch.cat(([obs, obs, obs, obs]), dim=0)
            episode_num += 1
            history_distance = 200
            # plot graph #
            epi_total_reward_list.append(episode_total_reward)
            mean100 = np.mean(epi_total_reward_list[-100:])  # mean over the last 100 episodes
            mean_reward_list.append(mean100)
            plot_graph(mean_reward_list)
            print('episode %d total reward=%.2f' %
                  (episode_num, episode_total_reward))
            episode_total_reward = 0
        ### do one step update ###
        if time_step >= OBSERVE and time_step % TRAIN_FREQ == 0:
            batch_transition = buffer.sample(BATCH_SIZE)
            '''{Transition}
            0:{tuple} of {Tensor}-shape-torch.Size([4,84,84])
            1:{tuple} of {Tensor}-shape-torch.Size([6])
            2:{tuple} of {Tensor}-shape-torch.Size([4,84,84])
            3:{tuple} of {int}   
            4:{tuple} of {bool}        
            '''
            value_net.update(samples=batch_transition,
                             loss_func=LOSS_FUNCTION,
                             optim_func=OPTIM_METHOD,
                             learn_rate=LEARNING_RATE,
                             target_net=target_net,
                             BATCH_SIZE=BATCH_SIZE,
                             GAMMA=GAMMA)
            update_times += 1
            ### copy value net parameters to target net ###
            if update_times % NET_COPY_STEP == 0:
                target_net.load_state_dict(value_net.state_dict())

    torch.save(value_net.state_dict(), PATH)
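
The replay_memory class used by learn() above is defined elsewhere in this project; the following is only a minimal sketch consistent with the buffer.add(...) / buffer.sample(BATCH_SIZE) calls and with the Transition layout described in the docstring (class and field names are assumptions):

import random
from collections import deque, namedtuple

# field layout mirrors the docstring above: stacked frames, one-hot action, next stacked frames, reward, done
Transition = namedtuple('Transition', ('obs4', 'action', 'obs4_next', 'reward', 'done'))


class replay_memory:
    def __init__(self, capacity):
        # oldest transitions are discarded once capacity is reached
        self.memory = deque(maxlen=capacity)

    def add(self, obs4, action, obs4_next, reward, done):
        self.memory.append(Transition(obs4, action, obs4_next, reward, done))

    def sample(self, batch_size):
        # returns one Transition whose fields are tuples, one entry per sampled item
        batch = random.sample(self.memory, batch_size)
        return Transition(*zip(*batch))

    def __len__(self):
        return len(self.memory)
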
# a second learn(): tabular Q-learning on the discretized state returned by env.step
def learn(env,
          MAX_EPISODE,
          EPS_START,
          EPS_END,
          EPS_DECAY,
          LEARNING_RATE,
          GAMMA,
          ):
    # forward/inverse mappings between the reduced action set and the raw env button ids (100 marks an unused slot)
    mapping_reduced_action = [3, 7, 11, 4, 10]
    Un_mapping_reduced_action = [100, 100, 100, 0, 3, 100, 100, 1, 100, 100, 4, 2]
    ### initialization ###
    env.reset()
    obs, _, _, _, _, _, _, _, _, _ = env.step(0)
    judge_distance = 0
    episode_total_reward = 0
    no_states_observed = 1
    epi_total_reward_list = []
    mean_reward_list = []
    filename = 'State_Q_Table.csv'
    LEARNING_RATE_CTR = [np.zeros(6), np.zeros(6)]
    try:
        state_table = np.loadtxt(filename, delimiter=",", usecols=[0])
        state_table = state_table.astype(int)
        state_table = state_table.tolist()
        q_table = np.loadtxt(filename, delimiter=",", usecols=[1, 2, 3, 4, 5, 6])
        LEARNING_RATE_CTR = q_table * 0
        q_table = q_table.tolist()
        LEARNING_RATE_CTR = LEARNING_RATE_CTR.tolist()
        no_states_observed = len(state_table) - 1
    except Exception:
        print('warning: could not load %s; starting with a fresh state/Q table' % filename)
        state_table = [0, 1]
        q_table = [np.random.rand(6), np.random.rand(6)]
    if state_table == [] or q_table == []:
        state_table = [0, 1]
        q_table = [np.random.rand(6), np.random.rand(6)]
        LEARNING_RATE_CTR = [np.zeros(6), np.zeros(6)]
    # counters #
    time_step = 0
    update_times = 0
    episode_num = 0
    history_distance = 200
    index_s = 0
    state_d_current = state_table[index_s]
    f_handle = open(filename, 'w')
    f_handle_Evo = open('State_Q_Table_ev.csv', 'a')
    controller_speed_ctr = 0
    reward_collection = 0
    print(state_table)
    print(q_table)
    while episode_num <= MAX_EPISODE:
        ### choose an action with epsilon-greedy ###
        prob = random.random()
        threshold = EPS_END  # + (EPS_START - EPS_END) * math.exp(-1 * episode_num / EPS_DECAY)
        # action_onehot = action_space[0][1] # {Tensor}

        # if(controller_speed_ctr == 0):
        reward_collection = 0
        if prob <= threshold:
            action_button_d = np.random.randint(6)
        else:
            action_button_d = np.argmax(q_table[index_s])
        np.savetxt(f_handle_Evo, [np.concatenate([[state_d_current, action_button_d], q_table[index_s]])], fmt='%1.6f',
                   delimiter=',')

        # repeat the chosen action for 4 consecutive frames (frame skip), accumulating the shaped reward
        for _ in range(4):
            obs_next, reward, done, _, max_distance, _, now_distance, reward_d, state_d_next, keyboard_keys = env.step(
                action_button_d)
            reward_collection += reward_d

        # tabular Q-learning update with a learning rate decayed by the per-(state, action) visit count
        if state_d_next in state_table:
            # start = time.clock()
            next_index = state_table.index(state_d_next)
            current_index = state_table.index(state_d_current)
            current_value = q_table[current_index][action_button_d]
            LEARNING_RATE_CTR[current_index][action_button_d] += 1
            LEARNING_RATE_S_A = LEARNING_RATE / LEARNING_RATE_CTR[current_index][action_button_d]
            # print(LEARNING_RATE_S_A)
            q_table[current_index][action_button_d] = current_value + LEARNING_RATE_S_A * (
                        reward_d + GAMMA * (max(q_table[next_index])) - current_value)
            # print(current_value + LEARNING_RATE * (reward_d + GAMMA*(max(q_table[next_index])) - current_value))
            index_s = next_index
            # print(q_table[current_index])
            # print(current_index)
            # print(time.clock() - start)
            # print(np.concatenate(([state_table[current_index]], [reward_d], q_table[current_index])))
        else:
            state_table.append(state_d_next)
            q_table.append(np.random.rand(6))
            LEARNING_RATE_CTR.append(np.zeros(6))
            no_states_observed = len(state_table) - 1  # no_states_observed + 1
            index_s = no_states_observed
            # print(no_states_observed)
            print(np.concatenate(([state_table[no_states_observed]], [reward_d], q_table[no_states_observed])))
        state_d_current = state_d_next

        episode_total_reward += reward_d
        if now_distance <= history_distance:
            judge_distance += 1
        else:
            judge_distance = 0
            history_distance = max_distance

        ### go to the next state ###
        if not done and judge_distance <= 50:
            # obs4 = obs4_next
            time_step += 1
        else:  # episode ended, or no forward progress for more than 50 steps: reset
            env.reset()
            obs, _, _, _, _, _, _, _, _, _ = env.step(0)
            episode_num += 1
            history_distance = 200
            # plot graph #
            epi_total_reward_list.append(episode_total_reward)
            mean100 = np.mean(epi_total_reward_list[-100:])  # mean over the last 100 episodes
            mean_reward_list.append(mean100)
            plot_graph(epi_total_reward_list)
            print('episode %d total reward=%.2f' % (episode_num, episode_total_reward))
            episode_total_reward = 0
    np.savetxt(f_handle, np.column_stack((state_table, q_table)), fmt=','.join(['%i'] + ['%1.6f'] * 6), delimiter=',')
    np.savetxt('Reward.csv', epi_total_reward_list, fmt='%1.6f')
    np.savetxt('LR_CTR.csv', LEARNING_RATE_CTR, fmt='%i', delimiter=',')
    f_handle.close()
    f_handle_Evo.close()
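
For reference, the update applied inside the second learn() is the standard tabular Q-learning rule with a learning rate decayed by the per-(state, action) visit count. An isolated sketch of that single step (function name and signature are mine, not part of the original code):

import numpy as np


def q_update(q_row, q_row_next, action, reward, visit_count, base_lr, gamma):
    # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a)), with lr = base_lr / visit_count(s,a)
    lr = base_lr / visit_count
    target = reward + gamma * np.max(q_row_next)
    q_row[action] += lr * (target - q_row[action])
    return q_row


# usage (hypothetical): q_table[i] = q_update(q_table[i], q_table[j], a, r_d, LEARNING_RATE_CTR[i][a], LEARNING_RATE, GAMMA)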