def main():
    max_episodes = 1500
    for episode in range(max_episodes):
        done = False
        step_count = 0
        env.reset()
        obs, _, _, _, _, _, _, _, _ = env.step(0)
        cv2.imshow('mario', obs)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        while not done:
            state, reward, done, s4, s5, s6, s7, r_d, s_d = env.step(11)
            # 0 next_state:  {ndarray} shape (90, 90)
            # 1 reward:      {int}
            # 2 done:        {bool}
            # 3 state_clear: {bool}
            # 4 max_x:       {int}
            # 5 time_out:    {bool}
            # 6 now_x:       {int}
            step_count += 1
def main():
    max_episodes = 1500
    for episode in range(max_episodes):
        done = False
        step_count = 0
        env.reset()
        while not done:
            state, reward, done = env.step(env.get_random_actions()[0])
            step_count += 1
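# --- Usage sketch (assumption) ----------------------------------------------
# Both main() demos above rely on a module-level `env` object and, for the
# first one, on cv2 being imported. The environment wrapper itself is not part
# of this file; `mario_env` and `MarioEnv` below are placeholder names for it,
# so this is a sketch of the surrounding glue code rather than the project's
# actual entry point.
import cv2
from mario_env import MarioEnv  # hypothetical import; substitute the real wrapper

env = MarioEnv()  # assumed to expose reset(), step(action) and get_random_actions()

if __name__ == '__main__':
    main()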
def learn(env, MAX_EPISODE, EPS_START, EPS_END, EPS_DECAY, ACTION_NUM,
          REPLAY_MEMORY_CAPACITY, BATCH_SIZE, LOSS_FUNCTION, OPTIM_METHOD,
          LEARNING_RATE, GAMMA, NET_COPY_STEP, OBSERVE, TRAIN_FREQ, PATH):
    ### initialization ###
    # (action_button, action_onehot) pairs
    # The actions are: stand still, walk left, walk right, jump, jump left, jump right.
    action_space = [(0,  Tensor([1, 0, 0, 0, 0, 0])),
                    (3,  Tensor([0, 1, 0, 0, 0, 0])),
                    (7,  Tensor([0, 0, 1, 0, 0, 0])),
                    (11, Tensor([0, 0, 0, 1, 0, 0])),
                    (4,  Tensor([0, 0, 0, 0, 1, 0])),
                    (8,  Tensor([0, 0, 0, 0, 0, 1]))]
    value_net = dqn_net(ACTION_NUM)
    target_net = dqn_net(ACTION_NUM)
    if torch.cuda.is_available():
        value_net.cuda()
        target_net.cuda()
    if os.path.isfile(PATH):
        value_net.load_state_dict(torch.load(PATH))
    buffer = replay_memory(REPLAY_MEMORY_CAPACITY)

    env.reset()
    obs, _, _, _, _, _, _ = env.step(0)
    obs = ob_process(obs)
    obs4 = torch.cat(([obs, obs, obs, obs]), dim=0)  # {Tensor} of shape torch.Size([4, 84, 84])
    judge_distance = 0
    episode_total_reward = 0
    epi_total_reward_list = []
    mean_reward_list = []

    # counters #
    time_step = 0
    update_times = 0
    episode_num = 0
    history_distance = 200

    while episode_num <= MAX_EPISODE:
        ### choose an action with epsilon-greedy ###
        prob = random.random()
        threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1 * episode_num / EPS_DECAY)
        if prob <= threshold:
            action_index = np.random.randint(6)
            action_button = action_space[action_index][0]  # {int}
            action_onehot = action_space[action_index][1]  # {Tensor}
        else:
            action_button, action_onehot = value_net.select_action(obs4)

        ### do one step ###
        obs_next, reward, done, _, max_distance, _, now_distance = env.step(action_button)
        obs_next = ob_process(obs_next)
        obs4_next = torch.cat(([obs4[1:, :, :], obs_next]), dim=0)
        buffer.add(obs4.unsqueeze(0), action_onehot.unsqueeze(0), obs4_next.unsqueeze(0),
                   Tensor([reward]).unsqueeze(0), done)
        episode_total_reward += reward

        if now_distance <= history_distance:
            judge_distance += 1
        else:
            judge_distance = 0
            history_distance = max_distance
        '''the transition added to buffer
        obs4:     {ndarray} size (4, 84, 84)
        action:   {list} size 6, one-hot, e.g. [1, 0, 0, 0, 0, 0]
        obs_next: {ndarray} size (84, 84)
        reward:   {int}
        done:     {bool}
        '''

        ### go to the next state ###
        if done == False:
            obs4 = obs4_next
            time_step += 1
        elif done == True or judge_distance > 50:
            env.reset()
            obs, _, _, _, _, _, _ = env.step(0)
            obs = ob_process(obs)
            obs4 = torch.cat(([obs, obs, obs, obs]), dim=0)
            episode_num += 1
            history_distance = 200
            # plot graph #
            epi_total_reward_list.append(episode_total_reward)
            mean100 = np.mean(epi_total_reward_list[-101:-1])
            mean_reward_list.append(mean100)
            plot_graph(mean_reward_list)
            print('episode %d total reward=%.2f' % (episode_num, episode_total_reward))
            episode_total_reward = 0

        ### do one step update ###
        if time_step >= OBSERVE and time_step % TRAIN_FREQ == 0:
            batch_transition = buffer.sample(BATCH_SIZE)
            '''{Transition}
            0: {tuple} of {Tensor} shape torch.Size([4, 84, 84])
            1: {tuple} of {Tensor} shape torch.Size([6])
            2: {tuple} of {Tensor} shape torch.Size([4, 84, 84])
            3: {tuple} of {int}
            4: {tuple} of {bool}
            '''
            value_net.update(samples=batch_transition, loss_func=LOSS_FUNCTION,
                             optim_func=OPTIM_METHOD, learn_rate=LEARNING_RATE,
                             target_net=target_net, BATCH_SIZE=BATCH_SIZE, GAMMA=GAMMA)
            update_times += 1

            ### copy value net parameters to target net ###
            if update_times % NET_COPY_STEP == 0:
                target_net.load_state_dict(value_net.state_dict())
                torch.save(value_net.state_dict(), PATH)
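# --- ob_process sketch (assumption) -----------------------------------------
# learn() above calls an ob_process() helper that is not defined in this file.
# Since obs4 is stacked to torch.Size([4, 84, 84]), ob_process presumably turns
# one raw frame into a single 84x84 channel in [0, 1]. A minimal sketch under
# that assumption (not the project's actual implementation):
import cv2
import numpy as np
import torch

def ob_process(frame):
    """Convert one raw frame into a (1, 84, 84) float Tensor in [0, 1]."""
    if frame.ndim == 3:                                   # colour frame -> grayscale
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
    scaled = resized.astype(np.float32) / 255.0
    return torch.from_numpy(scaled).unsqueeze(0)          # shape (1, 84, 84)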
def learn(env, MAX_EPISODE, EPS_START, EPS_END, EPS_DECAY, LEARNING_RATE, GAMMA):
    # Lookup tables between the reduced action set and the full button indices
    # (not used below).
    mapping_reduced_action = [3, 7, 11, 4, 10]
    Un_mapping_reduced_action = [100, 100, 100, 0, 3, 100, 100, 1, 100, 100, 4, 2]

    ### initialization ###
    env.reset()
    obs, _, _, _, _, _, _, _, _, _ = env.step(0)
    judge_distance = 0
    episode_total_reward = 0
    no_states_observed = 1
    epi_total_reward_list = []
    mean_reward_list = []
    filename = 'State_Q_Table.csv'
    LEARNING_RATE_CTR = [np.zeros(6), np.zeros(6)]

    # Try to resume from a previously saved state/Q table.
    try:
        state_table = np.loadtxt(filename, delimiter=",", usecols=[0])
        state_table = state_table.astype(int)
        state_table = state_table.tolist()
        q_table = np.loadtxt(filename, delimiter=",", usecols=[1, 2, 3, 4, 5, 6])
        LEARNING_RATE_CTR = q_table * 0
        q_table = q_table.tolist()
        LEARNING_RATE_CTR = LEARNING_RATE_CTR.tolist()
        no_states_observed = len(state_table) - 1
    except:
        print('warning: Error %s: Loading State, Action Table' % filename)
        state_table = [0, 1]
        q_table = [np.random.rand(6), np.random.rand(6)]
    if (state_table == [] or q_table == []):
        state_table = [0, 1]
        q_table = [np.random.rand(6), np.random.rand(6)]
        LEARNING_RATE_CTR = [np.zeros(6), np.zeros(6)]

    # counters #
    time_step = 0
    update_times = 0
    episode_num = 0
    history_distance = 200
    index_s = 0
    state_d_current = state_table[index_s]

    f_handle = open(filename, 'w')
    f_handle_Evo = open('State_Q_Table_ev.csv', 'a')
    controller_speed_ctr = 0
    reward_collection = 0
    print(state_table)
    print(q_table)

    while episode_num <= MAX_EPISODE:
        ### choose an action with epsilon-greedy ###
        prob = random.random()
        threshold = EPS_END  # + (EPS_START - EPS_END) * math.exp(-1 * episode_num / EPS_DECAY)
        # action_onehot = action_space[0][1]  # {Tensor}
        # if (controller_speed_ctr == 0):
        reward_collection = 0
        if prob <= threshold:
            action_button_d = np.random.randint(6)
        else:
            action_button_d = np.argmax(q_table[index_s])
        np.savetxt(f_handle_Evo,
                   [np.concatenate([[state_d_current, action_button_d], q_table[index_s]])],
                   fmt='%1.6f', delimiter=',')

        # Repeat the chosen action for four environment steps and accumulate the
        # discrete reward.
        obs_next, reward, done, _, max_distance, _, now_distance, reward_d, state_d_next, keyboard_keys = env.step(action_button_d)
        reward_collection += reward_d
        obs_next, reward, done, _, max_distance, _, now_distance, reward_d, state_d_next, keyboard_keys = env.step(action_button_d)
        reward_collection += reward_d
        obs_next, reward, done, _, max_distance, _, now_distance, reward_d, state_d_next, keyboard_keys = env.step(action_button_d)
        reward_collection += reward_d
        obs_next, reward, done, _, max_distance, _, now_distance, reward_d, state_d_next, keyboard_keys = env.step(action_button_d)
        reward_collection += reward_d

        if state_d_next in state_table:
            # start = time.clock()
            next_index = state_table.index(state_d_next)
            current_index = state_table.index(state_d_current)
            current_value = q_table[current_index][action_button_d]
            LEARNING_RATE_CTR[current_index][action_button_d] += 1
            # Decay the learning rate with the visit count of this state-action pair.
            LEARNING_RATE_S_A = LEARNING_RATE / LEARNING_RATE_CTR[current_index][action_button_d]
            # print(LEARNING_RATE_S_A)
            # Tabular Q-learning update.
            q_table[current_index][action_button_d] = current_value + LEARNING_RATE_S_A * (
                reward_d + GAMMA * (max(q_table[next_index])) - current_value)
            # print(current_value + LEARNING_RATE * (reward_d + GAMMA * (max(q_table[next_index])) - current_value))
            index_s = next_index
            # print(q_table[current_index])
            # print(current_index)
            # print(time.clock() - start)
            # print(np.concatenate(([state_table[current_index]], [reward_d], q_table[current_index])))
        else:
            # Unseen state: append a new row to the state table and the Q table.
            state_table.append(state_d_next)
            q_table.append(np.random.rand(6))
            LEARNING_RATE_CTR.append(np.zeros(6))
            no_states_observed = len(state_table) - 1  # no_states_observed + 1
            index_s = no_states_observed
            # print(no_states_observed)
            print(np.concatenate(([state_table[no_states_observed]], [reward_d], q_table[no_states_observed])))

        state_d_current = state_d_next
        episode_total_reward += reward_d

        if now_distance <= history_distance:
            judge_distance += 1
        else:
            judge_distance = 0
            history_distance = max_distance

        ### go to the next state ###
        if done == False:
            # obs4 = obs4_next
            time_step += 1
        elif done == True or judge_distance > 50:
            env.reset()
            obs, _, _, _, _, _, _, _, _, _ = env.step(0)
            episode_num += 1
            history_distance = 200
            # plot graph #
            epi_total_reward_list.append(episode_total_reward)
            mean100 = np.mean(epi_total_reward_list[-101:-1])
            mean_reward_list.append(mean100)
            plot_graph(epi_total_reward_list)
            print('episode %d total reward=%.2f' % (episode_num, episode_total_reward))
            episode_total_reward = 0
            np.savetxt(f_handle, np.column_stack((state_table, q_table)),
                       fmt=','.join(['%i'] + ['%1.6f'] * 6), delimiter=',')
            np.savetxt('Reward.csv', epi_total_reward_list, fmt='%1.6f')
            np.savetxt('LR_CTR.csv', LEARNING_RATE_CTR, fmt='%i', delimiter=',')

    f_handle.close()
    f_handle_Evo.close()
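# --- Q-update in isolation (illustration) -----------------------------------
# The core of the tabular learn() above is a standard Q-learning update with a
# learning rate decayed by the visit count of the state-action pair. The helper
# below restates just that rule for clarity; it is an illustration and is not
# called by the code above.
def q_update(q_row_current, q_row_next, action, reward, gamma, base_lr, visit_count):
    """Return the new Q(s, a) value for the current state-action pair.

    q_row_current: Q-values of the current state (list or 1-D array)
    q_row_next:    Q-values of the next state
    visit_count:   times (s, a) has been taken so far, including this step
    """
    lr = base_lr / visit_count                    # LEARNING_RATE_S_A in learn()
    td_target = reward + gamma * max(q_row_next)  # one-step bootstrapped target
    return q_row_current[action] + lr * (td_target - q_row_current[action])

# Example call of the tabular learn() (hyperparameter values are placeholders,
# not the project's settings):
# learn(env, MAX_EPISODE=1500, EPS_START=1.0, EPS_END=0.1, EPS_DECAY=200,
#       LEARNING_RATE=1.0, GAMMA=0.99)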