def main():
    p1 = Agent()
    p2 = Agent()
    e = Env()
    p1.setSymbol(e.x)
    p2.setSymbol(e.o)
    p1.setV(e.initValues(p1.symbol))
    p2.setV(e.initValues(p2.symbol))
    # train the two agents against each other via self-play
    for i in range(10000):
        if i % 1000 == 0:
            print("epoch: {}".format(i))
        play_game(p1, p2, Env())
    print("Training Complete")
    # let a human play against the trained agent
    human = Human()
    human.set_symbol(e.o)
    p1.verbose = True
    while True:
        play_game(p1, human, Env(), draw=True)
        answer = input("Play again? [y,n] :")
        if answer and answer.lower()[0] == 'n':
            break
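# play_game() is referenced above but not defined in this snippet. The sketch below is
# an assumption about what such a helper usually looks like for tabular tic-tac-toe
# self-play: the Env methods (game_over, get_state, draw_board) and the agent methods
# (take_action, update_state_history, update) are hypothetical names, not the
# original implementation.
def play_game(p1, p2, env, draw=False):
    current_player = None
    while not env.game_over():
        # alternate turns; p1 always moves first
        current_player = p2 if current_player == p1 else p1
        if draw:
            env.draw_board()
        current_player.take_action(env)
        # both players record the resulting state for their value updates
        state = env.get_state()
        p1.update_state_history(state)
        p2.update_state_history(state)
    # value-function update at the end of the episode
    p1.update(env)
    p2.update(env)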
def testEnv():
    env = Env()
    channelThroughPut = 0  # fraction of time slots in which packets are successfully
    # delivered over the channel, i.e. no collisions and no idle slots
    for iteration in range(config.Iterations):
        for t in range(config.TimeSlots):
            initialState = env.reset()
            for user in range(config.N):
                action = slottedAlohaProtocol()
                env.step(action=action, user=user)
                # each user updates the inner state of the environment; the environment
                # uses this inner state to keep track of the channels and of the ACK
                # signal for each user
            nextStateForEachUser, rewardForEachUser = env.getNextState()
            # a reward of one means a packet was successfully delivered over the channel,
            # so the sum is at most the number of channels, config.K
            channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
    # estimate the expected value
    channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
    print("Channel Utilization average {}".format(channelThroughPut))
    ToPlotX = range(config.Iterations * config.TimeSlots)
    ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
    plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
               xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")


# def testTimeEnv():
#     env = TimeDependentEnv()
#     channelThroughPut = 0  # fraction of time slots in which packets are successfully
#     # delivered over the channel, i.e. no collisions and no idle slots
#     for iteration in range(config.Iterations):
#         TimeSPU = env.reset()
#         for t in range(config.TimeSlots):
#             env.resetTimeStep()
#             # reset the internal state of the environment, which keeps track of the
#             # users' actions throughout the time step
#             for user in range(config.N):
#                 action = slottedAlohaProtocol()
#                 env.step(action=action, user=user)
#                 # each user updates the inner state of the environment; the environment
#                 # uses this inner state to keep track of the channels and of the ACK
#                 # signal for each user
#             nextStateForEachUser, rewardForEachUser = env.tstep(timestep=t)
#             # a reward of one means a packet was successfully delivered over the channel,
#             # so the sum is at most the number of channels, config.K
#             channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
#     # estimate the expected value
#     channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
#     print("Channel Utilization average {}".format(channelThroughPut))
#     ToPlotX = range(config.Iterations * config.TimeSlots)
#     ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
#     plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
#                xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")
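# slottedAlohaProtocol() is called above but not defined in this snippet. Below is a
# minimal sketch of a slotted-ALOHA user policy, under the assumption that an action
# is the index of the channel to transmit on (1..config.K) and 0 means "stay silent".
# The transmission-probability name config.ALOHA_p is an assumption, not part of the
# original config.
def slottedAlohaProtocol():
    # with probability p, transmit on a uniformly random channel; otherwise stay idle
    if np.random.rand() < config.ALOHA_p:
        return np.random.randint(1, config.K + 1)
    return 0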
import os
from configparser import ConfigParser


def env():
    config = ConfigParser()
    config.read(["data.ini"])
    # 1. Read via an environment variable: set the "env" environment variable to the
    #    name of an environment section defined in data.ini, e.g. env=test_env
    api_root_url = config[os.environ['env']]['api_root_url']
    # 2. Read directly
    # api_root_url = config['test_env']['api_root_url']
    yield Env(api_root_url=api_root_url,
              username=os.environ["username"],
              password=os.environ["password"])
def test_agents():
    result = np.zeros([5, 5])
    maps, trials_per_map = 10, 10
    ave_cost_1 = []
    ave_cost_2 = []
    ave_cost_3 = []
    for j in range(maps):
        en = Env(50)
        cost_1 = []
        cost_2 = []
        cost_3 = []
        # cost_4 = []
        for k in range(trials_per_map):
            print(f'map: {j + 1}/{maps}, play: {k + 1}/{trials_per_map}')
            # en.set_target_on_type(i)
            en.set_target()
            en.print_target()
            agent_1 = Agent(en)
            searches_1, distance_1 = agent_1.run(1, False)
            sum_1 = searches_1 + distance_1
            cost_1.append(sum_1)
            agent_2 = Agent(en)
            searches_2, distance_2 = agent_2.run(2, False)
            sum_2 = searches_2 + distance_2
            cost_2.append(sum_2)
            agent_3 = Agent(en)
            searches_3, distance_3 = agent_3.run_improved(10000)
            sum_3 = searches_3 + distance_3
            cost_3.append(sum_3)
        ave_cost_1.append(sum(cost_1) / len(cost_1))
        ave_cost_2.append(sum(cost_2) / len(cost_2))
        ave_cost_3.append(sum(cost_3) / len(cost_3))
    result[0][1] = sum(ave_cost_1) / len(ave_cost_1)
    result[0][2] = sum(ave_cost_2) / len(ave_cost_2)
    result[0][3] = sum(ave_cost_3) / len(ave_cost_3)
    print(result)
import copy
import pickle

import pylab
import numpy as np
import tensorflow as tf

from Environment import Env
from Agent import PG
from Agent import TUC

np.random.seed(0)
EPISODES = 50

env = Env()
agent = PG()
EP_reward_sums, episodes = [], []
# agent.save_model("./model_init/PG1")
agent.load_model("./model_init/PG1")

# Session settings
GPU_mem_ratio = 0.2
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_mem_ratio)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Create recomposed transition critic
state_dim = 22
hidden_dim = 3
critic_hidden_dim = 2
action_dim = 5
tuc = TUC(sess, "TUC", state_dim, action_dim, 0.003)
# tuc.save_model("./model_init/TUC1")
class HPS:  # hyper-parameter container (the class/__init__ header is inferred from the hps = HPS() call below)
    def __init__(self):
        self.nb_episodes_random = 100
        self.nb_episodes = 100
        self.batch_size = 64
        self.mission_file = './maze.xml'
        self.memory_capacity = 100000
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0.2
        self.huber_loss_delta = 2.0
        self.update_target_frequency = 25
        self.max_epsilon = 0.7
        self.min_epsilon = 0.1
        self.decreasing_rate = -math.log(0.01) / self.nb_episodes


hps = HPS()

# plot the exponentially decaying exploration schedule
plt.plot(hps.min_epsilon + (hps.max_epsilon - hps.min_epsilon)
         * np.exp(-hps.decreasing_rate * np.arange(hps.nb_episodes)))

env = Env(hps.mission_file)

# fill the replay memory with random episodes first
randomAgent = RandomAgent(hps)
play(env, hps, randomAgent, hps.nb_episodes_random, train=True)

Agent = DDQNPER_Agent(hps)
Agent.memory = randomAgent.memory  # hand the pre-filled replay memory to the learning agent
# Agent.load()
# Agent.save()
# Agent.epsilon = 0.15

play(env, hps, Agent, hps.nb_episodes, train=True, save_victory=False)
# play(env, hps, Agent, 40, train=False, save_victory=True)
# plt.plot(Agent.losses)
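# The schedule plotted above decays epsilon from max_epsilon towards min_epsilon so
# that roughly 1% of the initial gap remains after nb_episodes, since decreasing_rate
# is -log(0.01) / nb_episodes. A per-episode helper is sketched below; the function
# name is hypothetical and only restates the plotted formula.
def epsilon_for_episode(hps, episode):
    # exponential decay from max_epsilon towards min_epsilon
    return hps.min_epsilon + (hps.max_epsilon - hps.min_epsilon) * math.exp(-hps.decreasing_rate * episode)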
def empezarPrueba():
    env = Env()
    env.width = 10
    env.height = 6
    env.posY = 6

    # Q-table: holds the Q-value for every (state, action) pair
    qtable = np.random.rand(env.stateCount, env.actionCount).tolist()

    epochs = 100   # number of iterations the algorithm will run
    gamma = 0.8    # discount factor that reduces rewards exponentially as more actions are taken
    epsilon = 0.1
    decay = 0.1

    print("Initial map")
    # Build the map for this particular problem
    env.crearMapaPrueba()

    for i in range(epochs):
        # Reset the environment at the start of each iteration
        state, reward, done = env.reset()
        steps = 0
        # Keep taking actions and changing state until the algorithm reaches the goal state
        while not done:
            print("epoch #", i + 1, "/", epochs)
            time.sleep(0.05)
            # Draw the new position of the A on the map
            env.modificaMapa(i + 1)
            # Count the steps taken to reach the goal
            steps += 1
            # If epsilon is greater than the randomly generated number, take a random action
            if np.random.uniform() < epsilon:
                action = env.randomAction()
            # Otherwise pick the action with the largest value in the Q-table
            else:
                action = qtable[state].index(max(qtable[state]))
            # Compute the next state, the reward obtained, and whether the iteration is finished
            next_state, reward, done = env.step(action)
            # Make the reward smaller on every step
            reward = reward - (steps * 0.3)
            # Update the Q-table with the Bellman-style target
            pos = reward * (gamma**steps / 2) + 0.9 * max(qtable[next_state])
            qtable[state][action] = pos
            # When the algorithm finishes, draw the A at its final position
            if done:
                env.fin(i + 1)
            # On the last epoch, show the final map in Tkinter
            if done and i + 1 == epochs:
                time.sleep(30)
                tk.mainloop()
            # Update the state
            state = next_state
        # Epsilon shrinks every iteration so the algorithm makes fewer random choices
        epsilon -= decay * epsilon
        print("\nDone in", steps, "steps")
        time.sleep(0.8)
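# The update used above is a custom variant. For reference, a sketch of the standard
# tabular Q-learning (Bellman) update is shown below; the learning-rate name alpha is
# an assumption and the function is not part of the original code.
def q_learning_update(qtable, state, action, reward, next_state, alpha=0.1, gamma=0.8):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    td_target = reward + gamma * max(qtable[next_state])
    qtable[state][action] += alpha * (td_target - qtable[state][action])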
import random

from Environment import MetaEnvironment as Env

f = open("query.txt")
queryList = []
for line in f.readlines():
    line = line.strip()
    queryList.append(line)

env = Env(5)
for i in range(5):
    traceList = queryList[i * 10:(i + 1) * 10]
    state = env.state(traceList)
    print(state)
    moveList = [
        random.randint(0, env.server_num - 1) for _ in range(len(env.nodes))
    ]
    env.take_actions(moveList)
    print('Loc:', env.locality())
    print('Load:', env.load())
import pandas as pd
from tqdm import tqdm
from collections import namedtuple

StateVars = namedtuple('state_vars', ['curr_state', 'prev_state_hash', 'reward'])

from Environment import State, Env

need = pd.read_csv('../fake_4region_trip_20170510.csv')
# dist = pd.read_csv('fake_4region_distance.csv')
# dist = dist.values

eps_num = 4
car_num = 1
env = Env(initial_region_state=[15, 15, 15, 15],
          capacity_each_step=10,
          max_episode=eps_num,
          car_count=car_num,
          need=need)

history = {i: dict() for i in range(8)}

for region in range(env.region_count):
    state = env.new_state()
    curr_state_hash = state.get_hash()
    state.out_stage()
    for car in range(env.car_num):
        for move in range(-env.capacity_each_step, env.capacity_each_step + 1):
            if state.check_feasible(region, car, move):
                new_state = state.step(region, car, move)
                new_state.in_stage()
                new_state_hash = new_state.get_hash()
from Environment import Env
import numpy as np
import pandas as pd

initial_region_state = [15, 15, 15, 15]
capacity_each_step = 10
max_episode = 5
car_count = 1
need = pd.read_csv('../fake_4region_trip_20170510.csv')
env = Env(initial_region_state, capacity_each_step, max_episode, car_count, need)

NUM_ACTIONS = (2 * env.capacity_each_step + 1) * env.region_count  # a move in [-500, 500] for each of the 4 regions
NUM_STATES = 2 * env.region_count + 7  # MountainCar-v0: (2,)

history_dict = {0: dict(), 1: dict(), 2: dict(), 3: dict(), 4: dict(), 5: dict(), 6: dict(), 7: dict()}
history_action = {0: dict(), 1: dict(), 2: dict(), 3: dict(), 4: dict(), 5: dict(), 6: dict(), 7: dict()}

state = env.init()
print(state)
for action in range(NUM_ACTIONS):
    env.reset()
    env.pre_step()
    move = action % (2 * env.capacity_each_step + 1) - env.capacity_each_step
    region = int(np.floor(action / (2 * env.capacity_each_step + 1)))
    if env.check_feasible(env.state, region, 0, move):
        state, reward, recent_R = env.step(region, 0, move)
        # the stored value is a (reward, recent_R) tuple, so compare against its first element
        if (state in history_dict[0] and history_dict[0][state][0] < reward) \
                or state not in history_dict[0]:
            history_dict[0][state] = (reward, recent_R)        # record state -> (reward, R)
            history_action[0][state] = (move, region, reward)  # record state -> (move, region, reward)
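# The loop above flattens a (region, move) pair into a single action index. The helper
# pair below makes that mapping explicit; the function names are hypothetical and only
# restate the arithmetic already used in the loop.
def decode_action(action, capacity):
    # flat action index -> (region, move)
    span = 2 * capacity + 1
    move = action % span - capacity
    region = action // span
    return region, move


def encode_action(region, move, capacity):
    # (region, move) -> flat action index in [0, (2*capacity+1)*region_count)
    span = 2 * capacity + 1
    return region * span + (move + capacity)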
def transcate_DDPG(self):
    BATCH_SIZE = 32
    total_steps = 0  # step counter; one day is one step
    profit_list = []  # total profit of each game
    profitAdvanced_list = []
    actions = 2  # number of actions
    s_dim = 87
    a_dim = 1
    brain = DDPG(
        a_dim=a_dim,
        s_dim=s_dim,
        a_bound=1.,
        LR_A=0.001,
        LR_C=0.001,
        GAMMA=.99,
        TAU=0.01,
        # replacement=REPLACEMENT,
    )
    gameNum = 0  # number of games played
    ex_steps = 500  # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]  # stores each reward, used to compute the baseline
    Loss_list = []  # stores the losses during training
    wait_list = []  # records the waiting days of each of the N games
    gameSplit = 5000  # plot every this many games
    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0
        # one game
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = []
        wait_day = []  # records which days this game waited
        while today < self.routeline[-1] and terminal == False:
            # a new order arrives (no new order is received here once 10 orders are already open)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True
            # iterate over the self.orders dict (i.e. state[0]) and handle each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))
            # the neural network chooses the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                # end_date = random.randrange(env.getTodayIndex(), 87, 1)
                end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model = brain.choose_action(state_tf)
                # print(action_model)
            wait_day.append(env.getTodayIndex())
            # order dict, history curve, reward
            reward = env.getReward(action_model)
            tao_reward.append(reward)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            state_ = env.getNextState(action_model)
            if len(state_) == 1:
                state_ = copy.deepcopy(state)
            brain.store_transition(state, action_model, reward, state_)
            # profitAdvanced_list.append(td_error[0][0])
            if brain.pointer > brain.MEMORY_CAPACITY:
                # print(b_s_)
                brain.learn()
            total_steps += 1
            if terminal:
                # wait_list.append(wait_day[-1])
                # loss = brain.learn()
                # Loss_list.append(loss)
                break
            # advance one day per step
            env.nextStep()
        # total profit of the episode
        epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("TD_Error:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps,
                          profit_list, profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")
        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
        if len(profit_list) >= 500:
            profit_list.clear()
            wait_list.clear()
def transcate_PG(self):
    total_steps = 0  # step counter; one day is one step
    profit_list = []  # total profit of each game
    profitAdvanced_list = []
    actions = 2  # number of actions
    brain = PolicyGradient(
        n_actions=2,
        n_features=87,
        learning_rate=0.1,
        reward_decay=1,
    )
    gameNum = 0  # number of games played
    ex_steps = 500  # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]  # stores each reward, used to compute the baseline
    Loss_list = []  # stores the losses during training
    wait_list = []  # records the waiting days
    gameSplit = 500  # plot every this many games
    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0
        # one game
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = 0
        wait_day = []
        while today < self.routeline[-1] and terminal == False:
            # a new order arrives (no new order is received here once 10 orders are already open)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True
                # print(self.order[1])
            # iterate over the self.orders dict (i.e. state[0]) and handle each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))
            # the neural network chooses the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                end_date = random.randrange(env.getTodayIndex(), 87, 1)
                # end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model, p = brain.choose_action(state_tf, env.getTodayIndex())
                tao_prob.append(p)
            if action_model == 0:
                action_finishOrder = [1, 0]
            else:
                action_finishOrder = [0, 1]
            # order dict, history curve, reward
            reward = env.getReward(action_model)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            if terminal:
                tmp = reward
                baseline = np.mean(reward_list)
                profitAdvanced_list.append(baseline)
                reward -= baseline
                reward_list.append(tmp)
                # print("END_REWARD:", reward, ",reward_list:", reward_list)
            # save the record to the memory
            # print("this is store arg:", state_tf, ";", action_model, ";", reward, ";", env.getTodayIndex())
            brain.store_transition(state_tf, action_model, reward, env.getTodayIndex())
            # print(action_model)
            total_steps += 1
            if terminal:
                loss, wait_day, tao_reward = brain.learn()
                Loss_list.append(loss)
                wait_list.append(wait_day[-1])
                break
            # advance one day per step
            env.nextStep()
        # total profit of the episode
        epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("Baseline:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps,
                          profit_list, profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")
        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
            profit_list.clear()
            wait_list.clear()
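# In transcate_PG above, the terminal reward is turned into an advantage by subtracting
# a running-average baseline (baseline = np.mean(reward_list)). A standalone sketch of
# that computation is shown below; the function name is hypothetical and only restates
# what the loop already does.
def advantage_with_baseline(reward, reward_history):
    # baseline = average of previously observed terminal rewards
    baseline = float(np.mean(reward_history)) if reward_history else 0.0
    reward_history.append(reward)
    return reward - baseline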
    utility_next = utility_matrix[next_state]
    delta = reward + gamma * utility_next - utility
    utility_matrix[state] += alpha * delta
    return utility_matrix, delta


def updateActor(state_action_matrix, state, action, delta):
    beta = 1
    state_action_matrix[state, action] += beta * delta
    return state_action_matrix


if __name__ == '__main__':
    environment = Env()
    alpha = 0.1
    gamma = 0.99
    epsilon = 0.01
    number_of_episodes = 10000
    state_action_pairs = numpy.full(
        (environment.normalizedtree.getNumberOfNodes(), 2), 0.5)
    utility_matrix = numpy.zeros(
        [environment.normalizedtree.getNumberOfNodes()])
    softmax = lambda vals: numpy.exp(vals - numpy.max(vals)) / numpy.sum(
        numpy.exp(vals - numpy.max(vals)))
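    # The softmax defined above is presumably used to turn the actor's preferences in
    # state_action_pairs into a sampling distribution over the two actions. Below is a
    # sketch of that selection step, as an assumption about how the training loop (not
    # included in this excerpt) would use it; the function name is hypothetical.
    def select_action(state_action_pairs, state):
        probabilities = softmax(state_action_pairs[state, :])
        return numpy.random.choice(state_action_pairs.shape[1], p=probabilities)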