def perform_rollout(self, theta, inner=False):
    memory = Memory(self.hp)
    (s1, s2), _ = self.env.reset()
    for t in range(self.hp.len_rollout):
        a1, lp1 = self.act(s1, self.theta)   # own action and log-prob under self.theta
        a2, lp2 = self.act_opp(s2, theta)    # opponent action and log-prob under the given theta
        if self.id > 0:
            # This agent is the second player, so the env returns observations/rewards in swapped order.
            (s2, s1), (r2, r1), _, _ = self.env.step((a2, a1))
        else:
            (s1, s2), (r1, r2), _, _ = self.env.step((a1, a2))
        r1 = torch.Tensor(r1)
        r2 = torch.Tensor(r2)
        if inner:
            # Inner rollout: store the trajectory from the opponent's perspective.
            memory.add(lp2, lp1, r2)
        else:
            memory.add(lp1, lp2, r1)
    return memory
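# --- Hedged sketch, not part of the original source ---
# perform_rollout above assumes a Memory container that stores, per step, the
# acting agent's log-probs, the opponent's log-probs, and the acting agent's
# rewards. The class below is a minimal, hypothetical version of that
# interface; the real Memory in this codebase may store additional fields or
# build its rollout objective differently.
import torch


class Memory:
    def __init__(self, hp):
        self.hp = hp
        self.self_logprobs = []   # log pi(a_t | s_t) of the learning agent
        self.other_logprobs = []  # log pi(a_t | s_t) of the opponent
        self.rewards = []         # per-step rewards of the learning agent

    def add(self, logprob, other_logprob, reward):
        self.self_logprobs.append(logprob)
        self.other_logprobs.append(other_logprob)
        self.rewards.append(reward)

    def stacked(self):
        # Stack per-step tensors so a policy-gradient loss can be computed
        # over the whole rollout in one vectorized pass.
        return (torch.stack(self.self_logprobs, dim=0),
                torch.stack(self.other_logprobs, dim=0),
                torch.stack(self.rewards, dim=0))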
def Game(max_ep_len=1000, num_frames=4):
    global exit_game
    global actions

    env = gym.make('CarRacing-v0')
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    print(f"State: {state_dim}")
    print(f"Action: {action_dim}")

    # set keyboard interrupts
    env.reset()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release

    # make global actions array
    actions = np.zeros(4, dtype=np.float32)

    # memory
    memory = Memory()
    memory.create(state_dim, action_dim)

    # logger
    ep_ret_log = []

    # init environment
    obs, ep_ret, ep_len, epoch = env.reset(), 0, 0, 0
    obs = np.expand_dims(obs, axis=0)
    state_stack = np.repeat(obs, num_frames, axis=0)
    print(state_stack.shape)
    print(state_stack.dtype)

    # main loop
    while not exit_game:
        # render window
        env.render()

        # take action
        obs2, r, d, _ = env.step(actions[:3])
        obs2 = np.expand_dims(obs2, axis=0)
        state_stack = np.append(state_stack[1:], obs2, axis=0)

        # statistics
        ep_ret += r
        ep_len += 1

        # Ignore the 'done' signal when the episode is cut off by the time limit
        d = False if ep_len == max_ep_len else d

        # store in memory
        memory.add(state_stack, np.array(actions[:3]), r, d)

        # End of episode
        if d or (ep_len == max_ep_len):
            print(f"Epoch: {epoch}, EpRet: {ep_ret}, EpLen: {ep_len}, ReplayBuff: {len(memory)}")

            # if statistical data exists
            if len(ep_ret_log) > 0:
                log = np.array(ep_ret_log)
                print("AvgEpRet:", log.mean())
                print("StdEpRet:", log.std())
                print("MaxEpRet:", log.max())
                print("MinEpRet:", log.min())
                print()

            ep_ret_log.append(ep_ret)
            obs, ep_ret, ep_len = env.reset(), 0, 0
            obs = np.expand_dims(obs, axis=0)
            state_stack = np.repeat(obs, num_frames, axis=0)
            epoch += 1

    print('\n')

    # save the dataset
    memory.save()
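# --- Hedged sketch, not part of the original source ---
# Game() registers key_press/key_release callbacks that are not shown here.
# The handlers below are a plausible implementation that writes into the
# global actions array ([steer, gas, brake, unused]) and sets the exit_game
# flag; the exact key bindings (arrows to drive, ESC to quit) are assumptions,
# and the purpose of actions[3] is not shown in the source.
import numpy as np
from pyglet.window import key

exit_game = False
actions = np.zeros(4, dtype=np.float32)


def key_press(k, mod):
    global exit_game
    if k == key.ESCAPE:
        exit_game = True        # assumed exit binding
    if k == key.LEFT:
        actions[0] = -1.0       # steer left
    if k == key.RIGHT:
        actions[0] = +1.0       # steer right
    if k == key.UP:
        actions[1] = +1.0       # gas
    if k == key.DOWN:
        actions[2] = +0.8       # brake (partial, for smoother control)


def key_release(k, mod):
    if k == key.LEFT and actions[0] == -1.0:
        actions[0] = 0.0
    if k == key.RIGHT and actions[0] == +1.0:
        actions[0] = 0.0
    if k == key.UP:
        actions[1] = 0.0
    if k == key.DOWN:
        actions[2] = 0.0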
def marl_test(config):
    experiment_name = config.setdefault("experiment_name", "")
    time_slots = config.setdefault("time_slots", 10000)
    simulations = config.setdefault("simulations", 3)
    memory_size = config.setdefault("memory_size", 1200)
    pretrain_length = config.setdefault("pretrain_length", 6)
    step_size = config.setdefault("step_size", 5)
    save_freq = config.setdefault("save_freq", 1000)
    save_results = config.setdefault("save_results", True)
    save_model = config.setdefault("save_model", False)
    load_model = config.setdefault("load_model", False)
    load_slot = config.setdefault("load_slot", 4999)
    training = config.setdefault("training", False)
    episode_interval = config.setdefault("episode_interval", 25)
    explore_step = config.setdefault("explore", 2000)
    greedy_step = config.setdefault("greedy", 20000)
    training_stop = config.setdefault("training_stop", 20000)  # Stop the training after this time step.
    train_after_episode = config.setdefault("train_after_episode", False)  # Train after each episode instead of after each time slot.
    global_reward_avg = config.setdefault("global_reward_avg", False)  # Add the average total reward to each UE's reward.
    save_positions = config.setdefault("save_positions", False)  # Log the UE x-positions at every time slot.
    enable_channel = config.setdefault("enable_channel", False)  # Use the channel-aware step function (my_step_ch).
    batch_size = config["RLAgent"]["batch_size"]
    ia_penalty_enable = config.setdefault("ia_penalty_enable", False)
    ia_averaging = config.setdefault("ia_averaging", False)

    for simulation in range(simulations):
        print("-=-=-=-=-=-=-=-=-=-=-= experiment_name: " + experiment_name
              + " SIMULATION " + str(simulation + 1) + " =-=-=-=-=-=-=-=-=-=-=-")

        # Initialize the environment.
        env = TestEnv(**config["EnvironmentTest"])

        if ia_penalty_enable:
            ia_penalty_threshold = config.setdefault("ia_penalty_threshold", 5)
            ia_penalty_value = config.setdefault("ia_penalty_value", -10)
            ia_penalty_counter = {}
            previous_actions = {}  # Store the previous action taken by each UE.
            num_users = env.get_total_users()
            for user in range(num_users):
                ia_penalty_counter[user] = 0
                previous_actions[user] = -1

        # Initialize the agent.
        mainDRQN = DRQN(env, name=experiment_name,
                        total_episodes=time_slots / episode_interval,
                        **config["RLAgent"])
        # mainDRQN = DeepRecurrentQNetwork(env=env, name=experiment_name, **config["RLAgent"])

        if load_model:
            print("Load model DRQN time step " + str(load_slot))
            save_dir = "save_model/" + "test/"
            mainDRQN.load_model(save_dir, load_slot)

        # Experience replay buffer (deque) from which each batch will be sampled
        # and fed to the neural network for training.
        memory = Memory(max_size=memory_size)

        log_reward_slot = []
        log_actions_slot = []
        log_ia_slot = []
        sum_ia_prev = 0
        log_x_positions = []
        start_time = time.time()

        episode = 0  # Used to update the greediness of the algorithm.

        # cumulative reward
        cum_r = [0]
        cum_r_slots = [0]

        # cumulative collision
        cum_collision = [0]
        cum_collision_slots = [0]

        # Input buffer used for predicting the next Q-values.
        history_input = deque(maxlen=step_size)

        # env.network.reset_ia()

        # Sample random actions for each user.
        action = env.sample()
        # obs = env.step(action)
        obs, rews = env.my_step(action, 0)
        rews = list(rews)
        state = env.obtain_state(obs, action, rews)
        # reward = [i[1] for i in obs[:num_users]]

        num_users = env.get_total_users()
        num_channels = env.get_action_space()

        ##############################################
        # Pretraining phase: fill the replay buffer with random transitions.
        for ii in range(pretrain_length * step_size * 5):
            action = env.sample()
            if enable_channel:
                # obs is a list of tuples [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
                obs, reward = env.my_step_ch(action, 0)
            else:
                # obs, reward = env.my_step(action, 0)
                # obs is a list [[(ACK, REW) for each user], CHANNEL_RESIDUAL_CAPACITY_VECTOR]
                obs, reward = env.my_step_design(action, 0)
            # NOTE: these pretrain transitions reuse `rews` from the initial step rather than the fresh `reward`.
            next_state = env.obtain_state(obs, action, rews)
            # next_state = env.state_generator(action, obs)
            memory.add((state, action, rews, next_state))
            state = next_state
            history_input.append(state)
        ##############################################

        # TODO: now load the positions
        env.load_saved_positions()

        for time_step in range(time_slots):
            # Initialize the action vector.
            action = np.zeros([num_users], dtype=np.int32)

            # Convert the input history into a numpy array.
            # TODO: enable below for LSTM
            state_vector = np.array(history_input)  # LSTM
            # state_vector = state  # DQN

            for each_user in range(num_users):
                # action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, time_slot=time_step)
                if time_step < explore_step and not load_model:  # and 0:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector,
                                                              episode=episode, policy="explore")
                elif time_step < greedy_step and not load_model:  # and 0:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector,
                                                              episode=episode)
                else:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector,
                                                              episode=episode, policy="greedy")

            # Take the action predicted from the Q-values and receive the observation from the environment.
            # obs = env.step(action)  # obs is a list of tuples [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
            if save_positions:
                user_pos = env.get_x_pos()
                log_x_positions.append(user_pos)

            if enable_channel:
                # obs is a list of tuples [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
                obs, reward = env.my_step_ch(action, time_step)
            else:
                # obs is a list of tuples [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
                obs, reward = env.my_step(action, time_step)
                # obs, reward = env.my_step_design(action, time_step)

            # TODO: update the env topology after each step.
            log_actions_slot.append(action)

            ia = env.network.get_information_age(time_step)
            ia_sum = calculate_ia_penalty(ia)
            log_ia_slot.append(ia)

            if ia_averaging:
                # IA-based penalty to the reward.
                ia_penalty = 0
                if ia_sum > sum_ia_prev:
                    ia_penalty = -1
                elif ia_sum < sum_ia_prev:
                    ia_penalty = 1
                sum_ia_prev = ia_sum

            # Generate the next state from action and observation.
            # next_state = env.state_generator(action, obs)  # used for DQN
            next_state = env.obtain_state(obs, action, reward, episode, mainDRQN.get_eps())
            # print(next_state)

            # Reward for all users given by the environment.
            # reward = [i[1] for i in obs[:num_users]]

            # Sum of rewards.
            sum_r = np.sum(reward)

            # Cumulative reward.
            cum_r.append(cum_r[-1] + sum_r)
            cum_r_slots.append(cum_r_slots[-1] + sum_r)

            # If NUM_CHANNELS = 2, the total possible reward is 2,
            # therefore collision = (2 - sum_r) or (NUM_CHANNELS - sum_r).
            collision = num_channels - sum_r

            # Cumulative collision.
            cum_collision.append(cum_collision[-1] + collision)
            cum_collision_slots.append(cum_collision_slots[-1] + collision)

            #############################
            # For a co-operative policy we would give the reward sum to each user who
            # contributed to playing co-operatively and 0 to the rest.
            # NOTE: I think I do not need that part since I already use positive and negative rewards.
            for i in range(len(reward)):  # for each user
                # if reward[i] > 0:
                if ia_averaging:
                    # Add a penalty based on the direction of the information age.
                    reward[i] += ia_penalty
                if ia_penalty_enable:
                    if reward[i] < 1 and action[i] == previous_actions[i]:
                        ia_penalty_counter[i] += 1
                    else:
                        ia_penalty_counter[i] = 0
                    if ia_penalty_counter[i] > ia_penalty_threshold:
                        reward[i] = ia_penalty_value
                    previous_actions[i] = action[i]
                if global_reward_avg:
                    reward[i] = reward[i] + sum_r / len(reward)  # Add the average total reward to each UE.
            #############################
            # reward = reward * 2

            log_reward_slot.append(sum_r)
            # print(reward)
            # print("EPOCH " + str(time_step))

            # Add the new experience to the memory buffer as (state, action, reward, next_state) for training.
            memory.add((state, action, reward, next_state))
            state = next_state

            # Add the new experience to generate the input-history sequence for the next state.
            history_input.append(state)

            # Start training.
            if not train_after_episode:
                if time_step < training_stop and training:  # and not load_model:
                    mainDRQN.train(memory, time_step)

            if time_step % episode_interval == episode_interval - 1:
                print("Time step " + str(time_step)
                      + " epsilon " + str(mainDRQN.get_eps())
                      + " cum Collision " + str(cum_collision[episode_interval])
                      + " sum reward " + str(cum_r[episode_interval])
                      + " total time " + str(time.time() - start_time))
                cum_r = [0]
                cum_collision = [0]
                episode += 1

                # Update the velocity of the vehicles if activated.
                env.update_velocity()
                # ia = env.network.get_information_age(time_step)

                if train_after_episode and time_step > (batch_size + 10) and training:
                    mainDRQN.train(memory, time_step)

            if time_step % save_freq == save_freq - 1:
                # Save the results.
                if save_results:
                    print("save results for timestep ", time_step + 1)
                    save_dir = "save_results/" + "test/"
                    save_dir = save_dir + experiment_name
                    if not os.path.isdir(save_dir):
                        os.makedirs(save_dir)

                    # filename = save_dir + "/collisions" + "_" + str(time_step) + "_sim" + str(simulation)
                    # np.save(filename, np.asarray(cum_collision_slots))

                    filename = save_dir + "/rewards" + "_sim" + str(simulation)
                    np.save(filename, np.asarray(log_reward_slot))

                    filename = save_dir + "/actions" + "_sim" + str(simulation)
                    np.save(filename, np.asarray(log_actions_slot))

                    # filename = save_dir + "/time_step" + "_" + str(time_step) + "_sim" + str(simulation)
                    # np.save(filename, np.asarray(str(time.time() - start_time)))

                    filename = save_dir + "/positions" + "_sim" + str(simulation)
                    np.save(filename, np.asarray(log_x_positions))

                    # filename = save_dir + "/ia" + "_sim" + str(simulation)
                    # np.save(filename, np.asarray(log_ia_slot))  # "_" + str(time_step) +

                if save_model:
                    print("save model for timestep ", time_step + 1)
                    save_dir = "save_model/" + "test/"
                    # save_dir = save_dir
                    mainDRQN.save_model(save_dir, time_step, simulation)
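# --- Hedged sketch, not part of the original source ---
# marl_test relies on a calculate_ia_penalty helper whose implementation is not
# shown. Based on how its return value is compared against sum_ia_prev, it
# appears to reduce the per-user information-age vector to one scalar; the
# version below is a plausible stand-in, not the original.
import numpy as np


def calculate_ia_penalty(information_age):
    """Sum the information age over all users so consecutive slots can be compared."""
    return float(np.sum(information_age))


# Hypothetical invocation: only keys actually read by marl_test are shown, and
# the nested "EnvironmentTest"/"RLAgent" dicts must contain whatever fields
# TestEnv and DRQN expect in this codebase.
if __name__ == "__main__":
    config = {
        "experiment_name": "drqn_test",
        "time_slots": 10000,
        "simulations": 1,
        "training": True,
        "EnvironmentTest": {},          # TestEnv(**...) arguments go here
        "RLAgent": {"batch_size": 32},  # DRQN(**...) arguments go here
    }
    marl_test(config)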