def init(self, info):
    self.field = info['field']
    self.max_linear_velocity = info['max_linear_velocity']
    self.goal = info['goal']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False
    self._frame = 0
    self.wheels = [0 for _ in range(25)]
    self.cur_posture = []
    self.prev_posture = []
    self.cur_posture_opp = []
    self.cur_ball = []
    self.prev_ball = []
    self.set_default_formation()
    self.previous_frame = Frame()
    self.frame_skip = 2  # number of frames to skip
    self.obs_size = 3  # state size
    self.act_size = 7  # number of discrete actions

    # for RL
    self.action = 0
    self.previous_action = 0
    self.num_inputs = self.obs_size

    # RL algorithm class
    self.load = True
    self.play = True
    self.trainer = DQN(self.number_of_robots, self.obs_size, self.act_size,
                       self.load, self.play)

    helper.printConsole("Initializing variables...")
def init(self, info):
    self.field = info['field']
    self.max_linear_velocity = info['max_linear_velocity']
    self.robot_size = info['robot_size'][0]
    self.goal = info['goal']
    self.penalty_area = info['penalty_area']
    self.goal_area = info['goal_area']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False
    self._frame = 0
    self.speeds = [0 for _ in range(30)]
    self.cur_posture = []
    self.cur_posture_opp = []
    self.cur_ball = []
    self.previous_ball = []
    self.previous_posture = []
    self.previous_posture_opp = []
    self.predicted_ball = []
    self.idx = 0
    self.idx_opp = 0
    self.previous_frame = Frame()
    self.defense_angle = 0
    self.attack_angle = 0
    self.team = TeamK(self.field, self.goal, self.penalty_area,
                      self.goal_area, self.robot_size, self.max_linear_velocity)

    helper.printConsole("Initializing variables...")
def print_loss(self, loss, iteration):
    if self._iterations % 100 == 0:  # print information every 100 iterations
        helper.printConsole("======================================================")
        helper.printConsole("Epsilon: " + str(self.epsilon))
        helper.printConsole("iterations: " + str(self._iterations))
        helper.printConsole("Loss: " + str(loss))
        helper.printConsole("======================================================")
def init(self, info):
    self.field = info['field']
    self.max_linear_velocity = info['max_linear_velocity']
    self.robot_size = info['robot_size'][0]
    self.goal = info['goal']
    self.penalty_area = info['penalty_area']
    self.goal_area = info['goal_area']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False
    self._frame = 0
    self.speeds = [0 for _ in range(30)]
    self.cur_posture = []
    self.cur_posture_opp = []
    self.cur_ball = []
    self.previous_ball = []
    self.predicted_ball = []
    self.idx = 0
    self.idx_opp = 0
    self.previous_frame = Frame()
    self.defense_angle = 0
    self.attack_angle = 0

    # role indices for the five robots
    self.gk_index = 0
    self.d1_index = 1
    self.d2_index = 2
    self.f1_index = 3
    self.f2_index = 4

    self.GK = Goalkeeper(self.field, self.goal, self.penalty_area,
                         self.goal_area, self.robot_size, self.max_linear_velocity)
    self.D1 = Defender_1(self.field, self.goal, self.penalty_area,
                         self.goal_area, self.robot_size, self.max_linear_velocity)
    self.D2 = Defender_2(self.field, self.goal, self.penalty_area,
                         self.goal_area, self.robot_size, self.max_linear_velocity)
    self.F1 = Forward_1(self.field, self.goal, self.penalty_area,
                        self.goal_area, self.robot_size, self.max_linear_velocity)
    self.F2 = Forward_2(self.field, self.goal, self.penalty_area,
                        self.goal_area, self.robot_size, self.max_linear_velocity)

    helper.printConsole("Initializing variables...")
def init(self, info):
    self.field = info['field']
    self.max_linear_velocity = info['max_linear_velocity']
    self.goal = info['goal']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False
    self._frame = 0
    self.speeds = [0 for _ in range(20)]
    self.cur_posture = []
    self.prev_posture = []
    self.cur_posture_opp = []
    self.cur_ball = []
    self.prev_ball = []
    self.set_default_formation()
    self.previous_frame = Frame()
    self.frame_skip = 2  # number of frames to skip
    self.obs_size = 3  # state size
    self.act_size = 7  # number of discrete actions

    # for RL
    self.number_of_agents = 1  # in this example, just F2 is trained
    self.action = 0
    self.previous_action = 0
    self.state = []
    self.previous_state = []
    self.reward = 0
    self.previous_reward = 0

    # RL algorithm class
    self.trainer = DQN(self.number_of_agents, self.obs_size, self.act_size)

    # log rewards
    self.total_reward = 0
    self.rew = np.zeros(4)  # for logging rewards
    self.t = 0
    self.episode = 1
    self.plot_reward = Logger()
    self.save_png_interval = 10

    helper.printConsole("Initializing variables...")
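# --- Illustrative sketch (not part of the original sources) ---
# The init above assumes a Logger class (self.plot_reward) used together with
# save_png_interval to periodically plot the accumulated episode rewards.
# The class below is a hypothetical minimal version of such a logger, written
# with matplotlib; the method names (update, plot) and the output filename are
# assumptions, not the framework's actual API.
import matplotlib
matplotlib.use('Agg')  # render to a file, no display required
import matplotlib.pyplot as plt


class Logger:
    def __init__(self):
        self.episodes = []
        self.rewards = []

    def update(self, episode, total_reward):
        # record the total reward collected in one episode
        self.episodes.append(episode)
        self.rewards.append(total_reward)

    def plot(self, filename='rewards.png'):
        # save a reward-per-episode curve as a PNG
        plt.figure()
        plt.plot(self.episodes, self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total reward')
        plt.savefig(filename)
        plt.close()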
def __init__(self, n_agents, dim_obs, dim_act, load=False, play=False):
    self.n_agents = n_agents
    self._iterations = 0
    self.update_steps = 100  # update target network
    self.epsilon_steps = 20000  # decrease epsilon
    self.play = play
    if self.play:
        self.epsilon = 0  # greedy choice if play is True
    else:
        self.epsilon = 0.95  # initial epsilon value
    self.final_epsilon = 0.05  # final epsilon value
    self.dec_epsilon = 0.025  # decrease rate of epsilon for every generation
    self.observation_steps = 300  # number of iterations to observe before training every generation
    self.save_num = 10000  # save checkpoint (default: 100)
    self.batch_size = 64
    self.discount_factor = 0.99
    self.num_inputs = dim_obs
    self.act_size = dim_act
    self.memory = Memory(50000)  # replay buffer

    self.net = Agent(self.num_inputs, self.act_size)
    self.target_net = Agent(self.num_inputs, self.act_size)

    self.load = load
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if self.load:
        self.net.load_state_dict(
            torch.load(CHECKPOINT, map_location=torch.device(self.device)))
        helper.printConsole("loading variables...")

    update_target_model(self.net, self.target_net)
    self.net.train()
    self.target_net.train()
    self.net.to(self.device)
    self.target_net.to(self.device)

    self.gamma = 0.99
    self.grad_norm_clip = 10
    self.loss = 0
    self.lr = 0.0001
    self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
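# --- Illustrative sketches (assumptions, not the original implementations) ---
# The constructor above relies on an update_target_model() helper and on an
# epsilon-greedy policy driven by self.epsilon.  Below is a minimal version of
# what such code typically looks like for a DQN: a hard copy of the online
# network's parameters into the target network, and an action selector that
# explores with probability epsilon.  The function names mirror the calls
# above, but their bodies are sketches, not the author's code.
import random

import torch


def update_target_model(net, target_net):
    # hard update: copy every parameter of the online network into the target network
    target_net.load_state_dict(net.state_dict())


def get_action_epsilon_greedy(net, state, epsilon, act_size, device):
    # with probability epsilon pick a random discrete action,
    # otherwise pick the action with the highest predicted Q-value
    if random.random() < epsilon:
        return random.randrange(act_size)
    with torch.no_grad():
        q_values = net(torch.Tensor(state).unsqueeze(0).to(device))
        return int(q_values.view(-1).argmax().item())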
def update_policy(self):
    batch = self.memory.sample(self.batch_size)

    states = torch.Tensor(batch.state).to(self.device)
    next_states = torch.Tensor(batch.next_state).to(self.device)
    actions = torch.Tensor(batch.action).long().to(self.device)
    rewards = torch.Tensor(batch.reward).to(self.device)

    # Q-values of the sampled states from the online network, and the
    # greedy Q-values of the next states from the target network
    q_values = self.net(states).squeeze(1)
    max_next_q_values = self.target_net(next_states).squeeze(1).max(1)[0]

    # pick out the Q-value of the action that was actually taken
    one_hot_action = torch.zeros(self.batch_size, q_values.size(-1)).to(self.device)
    one_hot_action.scatter_(1, actions.unsqueeze(1), 1)
    chosen_q_values = torch.sum(q_values.mul(one_hot_action), dim=1)

    # one-step TD target and squared TD-error loss
    target = rewards + self.discount_factor * max_next_q_values
    td_error = (chosen_q_values - target.detach())
    loss = (td_error ** 2).sum()
    self.loss = loss.cpu().data.numpy()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_norm_clip)
    self.optimizer.step()

    # periodically sync the target network and decay epsilon
    if self._iterations % self.update_steps == 0:
        update_target_model(self.net, self.target_net)
        helper.printConsole("Updated target model.")

    if self._iterations % self.epsilon_steps == 0:
        self.epsilon = max(self.epsilon - self.dec_epsilon, self.final_epsilon)
        helper.printConsole("New Episode! New Epsilon: " + str(self.epsilon))

    return self.loss
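# --- Illustrative sketch (an assumption about the Memory interface) ---
# update_policy() expects Memory(50000).sample(batch_size) to return a batch
# whose .state, .next_state, .action and .reward fields can each be wrapped in
# torch.Tensor().  A minimal replay buffer with that interface, in the style
# of the PyTorch DQN tutorial, could look like this; the push() signature is
# an assumption.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward'))


class Memory:
    def __init__(self, capacity):
        # fixed-size buffer; the oldest transitions are discarded when full
        self.buffer = deque(maxlen=capacity)

    def push(self, state, next_state, action, reward):
        self.buffer.append(Transition(state, next_state, action, reward))

    def sample(self, batch_size):
        # return a Transition of tuples (batch-first), as update_policy() expects
        transitions = random.sample(self.buffer, batch_size)
        return Transition(*zip(*transitions))

    def __len__(self):
        return len(self.buffer)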
def save_checkpoint(self, iteration):
    if iteration % self.save_num == 0:
        self.net.save_model(self.net, CHECKPOINT)
        helper.printConsole("Saved Checkpoint.")
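# --- Illustrative sketch (an assumption about the Agent class) ---
# save_checkpoint() calls self.net.save_model(self.net, CHECKPOINT), and the
# DQN constructor restores the checkpoint with load_state_dict(torch.load(...)),
# so save_model presumably just writes the network's state_dict to disk.  The
# Agent below is a hypothetical stand-in consistent with that loading code;
# the layer sizes are assumptions.
import torch
import torch.nn as nn


class Agent(nn.Module):
    def __init__(self, num_inputs, act_size):
        super().__init__()
        # small fully connected Q-network: observation in, one Q-value per action out
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 64),
            nn.ReLU(),
            nn.Linear(64, act_size),
        )

    def forward(self, x):
        return self.layers(x)

    def save_model(self, net, path):
        # persist only the parameters, so they can be restored with load_state_dict()
        torch.save(net.state_dict(), path)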