def record(self, time, game_tick_packet):
    s = EasyGameState(game_tick_packet, self.team, self.index)
    # trace(s.car.on_ground)
    trace(s.car.pos[0])
    trace(s.car.pos[1])
    time = time - self.record_start_time
    output_vector = (
        round(controller.fThrottle),
        round(controller.fSteer),
        round(controller.fPitch),
        round(controller.fYaw),
        round(controller.fRoll),
        round(controller.bJump),
        round(controller.bBoost),
        round(controller.bHandbrake),
    )
    history_item = historian.HistoryItem(
        float(time),
        # game_tick_packet,
        # output_vector,
    )
    history_item.output_vector = output_vector
    history_item.game_tick_packet = game_tick_packet
    self.history.append(history_item)
    # if self.first_time:
    #     print(history_item.encode())
    #     self.first_time = False
    return output_vector
def get_output_vector(self, game_tick_packet):
    # Comment/uncomment to switch training mode
    output = self.reinforced_play(game_tick_packet)
    # output = self.supervised_play(game_tick_packet)
    # output = self.obvious_play(game_tick_packet)

    # Debugging output every 50 steps
    if self.debug and (self.state['Step'] % 50) == 0:
        trace(np.mean(self.epi_rewards))
        trace(Car(Get_car(game_tick_packet, self.index)).loc.c_2d())

    # Count steps; at MaxStep, train and reset the counter
    if self.state['Step'] >= self.state['MaxStep']:
        # Comment/uncomment to switch training mode
        self.reinforced_train()
        # self.supervised_train()
        # self.obvious_train()
        self.state['TrainStep'] += 1
        self.state['Step'] = 0
        if self.state['TrainStep'] >= self.state['SaveAt']:
            self.model.save("V3_S_{}".format(self.name))
            self.state['TrainStep'] = 0
    else:
        self.state['Step'] += 1
    return self.Format_Output(output)
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    episodes = memory.sample(BATCH_SIZE)
    opt.zero_grad()
    batch_errors = []
    for states, actions, rewards, next_states, dones in episodes:
        actions_tensor = torch.tensor(actions, device=device).long().unsqueeze(1)
        rewards_tensor = np_to_device(rewards)
        states_tensor = np_to_device(states).view((-1, 1, 4))
        state_values = model(states_tensor, model.initial_hidden(1))[0].squeeze(1)
        state_action_values = state_values.gather(1, actions_tensor)
        # Bootstrap each step's next-state value from the following timestep of the
        # same episode; the final step has no successor and stays zero.
        next_state_values = torch.zeros(len(states), device=device)
        next_state_values[:-1] = state_values[1:].max(1)[0].detach()
        expected_state_action_values = (next_state_values * GAMMA) + rewards_tensor
        errors = torch.abs(state_action_values - expected_state_action_values.unsqueeze(1))
        batch_errors.append(errors.flatten())
    batch_errors = torch.cat(batch_errors)
    # Huber loss on the TD errors, accumulated over the whole batch
    loss = torch.mean(torch.where(batch_errors < 1, 0.5 * batch_errors ** 2, batch_errors - 0.5))
    print(loss)
    trace(loss.cpu().item())
    loss.backward()
    # for param in model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    opt.step()
def optimize_model():
    if len(memory) < WARMUP:
        return
    if len(memory) == WARMUP:
        print("Memory warmed up")
    for i in range(5):
        episodes = memory.sample(BATCH_SIZE)
        opt.zero_grad()
        batch_errors = []
        for states, actions, rewards, next_states, dones in episodes:
            actions_tensor = torch.tensor(actions, device=device).long().unsqueeze(1)
            rewards_tensor = np_to_device(rewards)
            # Convert the stacked NHWC image observations to NCHW for the network
            states_tensor = torch.from_numpy(
                np.transpose(states, axes=[0, 3, 1, 2])).to(device).float()
            state_values = model(states_tensor, model.initial_hidden(1))[0].squeeze(1)
            state_action_values = state_values.gather(1, actions_tensor)
            # Bootstrap next-state values from the following timestep of the same
            # episode; the final step has no successor and stays zero.
            next_state_values = torch.zeros(len(states), device=device)
            next_state_values[:-1] = state_values[1:].max(1)[0].detach()
            expected_state_action_values = (next_state_values * GAMMA) + rewards_tensor
            errors = torch.abs(state_action_values -
                               expected_state_action_values.unsqueeze(1))
            batch_errors.append(errors.flatten())
        batch_errors = torch.cat(batch_errors)
        # Huber loss on the TD errors, accumulated over the whole batch
        loss = torch.mean(
            torch.where(batch_errors < 1, 0.5 * batch_errors**2, batch_errors - 0.5))
        print(loss)
        trace(loss.cpu().item())
        loss.backward()
        # for param in model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        opt.step()
def optimize_model(self, batch_size, gamma, get_tensor_from_obs):
    for i in range(1):
        episodes = self.memory.sample(batch_size)
        self.optimizer.zero_grad()
        batch_errors = []
        for states, actions, rewards, next_states, dones in episodes:
            actions_tensor = torch.tensor(
                actions, device=self.device).long().unsqueeze(1)
            rewards_tensor = self.np_to_device(rewards)
            states_tensor = get_tensor_from_obs(states)
            state_values = self.model(
                states_tensor, self.model.initial_hidden(1))[0].squeeze(1)
            state_action_values = state_values.gather(1, actions_tensor)
            # Bootstrap the targets from the target network rather than the online
            # network; the final step has no successor and stays zero.
            target_next_state_values = torch.zeros(len(states), device=self.device)
            target_state_values = self.target_model(
                states_tensor, self.model.initial_hidden(1))[0].squeeze(1)
            target_next_state_values[:-1] = target_state_values[1:].max(1)[0].detach()
            expected_state_action_values = (target_next_state_values * gamma) + rewards_tensor
            errors = torch.abs(state_action_values -
                               expected_state_action_values.unsqueeze(1))
            batch_errors.append(errors.flatten())
        batch_errors = torch.cat(batch_errors)
        # Huber loss on the TD errors, accumulated over the whole batch
        loss = torch.mean(
            torch.where(batch_errors < 1, 0.5 * batch_errors**2, batch_errors - 0.5))
        print(loss)
        trace(loss.cpu().item())
        loss.backward()
        # for param in model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
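# The snippet above reads Q-targets from self.target_model but does not show how that
# network is kept in sync with self.model. Below is a minimal sketch of the usual DQN
# periodic hard update, written as an extra method on the same agent class. The method
# name sync_target_model and the TARGET_UPDATE interval are assumptions, not part of
# the original code.
def sync_target_model(self):
    # Hard update: copy the online network's weights into the target network.
    self.target_model.load_state_dict(self.model.state_dict())

# Hypothetical call site, e.g. once every TARGET_UPDATE optimization steps:
# if train_step % TARGET_UPDATE == 0:
#     agent.sync_target_model()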
def train_step(self, formatted_input, formatted_output, rewards=None, batch_size=1):
    self.optimizer.zero_grad()
    formatted_input = [
        self.torch.from_numpy(x).float() for x in formatted_input
    ]
    formatted_output = self.torch.from_numpy(formatted_output).float()
    network_output = self.actor_model.forward(*formatted_input)
    loss = self.loss_function(network_output, formatted_output)
    loss.backward()
    # for i in range(9):
    #     trace(self.loss_function(network_output[:, i], formatted_output[:, i]).item(), key=i)
    trace(loss.item(), key='loss')
    self.optimizer.step()
def episode(self):
    self.simulation.random_state()
    reward = torch.zeros((self.simulation.o.shape[0], steps), device=device)
    for i in range(steps):
        self.simulation.step(delta_time)
        reward[:, i] = self.simulation.error().neg() + rotation_eps
    trace((reward > 0).float().sum(1).mean(0).item(),
          reset_on_parent_change=False, key='frames done')
    # Propagate reward backwards through time so each step also reflects what follows
    reward[:, steps - 1] = self.andt(reward[:, steps - 1])
    for i in reversed(range(steps - 1)):
        reward[:, i] = self.andt(reward[:, i], reward[:, i + 1])
    loss = reward.sum(1).mean(0).neg()
    # if average_reward.item() > self.max_reward:
    #     self.max_reward = average_reward.item()
    #     torch.save(self.policy.state_dict(), f'{model_name}_{round(self.max_reward, 1)}.mdl')
    #     torch.save(self.optimizer.state_dict(), f'{model_name}_{round(self.max_reward, 1)}.state')
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    trace(loss.item(), reset_on_parent_change=False, key='loss')
    trace((reward < 0).sum(1).float().mean(0).item(),
          reset_on_parent_change=False, key='frame weight')
def main():
    # Trace some dummy data
    for i in range(120 * 60):
        # Demo: simple and common use case
        trace(30 * math.sin(i / 30))

        # Demo: vector display and multiple displays sharing one view_box
        t = i / 100
        t += math.sin(t)
        trace([math.cos(t) * 2, math.cos(20 * t)], view_box='Wave-like things')
        trace([math.cos(t) + 5, math.sin(t)], view_box='Wave-like things')

        # Custom display
        trace(almost_fizzbuzz(i), custom_display=StringCounter)

        time.sleep(1 / 60.)  # Simulate an external main loop
def get_output_vector(self, game_tick_packet):
    s = EasyGameState(game_tick_packet, self.team, self.index)
    speed = mag(s.car_vel)
    turn_rate = game_tick_packet.gamecars[self.index].AngularVelocity.Z  # rad/s
    turn_radius = speed / max(turn_rate, 0.001)
    if self.start_time is None:
        self.start_time = s.time
    time_elapsed = s.time - self.start_time
    desired_speed = 10 + time_elapsed * 100
    too_slow = desired_speed > speed
    should_boost = desired_speed > 1000 and too_slow
    pedal = too_slow
    if desired_speed < 500:
        pedal *= 0.5
    trace(speed)
    # trace(turn_rate)
    trace(turn_radius)
    # trace(desired_speed)
    trace(turn_radius - estimate_turn_radius(speed))
    output_vector = [
        pedal,         # fThrottle
        1,             # fSteer
        0,             # fPitch
        0,             # fYaw
        0,             # fRoll
        0,             # bJump
        should_boost,  # bBoost
        0,             # bHandbrake
    ]
    if not controller.hat_toggle_west:
        if self.measurements:
            print('TADA:')
            print(repr(self.measurements))
        output_vector = (
            round(controller.fThrottle),
            round(controller.fSteer),
            round(controller.fPitch),
            round(controller.fYaw),
            round(controller.fRoll),
            round(controller.bJump),
            round(controller.bBoost),
            round(controller.bHandbrake),
        )
        self.start_time = None
        self.measurements = []
    else:
        # self.start_time = s.time
        self.measurements.append((desired_speed, turn_radius))
    return sanitize_output_vector(output_vector)
def episode(self):
    self.simulation.random_state()
    reward = torch.zeros((self.simulation.o.shape[0], ), device=device)
    framesDone = torch.zeros((self.simulation.o.shape[0], ), device=device)
    for i in range(steps):
        self.simulation.step(delta_time)
        diff = rotation_eps - self.simulation.error()
        # Penalize error beyond the tolerance; once the batch mostly reaches the
        # target, clamp the per-step penalty to keep it bounded.
        reward += diff.clamp(max=0, min=-rotation_eps / 2 if self.reachesEnd else None)
        finished = (diff > 0).float()
        # framesDone counts consecutive finished frames ending at the last step
        framesDone += 1
        framesDone *= finished
    trace(((steps - framesDone) * delta_time * 120).mean(0).item(),
          reset_on_parent_change=False, key='game frames to destination')
    failed = (framesDone == 0).float().mean(0).item()
    self.reachesEnd = failed < 0.2
    trace(failed, reset_on_parent_change=False, key='amount failed')
    loss = reward.mean(0).neg()
    self.optimizer.zero_grad()
    loss.backward()  # spits out error
    self.optimizer.step()
    trace(loss.item(), reset_on_parent_change=False, key='loss')
def main():
    import math
    import time
    from quicktracer import trace

    # Demo: trace some dummy data
    for i in range(40 * 60):
        # Simple and common use case
        trace(30 * math.sin(i / 30), view_box="view1")
        trace(30 * math.cos(i / 30), view_box="view1")

        # Vectors are supported
        t = i / 100
        t += math.sin(t)
        trace([math.cos(t), math.cos(30 * t)])

        # Custom display
        trace(almost_fizzbuzz(i), custom_display=StringCounter)

        time.sleep(1 / 60.)  # Simulate an external main loop
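# The demos above call an almost_fizzbuzz() helper that is not shown. A hypothetical
# stand-in is sketched below purely so the custom-display example has categorical
# string values to count; the real helper (and the StringCounter display class) are
# not part of these snippets.
def almost_fizzbuzz(i):
    # Return a small set of repeating labels, fizzbuzz-style.
    if i % 15 == 0:
        return 'fizzbuzz'
    if i % 3 == 0:
        return 'fizz'
    if i % 5 == 0:
        return 'buzz'
    return 'number'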
def select_action(state, hidden):
    global steps_done
    sample = random.random()
    # Exponentially decaying epsilon for epsilon-greedy exploration
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    trace(eps_threshold)
    trace(steps_done)
    steps_done += 1
    out, hidden = agent.model(get_tensor_from_obs(state), hidden)
    if sample > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest Q-value in the network output
            action = out.max(2)[1].item()
            trace(action)
    else:
        action = random.getrandbits(2)
    return action, hidden
def select_action(state, hidden):
    global steps_done
    sample = random.random()
    # Exponentially decaying epsilon for epsilon-greedy exploration
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    trace(eps_threshold)
    trace(steps_done)
    steps_done += 1
    out, hidden = model(np_to_device(state).view((1, 1, -1)), hidden)
    if sample > eps_threshold:
        with torch.no_grad():
            # Greedy action: pick the index of the largest expected reward
            print(out)
            action = out.max(2)[1].item()
            trace(action)
    else:
        action = random.getrandbits(1)
    return action, hidden
episode_rewards = []
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = get_obs(env.reset())
    step = 0
    episode_reward = 0
    hidden = model.initial_hidden(1)
    while True:
        # env.render()
        action, hidden = select_action(state, hidden)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -1
        used_action = action
        trace(used_action)
        next_state = get_obs(next_state)
        trace(reward)
        memory.add(state, action, reward, done)
        state = next_state
        episode_reward += reward
        step += 1
        if done:
            episode_rewards.append(episode_reward)
            print(
                f"Ep: {len(episode_rewards): 3d}, \tstep: {step: 4d}, \treward: {int(episode_reward): 2d}, \taverage_reward: {np.mean(episode_rewards[-20:]):.2f}"
            )
def _trace_logs(self, logs):
    for metric in self.params['metrics']:
        if metric in logs:
            trace(float(logs[metric]), key=metric)
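# The method above reads Keras callback attributes (self.params['metrics'] and the
# per-epoch logs dict), so it presumably lives on a keras.callbacks.Callback subclass.
# A minimal sketch of that wiring is below; the class name QuicktracerCallback, the
# tf.keras import, and the choice of on_epoch_end are assumptions, not part of the
# original code.
from tensorflow import keras
from quicktracer import trace

class QuicktracerCallback(keras.callbacks.Callback):
    def _trace_logs(self, logs):
        # Same body as the snippet above: forward each reported metric to quicktracer.
        for metric in self.params['metrics']:
            if metric in logs:
                trace(float(logs[metric]), key=metric)

    def on_epoch_end(self, epoch, logs=None):
        self._trace_logs(logs or {})

# Hypothetical usage: model.fit(x, y, callbacks=[QuicktracerCallback()])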
# Initialize the environment and state
state = env.reset()
step = 0
hidden = model.initial_hidden(1)
while True:
    # env.render()
    # Select and perform an action
    action, hidden = select_action(state, hidden)
    next_state, reward, done, _ = env.step(action)
    # reward = 0 if done else reward
    # Store the transition in memory
    memory.add(state, action, reward, done)
    # Move to the next state
    state = next_state
    if done:
        print(step)
        episode_duration = step
        trace(episode_duration)
        break
    step += 1
    # Perform one step of the optimization
    optimize_model()

# Update the target network, copying all weights and biases in DQN
# if i_episode % TARGET_UPDATE == 0:
#     target_net.load_state_dict(policy_net.state_dict())
state = get_obs(env.reset())
step = 0
while True:
    env.render()
    # Select and perform an action
    action = select_action(state)
    next_state, reward, done, _ = env.step(action)
    next_state = get_obs(next_state)
    # reward = 0 if done else reward
    # Store the transition in memory
    memory.add(state, action, reward, next_state, done)
    # Move to the next state
    state = next_state
    # Perform one step of the optimization
    optimize_model()
    if done:
        print(step)
        trace(step)
        break
    step += 1

# Update the target network, copying all weights and biases in DQN
# if i_episode % TARGET_UPDATE == 0:
#     target_net.load_state_dict(policy_net.state_dict())