def load_models(path, training=False, device="cpu"):
    net = dqn_model.DQN(18, 360)
    if os.path.exists(path):
        print("FILE FOUND. LOADING NETWORK WEIGHTS FROM FILE...")
        net.load_state_dict(torch.load(path, map_location=torch.device("cpu")))
    else:
        print("FILE NOT FOUND. INITIALIZING NETWORK WITH RANDOM WEIGHTS...")
    if training:
        # In training mode, also return a freshly initialized target network
        return net.to(device), dqn_model.DQN(18, 360).to(device)
    return net.to(device)
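## A minimal usage sketch (not part of the original source): in training mode
## the function returns both the online net and a randomly initialized target
## net, so the caller typically syncs the target right away. "weights.pt" is a
## hypothetical path used only for illustration.
net, tgt_net = load_models("weights.pt", training=True, device="cpu")
tgt_net.load_state_dict(net.state_dict())  # sync the target to the online net

inference_net = load_models("weights.pt")  # inference: a single CPU net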
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--cuda', default=False, action='store_true', help='Enable CUDA') parser.add_argument('--env', default=DEFAULT_ENV_NAME) parser.add_argument('--reward', type=float, default=MEAN_REWARD_BOUND, \ help='Mean reward bound for stop of training') args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = wrappers.make_env(args.env) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) target_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) writer = SummaryWriter(comment='-' + args.env) print(net) buffer = ReplayBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] frame_idx = 0 ts_frame = 0 ts = time.time() best_mean_reward = None
def main():
    # Initialize the ROS node
    rospy.init_node('Learning_Node', anonymous=True)

    device = torch.device("cpu")  # Alternatively, "cuda" can be used
    env = gym.make(ENV_NAME)

    # We use two networks, synchronised periodically, so that updating the
    # value of one state does not influence the values of neighbouring states
    # too strongly, which would make learning unstable
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    net = net.float()
    tgt_net = tgt_net.float()

    # TensorBoard writer
    writer = SummaryWriter(comment="__" + ENV_NAME + "__")

    # Agent initialization
    replay_buffer = Exp_Replay_Buffer(EXP_REPLAY_BUFFER_SIZE)
    agent = DQN_Agent(env, replay_buffer)

    # Other initialization
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    epsilon = EPSILON_START
    total_rewards = []
    iterations = 0
    best_mean_reward = None

    # ROS main loop
    while not rospy.is_shutdown():
        # Linear epsilon decay, clamped at EPSILON_FINAL
        iterations += 1
        epsilon = max(EPSILON_FINAL,
                      EPSILON_START - iterations / EPSILON_DECAY_STEPS)

        # Step
        reward = agent.play_step(net, epsilon, device=device)

        # Record statistics
        if reward is not None:
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])
            print("[%d] - done %d games, reward_mean %.3f, eps %.2f"
                  % (iterations, len(total_rewards), mean_reward, epsilon))
            writer.add_scalar("epsilon", epsilon, iterations)
            writer.add_scalar("reward_100", mean_reward, iterations)
            writer.add_scalar("reward", reward, iterations)
            if best_mean_reward is None or best_mean_reward < mean_reward:
                # Uncomment to save the net weights to PATH
                # torch.save(net.state_dict(), PATH)
                if best_mean_reward is not None:
                    print("Best reward updated %.3f -> %.3f"
                          % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d iterations!" % iterations)
                break

        # Fill the buffer a little before starting optimization
        if len(replay_buffer) < SAMPLE_START_STEPS:
            continue

        # Periodic synchronization of the nets
        if iterations % SYNC_NETS_STEPS == 0:
            tgt_net.load_state_dict(net.state_dict())

        # Optimization
        optimizer.zero_grad()
        batch = replay_buffer.sample(BATCH_SIZE)
        error = calc_loss(batch, net, tgt_net, device=device)
        writer.add_scalar("loss", error, iterations)
        error.backward()
        optimizer.step()

    writer.close()
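## calc_loss is called above but not defined in this snippet. Below is a
## minimal sketch of the standard one-step DQN loss, with the Bellman target
## computed by the target network. It assumes the
## (states, actions, rewards, dones, next_states) batch layout from the replay
## buffer sketch above, a discount constant GAMMA, and `import torch.nn as nn`.
def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states)).to(device)
    next_states_v = torch.tensor(np.array(next_states)).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the actions that were actually taken
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    # max_a' Q_target(s', a'), with terminal states masked to zero
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + GAMMA * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)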
from collections import Counter
import time

import torch

import wrapper
import dqn_model

### Play the Pong game with a trained DQN agent

LOAD_PATH = './models/pong/400_pong_policy_net.pt'
RENDER = True
FPS = 25

## To play, first initialize the env
env = wrapper.make_env("PongNoFrameskip-v4")

## Initialize a model
policy_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).eval()

## Load the trained model
#print(torch.load(LOAD_PATH))
policy_net.load_state_dict(torch.load(LOAD_PATH))

## Get the initial state
state = env.reset()
state = torch.FloatTensor(state).unsqueeze(0)

total_reward = 0.0
action_count = Counter()

## Play the game
#for i in range(10):
while True:
    ## Get the start time
    start_ts = time.time()
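    ## The original snippet ends here, inside the loop. The remainder below is
    ## a minimal sketch of the usual play loop body (greedy action from the
    ## net, env step, FPS throttling), an assumption rather than the original
    ## code.
    if RENDER:
        env.render()
    with torch.no_grad():
        q_values = policy_net(state)
    action = int(q_values.argmax(dim=1).item())
    action_count[action] += 1

    state, reward, done, _ = env.step(action)
    state = torch.FloatTensor(state).unsqueeze(0)
    total_reward += reward
    if done:
        break

    # Throttle rendering to roughly FPS frames per second
    delta = 1 / FPS - (time.time() - start_ts)
    if delta > 0:
        time.sleep(delta)

print("Total reward: %.2f" % total_reward)
print("Action counts:", action_count)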
if __name__ == "__main__": # set the device -> GPU or CPU device = "cuda" # create the wrapped environment env = SuperTuxKart(spf=0.000887, framebuffer_size=FRAME_BUFFER_SIZE, width=SCREEN_WIDTH, height=SCREEN_HEIGHT, level=GAME_LEVEL, mode=GAME_MODE, speedup=GAME_SPEED, observe_performance=OBS_PERFORMANCE, performance_window_size=WINDOW_ACTIONS, performance_window_overlap=OVERLAP_ACTIONS, laps=LAPS) # ********************************************************************************************************************** # * DEFINE THRESHOLD * # ********************************************************************************************************************** if NUM_PREGAMES > 0: print('******************************* STARTING PRE GAMES TO SET THE THRESHOLD *******************************') # create the net and the target net net = dqn_model.DQN((FRAME_BUFFER_SIZE, SCREEN_WIDTH, SCREEN_HEIGHT), len(POSSIBLE_ACTIONS)).to(device) tgt_net = dqn_model.DQN((FRAME_BUFFER_SIZE, SCREEN_WIDTH, SCREEN_HEIGHT), len(POSSIBLE_ACTIONS)).to(device) net.load_state_dict(torch.load(DEFAULT_ENV_NAME + "-best_RL-baseline.dat")) tgt_net.load_state_dict(torch.load(DEFAULT_ENV_NAME + "-best_RL-baseline.dat")) print(net) buffer = ExperienceBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START threshold = THRESHOLD optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] time_consumption_games = [] saved_game_actions = []
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', required=True,
                        help='Model file to load')
    parser.add_argument('-e', '--env', default=DEFAULT_ENV_NAME)
    parser.add_argument('-r', '--record', help='Dir to store video recording')
    parser.add_argument('-nv', '--no-visualize', default=True,
                        action='store_false', dest='visualize',
                        help='Disable visualization of game play')
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))

    s = env.reset()
    total_rewards = 0.0
    cnt = collections.Counter()
    while True:
        tic = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([s], copy=False))
        Qs = net(state_v).data.numpy()[0]
        a = np.argmax(Qs)
        cnt[a] += 1
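        ## The snippet stops here. A minimal sketch of the usual remainder of
        ## this loop (step the env, accumulate reward, throttle to a frame
        ## rate) follows; FPS is a hypothetical constant not defined in this
        ## snippet.
        s, r, done, _ = env.step(a)
        total_rewards += r
        if done:
            break
        delta = 1 / FPS - (time.time() - tic)
        if delta > 0:
            time.sleep(delta)

    print("Total reward: %.2f" % total_rewards)
    print("Action counts:", cnt)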
def convert_to_gray(color_state):
    # Convert a color frame to grayscale and scale pixel values to [0, 1]
    return cv2.cvtColor(color_state, cv2.COLOR_BGR2GRAY) / 255.0


## Main train loop
#env = gym.make('Pong-v0')
env = wrapper.make_env("PongNoFrameskip-v4")
#print(env.observation_space.shape)

## Init a replay buffer
memory_buffer = ReplayBuffer(REPLAY_SIZE)

## Init the DQN networks (both the policy and the target net)
input_shape = [STACK_SIZE, 48, 48]
#print(input_shape)
policy_net = dqn_model.DQN(input_shape, env.action_space.n).to(device)
target_net = dqn_model.DQN(input_shape, env.action_space.n).to(device)

## Copy the policy net weights to the target net
target_net.load_state_dict(policy_net.state_dict())

## Define the optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

## Init an agent
agent_dqn = Agent(policy_net, env)

## Reward vars
ten_rewards = 0.0  # reward accumulated over 10 episodes
total_rewards = list()
best_mean_reward = None
update_counter = 0
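## The training loop itself is not shown in this snippet. A minimal sketch of
## the usual structure (play steps, periodic optimization, target-net syncs)
## follows; EPSILON, BATCH_SIZE, and SYNC_TARGET_STEPS are hypothetical
## constants, calc_loss is the standard DQN loss, and play_step is assumed to
## store transitions in memory_buffer and return the episode reward when an
## episode finishes (its real signature may differ).
while True:
    update_counter += 1
    reward = agent_dqn.play_step(EPSILON, device=device)
    if reward is not None:
        total_rewards.append(reward)
        ten_rewards += reward
    # Wait until the buffer holds at least one batch before optimizing
    if len(memory_buffer) < BATCH_SIZE:
        continue
    optimizer.zero_grad()
    batch = memory_buffer.sample(BATCH_SIZE)
    loss = calc_loss(batch, policy_net, target_net, device=device)
    loss.backward()
    optimizer.step()
    # Periodically copy the policy weights into the target net
    if update_counter % SYNC_TARGET_STEPS == 0:
        target_net.load_state_dict(policy_net.state_dict())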
def __init__(self, args: argparse.Namespace, cuda=use_cuda, action_repeat: int = 4):
    print("add is", GLOBALACTIVATEADDITIONAL)

    # Init
    # TODO: move these to the argument list
    self.folderName = GLOBALFOLDERNAME  # e.g. "dqn_checkpoints6_normal"
    print("Starting with experiment", self.folderName)
    self.activateAdditional = GLOBALACTIVATEADDITIONAL
    self.clip: bool = args.clip
    self.seed: int = args.seed
    self.action_repeat: int = action_repeat
    self.frame_skipping: int = args.skip_action
    self._state_buffer = deque(maxlen=self.action_repeat)
    self._additional_state_buffer = deque(maxlen=self.action_repeat)
    self.step = 0
    self.best_score = args.best or -10000
    self.best_count = 0
    self.starttime = time.time()
    self.lasttime = self.starttime
    self.maxIter = args.maxIter
    self._play_steps = deque(maxlen=5)

    # Environment
    self.env = dqnenv.Environment(args.game, record=args.record, seed=self.seed,
                                  activateAdditional=self.activateAdditional,
                                  videoFolder=self.folderName)
    self.env.n_action = 4  # self.env.action_space.n_

    # DQN Model
    self.dqn_hidden_state = self.dqn_cell_state = None
    self.target_hidden_state = self.target_cell_state = None

    # Map the model name from the command line to the matching DQN variant
    self.mode: str = args.model.lower()
    model_classes = {
        'dqn': dqnmodel.DQN,
        'smaller': dqnmodel.DQN_smaller,
        'verysmall': dqnmodel.DQN_verysmall,
        'tiny': dqnmodel.DQN_tiny,
        'other': dqnmodel.DQN_other,
        'big': dqnmodel.DQN_big,
        'big2': dqnmodel.DQN_big2,
        'connected': dqnmodel.DQN_connected,
        'test1': dqnmodel.DQN_test1,
        'test2': dqnmodel.DQN_test2,
        'insp': dqnmodel.DQN_insp,
        'fully': dqnmodel.DQN_fully,
        'frameskip': dqnmodel.DQN_frameskip,
    }
    if self.mode in model_classes:
        self.dqn: DQN = model_classes[self.mode](self.env.action_space,
                                                 self.activateAdditional)
    else:
        print("Error: did not recognize model name", self.mode)
    if cuda:
        self.dqn.cuda()

    # DQN Target Model
    self.target: DQN = copy.deepcopy(self.dqn)

    # Optimizer
    self.optimizer = optim.Adam(self.dqn.parameters(), lr=LEARNING_RATE)

    # Replay Memory
    self.replay = rpl.ReplayMemory()

    # Epsilon
    self.epsilon = EPSILON_START