def __init__(self, env, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=25,
             eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.state_len = env.width * env.height
    self.nn = Model(in_features=2,
                    hidden=[self.state_len, self.state_len],
                    out_features=len(Agent.actions))
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
def __init__(self, env, model, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.model = model
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
def __init__(self, inputs, n_actions):
    self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
    # Target network starts as an exact copy of the online network and is
    # kept in eval mode; it only changes on explicit hard updates.
    self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
    self.target_brain.load_state_dict(self.brain.state_dict())
    self.target_brain.eval()
    self.set_params()
    self.optimizer = torch.optim.Adam(self.brain.parameters())
    self.memory = ReplayMemory(50000)
    self.action_space = [0, 1]
def __init__(self, env, input_size, output_size, hidden_size, mix_hidden=32,
             batch_size=128, lr=0.001, gamma=.999, eps_start=0.9, eps_end=0.05,
             eps_decay=750, replay_capacity=10000, num_save=200,
             num_episodes=10000, mode="random", training=False, load_file=None):
    self.env = env
    self.orig_env = copy.deepcopy(env)
    self.grid_map = env.grid_map
    self.cars = env.grid_map.cars
    self.num_cars = len(self.cars)
    self.passengers = env.grid_map.passengers
    self.num_passengers = len(self.passengers)
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.eps_start = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay
    self.replay_capacity = replay_capacity
    self.num_episodes = num_episodes
    self.steps_done = 0
    self.lr = lr
    self.mode = mode
    self.num_save = num_save
    self.training = training
    self.algorithm = PairAlgorithm()
    self.episode_durations = []
    self.loss_history = []
    self.memory = ReplayMemory(self.replay_capacity)

    self.device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Device being used:", self.device)

    self.policy_net = DQN(self.input_size, self.output_size, self.hidden_size).to(self.device)
    self.params = list(self.policy_net.parameters())
    if self.mode == "qmix":
        # The QMIX mixing network combines per-agent Q-values into a joint value.
        self.mixer = QMixer(self.input_size, self.num_passengers, mix_hidden).to(self.device)
        self.params += list(self.mixer.parameters())

    if load_file:
        self.policy_net.load_state_dict(torch.load(load_file))
        self.policy_net.eval()
        if self.mode == "qmix":
            self.mixer.load_state_dict(torch.load("mixer_" + load_file))
            self.mixer.eval()
        self.load_file = "Trained_" + load_file
        print("Checkpoint loaded")
    else:
        self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + \
            "_num_passengers_" + str(self.num_passengers) + \
            "_num_episodes_" + str(self.num_episodes) + \
            "_hidden_size_" + str(self.hidden_size) + ".pth"

    self.optimizer = optim.RMSprop(self.params, lr=self.lr)
def __init__(self):
    # self.config = config
    self.gamma = 0.4
    # self.logger = logging.getLogger("DQNAgent")
    self.screen_width = 600

    # define models (policy and target)
    self.policy_model = DQN()
    self.target_model = DQN()

    # define memory
    self.memory = ReplayMemory()

    # define loss
    self.loss = HuberLoss()

    # define optimizer
    self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

    # define environment
    self.env = PyCar()  # TODO
    # self.cartpole = PyCar(self.screen_width)

    # initialize counter
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []
    self.batch_size = 1700

    # set cuda flag
    self.is_cuda = torch.cuda.is_available()
    self.cuda = self.is_cuda
    if self.cuda:
        # print_cuda_statistics()
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # Initialize Target model with policy model state dict
    self.target_model.load_state_dict(self.policy_model.state_dict())
    self.target_model.eval()
    self.savepath = "/home/sk002/Desktop/model/"
def testReplayMemory(self):
    od = [84, 84, 4]   # observation dimensions
    ad = [8, 10]       # action dimensions
    rd = [5]           # reward dimensions
    s = int(10000)     # buffer capacity
    b = 32             # sample batch size
    rm = ReplayMemory(obs_dim=od, act_dim=ad, r_dim=rd, size=s)
    o = self.get_rand(od)
    a = self.get_rand(ad)
    r = self.get_rand(rd)
    d = 0
    for _ in range(1000):
        rm.store(o, a, r, o, d)
    o_s, a_s, r_s, on_s, d_s = rm.sample(b)
    self.assertEqual(o_s.shape, combined_shape(b, od))
    self.assertEqual(a_s.shape, combined_shape(b, ad))
    self.assertEqual(r_s.shape, combined_shape(b, rd))
    self.assertEqual(on_s.shape, combined_shape(b, od))
    self.assertEqual(d_s.shape, combined_shape(b))
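# --- Hedged sketch, not the project's actual ReplayMemory: a minimal NumPy
# ring buffer that satisfies the interface the test above exercises, i.e.
# ReplayMemory(obs_dim, act_dim, r_dim, size), store(o, a, r, o2, d), and
# sample(batch) returning five arrays whose shapes match combined_shape.
import numpy as np

def combined_shape(length, shape=None):
    # (length,) for scalar entries, (length, *shape) for multi-dimensional ones.
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

class ReplayMemory:
    def __init__(self, obs_dim, act_dim, r_dim, size):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.next_obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(combined_shape(size, r_dim), dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest transition once capacity is reached.
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.next_obs_buf[self.ptr] = next_obs
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return (self.obs_buf[idxs], self.act_buf[idxs], self.rew_buf[idxs],
                self.next_obs_buf[idxs], self.done_buf[idxs])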
def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             target_update=10, eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    # Slippery transitions: the intended action executes with probability p,
    # and the remaining mass is split evenly over the two perpendicular moves.
    q = (1.0 - p) / 2
    self.stochastic_actions = {
        '←': [[0, 2, 3], [p, q, q]],
        '→': [[1, 2, 3], [p, q, q]],
        '↑': [[2, 0, 1], [p, q, q]],
        '↓': [[3, 0, 1], [p, q, q]]
    }
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.state_len = env.width * env.height
    self.nn = Model(in_features=self.state_len, hidden=[],
                    out_features=len(Agent.actions))
    # Target network: a frozen copy of the online network, hard-synced every
    # target_update steps.
    self.target_nn = Model(in_features=self.state_len, hidden=[],
                           out_features=len(Agent.actions))
    self.target_nn.load_state_dict(self.nn.state_dict())
    self.target_nn.eval()
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
    self.target_update = target_update
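# --- Hedged sketch of how the stochastic_actions table above could be
# consumed; sample_action is a hypothetical method name, not the project's
# confirmed API. Each entry maps an intended move to candidate action indices
# and their probabilities, so one np.random.choice draw suffices.
import numpy as np

def sample_action(self, intended):
    """Draw the executed action index for an intended move, e.g. '←'."""
    indices, probs = self.stochastic_actions[intended]
    return int(np.random.choice(indices, p=probs))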
def __init__(self, name, others=None, last_n=10, load_path=None,
             checkpoint=5000, fixed_strategy=False, eps_decay=0.00005):
    if others is None:
        others = [1, 2]
    self.others = others
    self.last_n = last_n
    self.prev_points = 0
    self.batch_size = 32
    self.gamma = 0.9
    self.eps_start = 1
    self.eps_end = 0.01
    self.eps_decay = eps_decay
    self.target_update = 100
    self.plot_at = 1000
    self.q_max = []
    self.q_list = []
    self.checkpoint = checkpoint
    self.memory_size = 1000
    self.lr = 0.00001
    self.train = True
    self.input_dim = len(others) * 6
    self.output_dim = 3
    self.current_step = 1
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.memory = ReplayMemory(self.memory_size)

    # Initialize the policy and target networks
    self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
    self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    if load_path is not None:
        # Resume from a saved checkpoint and switch to pure exploitation.
        saved = torch.load(load_path)
        self.policy_net.load_state_dict(saved['model_state_dict'])
        self.policy_net.eval()
        self.eps_start = 0
        self.eps_end = 0
        self.train = False

    if fixed_strategy:
        self.strategy = FixedStrategy()
    else:
        self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end, self.eps_decay)

    # Set the optimizer
    self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=self.lr)
    self.loss = None

    # Transition fields pushed to replay memory
    self.prev_state = None
    self.action = None
    self.reward = None
    self.current_state = None
    super().__init__(name)
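# --- Hedged sketch of the EpsilonGreedyStrategy constructed above: epsilon
# decays exponentially from eps_start toward eps_end as current_step grows.
# The method name get_exploration_rate is an assumption, not confirmed API.
import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self, current_step):
        # With eps_start=0 and eps_end=0 (the checkpoint-loading branch above)
        # this is always 0, so the loaded policy acts greedily.
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)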
# Initialize environment and config.
env = gym.make(args.env)
env_config = ENV_CONFIGS[args.env]
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=True,
                                      frame_skip=1, noop_max=30, scale_obs=True)

# Initialize deep Q-networks.
dqn = DQN(env_config=env_config).to(device)
# Create the target Q-network and initialize it as a copy of the online network.
target_dqn = DQN(env_config=env_config).to(device)
target_dqn.load_state_dict(dqn.state_dict())

# Create replay memory.
memory = ReplayMemory(env_config['memory_size'])

# Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

# Keep track of best evaluation mean return achieved so far.
best_mean_return = -float("Inf")

for episode in range(env_config['n_episodes']):
    done = False
    obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
    obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)
    count = 0
    while not done:
        # Get action from DQN.
        action = dqn.act(obs_stack)
def __init__(self):
    # self.config = config
    self.gamma = 0.75
    # self.logger = logging.getLogger("DQNAgent")
    self.screen_width = 600

    # define models (policy and target)
    self.policy_model = DQN()
    self.target_model = DQN()

    # define memory
    self.memory = ReplayMemory()

    # define loss
    self.loss = HuberLoss()

    # define optimizer
    self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.0001)

    # define environment
    self.env = PyCar()  # TODO
    # self.cartpole = PyCar(self.screen_width)

    # initialize counter
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []
    self.batch_size = 250

    # set cuda flag
    self.is_cuda = torch.cuda.is_available()
    self.cuda = self.is_cuda
    if self.cuda:
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # Initialize Target model with policy model state dict
    self.target_model.load_state_dict(self.policy_model.state_dict())
    self.target_model.eval()

    self.savepath = os.path.join(os.getcwd(), "model") + "/"
    if not os.path.isdir(self.savepath):
        os.makedirs(self.savepath)

    t = time.localtime()
    self.save_tensorboard_path = os.path.join(
        os.getcwd(), "tensorboard_record") + "/run_" + time.strftime("%d_%m_%Y_%H_%M", t) + "/"
    if not os.path.isdir(self.save_tensorboard_path):
        os.makedirs(self.save_tensorboard_path)
    self.writer = SummaryWriter(self.save_tensorboard_path)
                        dtype=torch.long)

if __name__ == "__main__":
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10
    MAX_T = 9999
    steps_done = 0
    timer = Timer()
    rect = util.get_screen_rect()
    region = (rect[0], rect[1], rect[2] - rect[0], rect[3] - rect[1])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    monitor = Monitor(device, region)
    env = gym.make("Game-v0")
    init_screen = monitor.get_screen(pytorch=True)
    _, _, height, width = init_screen.shape
    n_actions = env.action_space.n
    policy_net = DQN(width, height, n_actions).to(device)
    target_net = DQN(width, height, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(3000)
    simulate()
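# --- Hedged sketch of the select_action helper implied by the EPS_* constants,
# steps_done, and the dangling "dtype=torch.long)" fragment above; it follows
# the common PyTorch DQN tutorial pattern and is an assumption about the
# original code, not a copy of it.
import math
import random

def select_action(state, n_actions, device):
    global steps_done
    # Epsilon decays exponentially from EPS_START toward EPS_END.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest Q-value for this state.
            return policy_net(state).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(n_actions)]], device=device,
                        dtype=torch.long)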
    ACTION_BUILD_BARRACKS,
    ACTION_ATTACK,
    ACTION_SELECT_BARRACKS,
    ACTION_BUILD_MARINE,
]

KILL_UNIT_REWARD = 0.2
KILL_BUILDING_REWARD = 0.5

reward_check = []

model = DQN(6, 8)
optimizer = optim.RMSprop(model.parameters(), 1e-3)
memory = ReplayMemory(10000)

class DQNAgent(base_agent.BaseAgent):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.previous_state = None
        self.previous_action = None
        self.model = model
        self.memory = memory
        self.optimizer = optimizer
        self.diagnostics = [0, 0, 0, 0, 0, 0, 0, 0]
        self.base_top_left = None
        self.supply_depot_built = False
        self.scv_selected = False
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)
    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)
    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()

    while True:
        # Linearly annealed epsilon-greedy exploration.
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)
        iteration += 1
        episode_reward = None

        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward
        replay_buffer.push(state, action, next_state, reward, done)
        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(f"Step {iteration}: eps {epsilon} mean reward {mean_reward} episode reward {episode_reward}")
            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)
            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(), f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward
            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        # Wait until the buffer holds enough transitions before learning.
        if len(replay_buffer) < args.replay_start_step:
            continue

        # Hard-sync the target network periodically.
        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()
        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch, policy_network, target_network, args.gamma, device=device)
        loss.backward()
        optimizer.step()

    writer.close()
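# --- Hedged sketch of the calculate_loss helper called in train(): a one-step
# TD loss with the target network providing bootstrapped next-state values.
# The batch layout (states, actions, next_states, rewards, dones) is assumed
# from the replay_buffer.push(...) call above, not confirmed by the source.
import numpy as np
import torch
import torch.nn as nn

def calculate_loss(batch, policy_network, target_network, gamma, device="cpu"):
    states, actions, next_states, rewards, dones = batch
    states_v = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
    next_states_v = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    actions_v = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards_v = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    # Q(s, a) for the actions actually taken.
    q_values = policy_network(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # max_a' Q_target(s', a'); terminal transitions bootstrap to zero.
        next_q = target_network(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0
    target = rewards_v + gamma * next_q
    return nn.MSELoss()(q_values, target)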