def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        alpha (float): learning rate passed to the optimizer
        gamma (float): discount factor
        tau (float): interpolation factor for the soft target-network update
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.alpha = alpha
    self.gamma = gamma
    self.tau = tau

    # Q-learning networks: local (trained) and target (slowly updated)
    self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
    self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.alpha)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
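# A minimal sketch of the soft (Polyak) update that `tau` typically parameterizes
# in agents like the one above. The helper name `soft_update` and its call site
# are illustrative assumptions, not taken from the snippet itself.
def soft_update(local_model, target_model, tau):
    """Blend local weights into the target: theta_target = tau*theta_local + (1 - tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# Typical usage after a learning step:
#     soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)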
def __init__(self, env, args):
    """
    Initialize everything you need here. For example:
        parameters for the neural network
        initialize Q net and target Q net
        parameters for the replay buffer
        parameters for q-learning; decaying epsilon-greedy
        ...
    """
    super(Agent_DQN, self).__init__(env)
    ###########################
    # YOUR IMPLEMENTATION HERE #
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.env = env
    self.buffer = ReplayBuffer()
    self.num_action = self.env.get_action_space().n
    self.cur_step = 0
    self.greedyPolicy = EpsilonGreedyStrategy(1, 0.025, 0.01)
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.num_episode = args.num_episode
    self.learning_rate = args.learning_rate
    self.sample_batch_size = args.sample_batch_size
    self.gamma = args.gamma
    self.e = 1

    if args.test_dqn:
        # you can load your model here
        print('loading trained model')
def __init__(self, env, args):
    """
    Initialize everything you need here. For example:
        parameters for the neural network
        initialize Q net and target Q net
        parameters for the replay buffer
        parameters for q-learning; decaying epsilon-greedy
        ...
    """
    super(Agent_DQN, self).__init__(env)
    ###########################
    # YOUR IMPLEMENTATION HERE #
    self.env = env
    self.batch_size = BATCH_SIZE
    self.gamma = 0.999
    self.eps_start = EPS_START
    self.eps_decay = EPS_DECAY
    self.TARGET_UPDATE = TARGET_UPDATE

    self.policy_net = DQN(self.env.action_space.n)
    self.target_net = DQN(self.env.action_space.n)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    if use_cuda:
        self.policy_net.cuda()
        self.target_net.cuda()

    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
    self.memory = deque(maxlen=10000)

    if args.test_dqn:
        # you can load your model here
        print('loading trained model')
def __init__(self, env, args):
    # Parameters for q-learning
    super(Agent_DQN, self).__init__()
    self.env = env
    state = env.reset()
    state = state.transpose(2, 0, 1)

    self.policy_net = DQN(state.shape, self.env.action_space.n)  # Behavior Q
    self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
    self.target_net.load_state_dict(self.policy_net.state_dict())  # Initial Q

    if USE_CUDA:
        print("Using CUDA . . . ")
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
    print('hyperparameters and network initialized')

    if args.test_dqn or LOAD == True:
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(self.policy_net.state_dict())
def __init__(self, params, player_n=0):
    # unpack the parameters:
    #### simulation
    self.device = params["device"]
    self.env_name = params["env_name"]
    self.training_frames = params["training_frames"]
    self.skip_frames = params["skip_frames"]
    self.nactions = params["nactions"]
    self.messages_enabled = params["messages_enabled"]
    self.selfplay = params["selfplay"]
    #### qnet model
    self.learning_rate = params["learning_rate"]
    self.sync = params["sync"]
    self.load_from = params["load_from"]
    #### buffer
    self.batch_size = params["batch_size"]
    self.replay_size = params["replay_size"]
    self.nstep = params["nstep"]
    #### agent model
    self.gamma = params["gamma"]
    self.eps_start = params["eps_start"]
    self.eps_end = params["eps_end"]
    self.eps_decay_rate = params["eps_decay_rate"]
    self.player_n = player_n
    self.double = params["double"]

    # initialize the simulation with shared properties
    # (environment, agent etc. can't be created jointly in a server simulation)
    self.env = gym.make(self.env_name)
    self.net = DQN(self.env.observation_space.shape[0],
                   self.nactions**2).to(self.device)
def __init__(self, env, args):
    """
    Initialize everything you need here. For example:
        parameters for the neural network
        initialize Q net and target Q net
        parameters for the replay buffer
        parameters for q-learning; decaying epsilon-greedy
    """
    super(Agent_DQN, self).__init__(env)
    ###########################
    # initializations for replay memory
    self.env = env
    self.buffer = collections.deque(maxlen=REPLAY_SIZE)  # initializing a replay memory buffer

    # initializations of agent
    self._reset()
    self.last_action = 0
    self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
    self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)

    LOAD_MODEL = True
    if args.test_dqn:
        # you can load your model here
        print('preparing to load trained model')
        ###########################
        LOAD_MODEL = True

    if LOAD_MODEL:
        self.net.load_state_dict(
            torch.load(MODEL, map_location=lambda storage, loc: storage))
        print('loaded trained model')
        self.target_net.load_state_dict(self.net.state_dict())
def main():
    config = Config()
    config.mode = 'test'
    config.dropout = 1.0
    model = DQN(config)
    model.initialize()
    rewards = evaluate_policy(model, config.T, config.N)
    print(np.mean(rewards))
def __init__(self, env, args): # Training Parameters self.args = args self.env = env self.batch_size = args.batch_size self.lr = args.lr self.gamma = args.gamma_reward_decay self.n_actions = env.action_space.n self.output_logs = args.output_logs self.step = 8e6 self.curr_step = 0 self.ckpt_path = args.save_dir self.epsilon = args.eps_start self.eps_end = args.eps_end self.target_update = args.update_target self.observe_steps = args.observe_steps self.explore_steps = args.explore_steps self.saver_steps = args.saver_steps self.resume = args.resume self.writer = TensorboardSummary(self.args.log_dir).create_summary() # Model Settings self.cuda = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.policy_net = DQN(4, self.n_actions) self.target_net = DQN(4, self.n_actions) self.target_net.load_state_dict(self.policy_net.state_dict()) if self.cuda: self.policy_net.to(self.cuda) self.target_net.to(self.cuda) self.target_net.eval() train_params = self.policy_net.parameters() self.optimizer = optim.RMSprop(train_params, self.lr, momentum=0.95, eps=0.01) self.memory = ReplayMemory(args.replay_memory_size) if args.resume: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) self.epsilon = checkpoint['epsilon'] self.curr_step = checkpoint['step'] self.policy_net.load_state_dict(checkpoint['policy_state_dict']) self.target_net.load_state_dict(checkpoint['target_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['episode']))
def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # #Gym parameters self.num_actions = env.action_space.n # parameters for repaly buffer self.buffer_max_len = 20000 self.buffer = deque(maxlen=self.buffer_max_len) self.episode_reward_list = [] self.moving_reward_avg = [] # paramters for neural network self.batch_size = 32 self.gamma = 0.999 self.eps_threshold = 0 self.eps_start = 1 self.eps_end = 0.025 self.max_expisode_decay = 10000 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") #Training self.steps_done = 0 self.num_episode = 20000 self.target_update = 5000 self.learning_rate = 1.5e-4 # Neural Network self.policy_net = DQN().to(self.device) self.target_net = DQN().to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) if args.test_dqn: #you can load your model here print('loading trained model') self.policy_net = torch.load('policy_net.hb5') self.policy_net.eval()
def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) self.action = env.get_action_space() ########################### # YOUR IMPLEMENTATION HERE # self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') print('Using device:', self.device) self.model = DQN().to(self.device) self.model_target = DQN().to(self.device) self.episode = 100000 self.max_steps_per_episode = 14000 self.update_target_network = 10000 self.epsilon = 1.0 self.min_epsilon = 0.1 self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6) self.env = env self.history = [] self.buffer_size = min(args.history_size // 5, 2000) self.history_size = args.history_size self.learning_rate = 1e-4 self.name = args.name self.batch_size = 32 self.gamma = 0.99 self.priority = [] self.w = 144 self.h = 256 self.mode = args.mode self.delay = args.delay self.epoch = args.continue_epoch if args.test_dqn or self.epoch > 0: #you can load your model here print('loading trained model') ########################### self.model.load_state_dict( torch.load(self.name + '.pth', map_location=self.device)) self.model_target.load_state_dict( torch.load(self.name + '.pth', map_location=self.device))
def train(): frame_idx = 0 ts_frame = 0 ts = time.time() env = gym.make(DEFAULT_ENV_NAME) model = DQN(env.observation_space.shape, env.action_space.n, LEARNING_RATE) state = env.reset() c = collections.Counter() buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START total_rewards = [] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) while True: frame_idx += 1 epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_FRAMES) reward = agent.play_step(sess, model, epsilon) if reward is not None: total_rewards.append(reward) speed = (frame_idx - ts_frame) / (time.time() - ts) ts_frame = frame_idx ts = time.time() mean_reward = np.mean(total_rewards[-100:]) print( "%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (frame_idx, len(total_rewards), mean_reward, epsilon, speed)) if len(buffer) < REPLAY_MIN_SIZE: continue batch = buffer.sample(BATCH_SIZE) print(len(batch[0])) agent.fit_batch(sess, model, batch)
def infer(env, mlp=False):
    action_size = env.action_space.n
    if mlp:
        policy_net = MLP(num_actions=action_size)
    else:
        policy_net = DQN()
    policy_net.load_state_dict(
        torch.load('saved_weights/Episode 2078-checkpoint.pth'))
    policy_net.eval()  # inference only

    for i in range(3):
        env.reset()
        state = get_screen(env)
        for j in range(200):
            # greedy action from the trained network; no gradients needed at inference time
            with torch.no_grad():
                action = policy_net(state).max(1)[1].view(1, 1)
            env.render()
            _, reward, done, _ = env.step(action.item())
            state = get_screen(env)
            if done:
                break
    env.close()
def __init__(self, env, args):
    """
    Initialize everything you need here. For example:
        parameters for the neural network
        initialize Q net and target Q net
        parameters for the replay buffer
        parameters for q-learning; decaying epsilon-greedy
        ...
    """
    super(Agent_DQN, self).__init__(env)
    ###########################
    # YOUR IMPLEMENTATION HERE #
    self.replay_buffer_size = 10000
    self.start_to_learn = 5000
    self.update_target_net = 5000
    self.learning_rate = 1.5e-4
    self.batch_size = 32
    self.buffer_ = deque(maxlen=self.replay_buffer_size)
    self.epsilon = 0.99
    self.epsilon_ = 0.2

    if args.test_dqn:
        # you can load your model here
        print('loading trained model')

    ###########################
    # YOUR IMPLEMENTATION HERE #
    self.Net = DQN()
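# The agent above declares `buffer_` and `batch_size` but no sampling routine.
# A minimal sketch of uniform minibatch sampling from such a deque; the function
# name and unpacking order are assumptions for illustration, not this agent's API.
import random
import numpy as np

def sample_minibatch(buffer_, batch_size):
    """Uniformly sample `batch_size` stored (state, action, reward, next_state, done) tuples."""
    states, actions, rewards, next_states, dones = zip(*random.sample(buffer_, batch_size))
    return (np.stack(states), np.array(actions), np.array(rewards, dtype=np.float32),
            np.stack(next_states), np.array(dones, dtype=np.float32))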
def _create_model(self):
    """
    Create a deep Q model for function approximation with Adam optimizer.

    :return: tgt_net, optimizer
    """
    tgt_net = DQN(self.env.observation_space.shape[0],
                  self.nactions**2).to(self.device)
    if self.load_from is not None:
        assert type(self.load_from) == str, \
            "Name of model to be loaded has to be a string!"
        self.net.load_state_dict(torch.load(self.load_from))
        tgt_net.load_state_dict(torch.load(self.load_from))
    optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
    return tgt_net, optimizer
def __init__(self, state_size, action_size, batch_size, learn_step_size,
             buffer_size, gamma, learning_rate, tau, seed):
    """
    Initialize the agent and its learning parameter set.

    Parameters
    =========
        state_size (int): Size of the state space
        action_size (int): Size of the action space
        batch_size (int): Size of the batch used in each learning step
        learn_step_size (int): Number of steps until the agent is trained again
        buffer_size (int): Size of the replay memory buffer
        gamma (float): Discount rate that scales future rewards
        learning_rate (float): Learning rate of the neural network
        tau (float): Update strength between local and target network
        seed (float): Random seed for initialization
    """
    # ----- Parameter init -----
    # State and action size from environment
    self.state_size = state_size
    self.action_size = action_size

    # Replay buffer and learning properties
    self.batch_size = batch_size
    self.learn_step_size = learn_step_size
    self.gamma = gamma
    self.tau = tau

    # General
    self.seed = random.seed(seed)

    # ----- Network and memory init -----
    # Init identical NNs as local and target networks and set optimizer
    self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
    self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

    # Initialize replay memory and time step (for updating every learn_step_size steps)
    self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
    self.t_step = 0
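# A minimal sketch (not from the snippet) of how `learn_step_size` and the replay
# buffer above are typically wired together in a step() method. The method names
# `step`, `add`, `sample`, and `learn` are assumptions for illustration only.
def step(self, state, action, reward, next_state, done):
    # Save the transition in replay memory
    self.memory.add(state, action, reward, next_state, done)

    # Learn every `learn_step_size` time steps, once enough samples are stored
    self.t_step = (self.t_step + 1) % self.learn_step_size
    if self.t_step == 0 and len(self.memory) > self.batch_size:
        experiences = self.memory.sample()
        self.learn(experiences, self.gamma)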
def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.memory = [] self.env = env self.n_actions = env.env.action_space.n self.policy_net = DQN(4, self.n_actions).to(device).float() self.target_net = DQN(4, self.n_actions).to(device).float() # self.policy_net.load_state_dict(torch.load("best_weights_model.pt")) self.target_net.load_state_dict(self.policy_net.state_dict()) self.eps_threshold = EPS_START self.args = args self.test_count = 0 self.max_reward = 0 self.max_reward_so_far = 0 self.reward_buffer = [] self.flag = 0 self.steps_done = 0 # self.target_net.eval() self.transition = [] self.test_mean_reward = 0 self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE) if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # self.policy_net.load_state_dict( torch.load("best_weights_model.pt", map_location=torch.device('cpu')))
def main(env, mlp=False): action_size = env.action_space.n if mlp: policy_net = MLP(num_actions=action_size) target_net = MLP(num_actions=action_size) else: policy_net = DQN() target_net = DQN() target_net.load_state_dict(policy_net.state_dict()) target_net.eval() optimizer = optim.RMSprop( policy_net.parameters(), lr=0.00025) # optim.Adam(policy_net.parameters(), lr=5e-4) memory = ReplayMemory(int(1e5)) # print('done') scores = train_dqn(env=env, policy_net=policy_net, target_net=target_net, memory=memory, GAMMA=GAMMA, BATCH_SIZE=BATCH_SIZE, EPS_START=EPS_START, EPS_END=EPS_END, EPS_DECAY=EPS_DECAY, TARGET_UPDATE=TARGET_UPDATE, n_episodes=n_episodes, get_screen=get_screen, optimizer=optimizer) # plot the scores fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(len(scores)), scores) plt.ylabel('Score') plt.xlabel('Episode #') plt.show() print('Complete')
def __init__(self, env, args): """ Initialize every things you need here. For example: building your model """ super(Agent_DQN, self).__init__(env) self.env = env self.args = args self.episode = 0 self.n_actions = self.env.action_space.n self.epsilon_start = 1.0 self.epsilon_final = 0.025 self.epsilon_decay = 3000 self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + ( self.epsilon_start - self.epsilon_final) * math.exp( -1. * frame_idx / self.epsilon_decay) self.epsilon = 0 self.eval_net = DQN().cuda() self.target_net = DQN().cuda() self.target_net.load_state_dict(self.eval_net.state_dict()) self.criterion = nn.MSELoss() #self._model = Net(self.env.observation_space.shape, self.env.action_space.n) self._use_cuda = torch.cuda.is_available() self.optim = torch.optim.Adam(self.eval_net.parameters(), lr=self.args.learning_rate) if self._use_cuda: self.eval_net = self.eval_net.cuda() self.target_net = self.target_net.cuda() self.criterion = self.criterion.cuda() # self.replaybuffer = ReplayBuffer(args.buffer_size) self.buffer = deque(maxlen=10000) if args.test_dqn: #you can load your model here print('loading trained model') self.eval_net.load_state_dict(torch.load(args.model_dqn)) self.target_net.load_state_dict(self.eval_net.state_dict()) if self._use_cuda: self.eval_net = self.eval_net.cuda() self.target_net = self.target_net.cuda()
def __init__(self, env, test=False):
    self.cuda = torch.device('cuda')
    print("Using device: " + torch.cuda.get_device_name(self.cuda), flush=True)
    self.env = env
    self.state_shape = env.observation_space.shape
    self.n_actions = env.action_space.n
    self.memory = deque(maxlen=100000)
    self.batch_size = 32
    self.mem_threshold = 50000
    self.gamma = 0.99
    self.learning_rate = 1e-4
    self.epsilon = 1.0
    self.epsilon_min = 0.05
    self.epsilon_period = 10000
    self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_period
    self.update_rate = 4
    self.omega = 100
    self.start_epoch = 1
    self.epochs = 1
    self.epoch = 20000

    self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
    print("DQN parameters: {}".format(count_parameters(self.model)))
    self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
    self.target.eval()
    self.target_update = 10000
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    if test:
        self.model.load_state_dict(torch.load('model.pt'))
def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN,self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.epochs = 10 self.n_episodes = 1000000 self.env = env self.nA = self.env.action_space.n # self.nS = self.env.observation_space self.batch_size = 32 self.DQN = DQN() self.Target_DQN = DQN() self.buffer_memory = 1000000 self.train_buffer_size = 4 self.min_buffer_size = 10000 self.target_update_buffer = 10000 self.learning_rate = 0.0001 self.discount_factor = 0.999 self.epsilon = 1 self.min_epsilon = 0.01 # self.decay_rate = 0.999 self.ep_decrement = (self.epsilon - self.min_epsilon)/self.n_episodes self.criteria = nn.MSELoss() self.optimiser = optim.Adam(self.DQN.parameters(),self.learning_rate) self.buffer=[] self.Evaluation = 100000 self.total_evaluation__episodes = 100 self.full_train = 100000 if args.test_dqn: #you can load your model here print('loading trained model')
def __init__(self, FLAGS):
    self.FLAGS = FLAGS
    self.env = gym.make('CartPole-v1')
    self.state_size = len(self.env.observation_space.sample())
    self.num_episodes = 1000
    self.exp_replay = ExperienceReplay()

    target_network = DQN(scope='target',
                         env=self.env,
                         target_network=None,
                         flags=FLAGS,
                         exp_replay=None)
    self.q_network = DQN(scope='q_network',
                         env=self.env,
                         target_network=target_network,
                         flags=FLAGS,
                         exp_replay=self.exp_replay)

    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    self.q_network.set_session(session)
    target_network.set_session(session)
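# A sketch of how a 'target' variable scope is commonly synced to the 'q_network'
# scope in TF1-style code like the snippet above. The snippet's DQN class may
# already handle this internally; the helper below is an illustrative assumption,
# not its actual API.
import tensorflow as tf

def make_target_update_op(q_scope='q_network', target_scope='target'):
    q_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope),
                    key=lambda v: v.name)
    t_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope),
                    key=lambda v: v.name)
    # Copy every q-network variable into the corresponding target variable
    return tf.group(*[t.assign(q) for q, t in zip(q_vars, t_vars)])

# Typical usage: sync_op = make_target_update_op(); session.run(sync_op) every N steps.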
def __init__(self, env, args): """ Initialize every things you need here. For example: building your model """ super(Agent_DQN, self).__init__(env) if args.test_dqn: # you can load your model here print('loading trained model') ################## # YOUR CODE HERE # ################## self.env = env self.batch_size = BATCH_SIZE self.gamma = GAMMA self.eps_start = EPS_START self.eps_decay = EPS_DECAY self.TARGET_UPDATE = TARGET_UPDATE self.policy_net = DQN(self.env.action_space.n) self.target_net = DQN(self.env.action_space.n) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.policy_net.to(device) self.target_net.to(device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1.5e-4) self.memory = ReplayMemory(10000) if args.test_dqn: # you can load your model here print('loading trained model') self.policy_net.load_state_dict( torch.load(os.path.join('save_dir/' 'model-best.pth'), map_location=torch.device('cpu'))) self.policy_net.eval()
def __init__(self, env, args):
    """
    Initialize everything you need here. For example:
        parameters for the neural network
        initialize Q net and target Q net
        parameters for the replay buffer
        parameters for q-learning; decaying epsilon-greedy
        ...
    """
    super(Agent_DQN, self).__init__(env)
    ###########################
    # YOUR IMPLEMENTATION HERE #
    self.epsilon_start = 1
    self.epsilon_end = 0.02
    self.epsilon_decay = 200000
    self.epsilon = self.epsilon_start
    self.gamma = 0.99
    self.env = env
    self.buffer_size = 30000
    self.buffer = deque(maxlen=30000)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=0.00015)

    self.reward_array = []
    self.reward_x_axis = []
    self.batch_size = 32

    if args.test_dqn:
        # you can load your model here
        print('loading trained model')
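# The fields epsilon_start, epsilon_end, and epsilon_decay above imply a decaying
# schedule, but the snippet does not show one. A common choice (an assumption here,
# not this agent's actual code) is an exponential anneal over environment steps:
import math

def epsilon_by_step(step, eps_start=1.0, eps_end=0.02, eps_decay=200000):
    """Exponentially anneal epsilon from eps_start toward eps_end."""
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

# e.g. self.epsilon = epsilon_by_step(step, self.epsilon_start, self.epsilon_end, self.epsilon_decay)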
class Agent_DQN():
    def __init__(self, env, args):
        # Parameters for q-learning
        super(Agent_DQN, self).__init__()
        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)

        self.policy_net = DQN(state.shape, self.env.action_space.n)  # Behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())  # Initial Q

        if USE_CUDA:
            print("Using CUDA . . . ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD == True:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def init_game_setting(self):
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        # Sample a minibatch of (state, action, reward, next_state, done) tuples
        state, action, reward, next_state, done = zip(
            *random.sample(memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def __len__(self):
        return len(self.buffer)

    def make_action(self, observation, test=True):
        observation = observation.transpose(2, 0, 1)
        if np.random.random() > EPSILON or test == True:
            observation = Variable(torch.FloatTensor(np.float32(observation)).unsqueeze(0),
                                   volatile=True)
            q_value = self.policy_net.forward(observation)
            action = q_value.max(1)[1].data[0]
            action = int(action.item())
        else:
            action = random.randrange(4)
        return action

    def optimize_model(self):
        # Unpack in the same order replay_buffer() returns:
        # (states, actions, rewards, next_states, dones)
        states, actions, rewards, next_states, dones = self.replay_buffer()

        states_v = Variable(torch.FloatTensor(np.float32(states)))
        next_states_v = Variable(torch.FloatTensor(np.float32(next_states)), volatile=True)
        actions_v = Variable(torch.LongTensor(actions))
        rewards_v = Variable(torch.FloatTensor(rewards))
        done = Variable(torch.FloatTensor(dones))

        state_action_values = self.policy_net(states_v).gather(
            1, actions_v.unsqueeze(1)).squeeze(1)
        next_state_values = self.target_net(next_states_v).max(1)[0]
        expected_q_value = rewards_v + next_state_values * GAMMA * (1 - done)

        loss = (state_action_values - Variable(expected_q_value.data)).pow(2).mean()
        return loss

    def train(self):
        global EPSILON  # EPSILON is module-level state shared with make_action()
        optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA)

        # Fill the memory with experiences
        print('Gathering experiences ...')
        meanScore = 0
        AvgRewards = []
        AllScores = []
        step = 1
        iEpisode = 0

        while meanScore < 50:
            state = self.env.reset()
            EpisodeScore = 0
            tBegin = time.time()
            done = False

            while not done:
                action = self.make_action(state)
                nextState, reward, done, _ = self.env.step(action)
                # Store the transition in the order push() expects:
                # (state, action, reward, next_state, done)
                self.push(state.transpose(2, 0, 1), action, reward,
                          nextState.transpose(2, 0, 1), done)
                state = nextState

                if len(memory) > StartLearning:
                    loss = self.optimize_model()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    iEpisode = 0
                    continue

                # Update exploration factor
                EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * step / EPS_DECAY)
                storeEpsilon.append(EPSILON)
                step += 1
                EpisodeScore += reward

                if step % TARGET_UPDATE == 0:
                    print('Updating Target Network . . .')
                    self.target_net.load_state_dict(self.policy_net.state_dict())

            iEpisode += 1
            AllScores.append(EpisodeScore)
            meanScore = np.mean(AllScores[-100:])
            AvgRewards.append(meanScore)

            if len(memory) > StartLearning:
                print('Episode: ', iEpisode, ' score:', EpisodeScore,
                      ' Avg Score:', meanScore, ' epsilon: ', EPSILON,
                      ' t: ', time.time() - tBegin, ' loss:', loss.item())
            else:
                print('Gathering Data . . .')

            if iEpisode % 500 == 0:
                torch.save(
                    {
                        'epoch': iEpisode,
                        'model_state_dict': self.policy_net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss,
                        'AvgRewards': AvgRewards
                    }, 'trainData')
                os.remove("Rewards.csv")
                with open('Rewards.csv', mode='w') as dataFile:
                    rewardwriter = csv.writer(dataFile, delimiter=',',
                                              quotechar='"',
                                              quoting=csv.QUOTE_MINIMAL)
                    rewardwriter.writerow(AvgRewards)

        print('======== Complete ========')
        torch.save(
            {
                'epoch': iEpisode,
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'AvgRewards': AvgRewards
            }, 'trainData')
        with open('Rewards.csv', mode='w') as dataFile:
            rewardwriter = csv.writer(dataFile, delimiter=',', quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            rewardwriter.writerow(AvgRewards)
experience_replay_buffer = ReplayMemory()
episode_rewards = np.zeros(num_episodes)
episode_lens = np.zeros(num_episodes)
epsilon = 1.0
epsilon_min = 0.1
epsilon_change = (epsilon - epsilon_min) / 500000

env = gym.make('gym_quantum_pong:Quantum_Pong-v0')
# monitor_dir = 'video'
# env = wrappers.Monitor(env, monitor_dir)

model = DQN(K=K,
            conv_layer_sizes=conv_layer_sizes,
            hidden_layer_sizes=hidden_layer_sizes,
            scope="model",
            image_size=IM_SIZE)
target_model = DQN(K=K,
                   conv_layer_sizes=conv_layer_sizes,
                   hidden_layer_sizes=hidden_layer_sizes,
                   scope="target_model",
                   image_size=IM_SIZE)
image_transformer = ImageTransformer(IM_SIZE)

with tf.Session() as sess:
    model.set_session(sess)
    target_model.set_session(sess)
    # model.load()
class Simulation: """ Simulation for the game of 3D Pong. Parameters ---------- params: dict Dictionary of all the simulation parameters """ def __init__(self, params, player_n=0): # unpack the parameters: #### simulation self.device = params["device"] self.env_name = params["env_name"] self.training_frames = params["training_frames"] self.skip_frames = params["skip_frames"] self.nactions = params["nactions"] self.messages_enabled = params["messages_enabled"] self.selfplay = params["selfplay"] #### qnet model self.learning_rate = params["learning_rate"] self.sync = params["sync"] self.load_from = params["load_from"] #### buffer self.batch_size = params["batch_size"] self.replay_size = params["replay_size"] self.nstep = params["nstep"] #### agent model self.gamma = params["gamma"] self.eps_start = params["eps_start"] self.eps_end = params["eps_end"] self.eps_decay_rate = params["eps_decay_rate"] self.player_n = player_n self.double = params["double"] # initialize the simulation with shared properties self.env = gym.make( self.env_name ) # environment, agent etc. can"t be created jointly in a server simulation self.net = DQN(self.env.observation_space.shape[0], self.nactions**2).to(self.device) def _create_environment(self): """ create a gym environment for the simulation. Actions are discretized into nactions and frames are skipped for faster training :return: env """ env = gym.make(self.env_name) if self.selfplay: env.unwrapped.multiplayer(env, game_server_guid="selfplayer", player_n=self.player_n) env = wrappers.action_space_discretizer(env, n=self.nactions) env = wrappers.SkipEnv(env, skip=self.skip_frames) return env def _create_agent(self, env): """ Create agent with buffer for the simulation. :return: agent """ # buffer = ExperienceBuffer(self.replay_size) buffer = Extendedbuffer(self.replay_size, nstep=self.nstep, gamma=self.gamma) agent = pongagent.Pongagent(env, self.player_n, buffer) return agent def _create_model(self): """ Create a deep Q model for function approximation with Adam optimizer. :return: net, tgt_net, optimizer """ tgt_net = DQN(self.env.observation_space.shape[0], self.nactions**2).to(self.device) if self.load_from is not None: assert type( self.load_from ) == str, "Name of model to be loaded has to be a string!" 
self.net.load_state_dict(torch.load(self.load_from)) tgt_net.load_state_dict(torch.load(self.load_from)) optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate) return tgt_net, optimizer def _init_non_shared(self, player_n): env = self._create_environment() tgt_net, optimizer = self._create_model() agent = self._create_agent(env) writer = SummaryWriter( comment="-" + "player" + str(player_n) + "batch" + str(self.batch_size) + "_n" + str(env.action_space.n) + "_eps" + str(self.eps_decay_rate) + "_skip" + str(self.skip_frames) + "learning_rate" + str(self.learning_rate)) return env, agent, tgt_net, optimizer, writer def _fill_buffer(self, agent): if self.messages_enabled: print("Player populating Buffer ...") agent.exp_buffer.fill(agent.env, self.replay_size, self.nstep) if self.messages_enabled: print("Buffer_populated!") def train(self, net, player_n=0): self.net = net env, agent, tgt_net, optimizer, writer = self._init_non_shared( player_n) self._fill_buffer(agent) if self.messages_enabled: print("Player %i start training: " % player_n) reward = [] for frame in range(self.training_frames): epsilon = max(self.eps_end, self.eps_start - frame / self.eps_decay_rate) ep_reward = agent.play_step(net, epsilon, self.device) if ep_reward: reward.append(ep_reward) writer.add_scalar("episode_reward", ep_reward, frame) writer.add_scalar("mean100_reward", np.mean(reward[-100:]), frame) if (frame % self.sync) == 0: tgt_net.load_state_dict( net.state_dict()) # Syncs target and Standard net if self.messages_enabled: print("We are at: %7i / %7i frames" % (frame, self.training_frames)) if player_n == 0: torch.save(net.state_dict(), self.env_name + "-time_update.dat") optimizer.zero_grad() batch = agent.exp_buffer.sample(self.batch_size) loss_t = calc_loss(batch, net, tgt_net, self.gamma**self.nstep, self.double, self.device) loss_t.backward() optimizer.step() writer.add_scalar("loss", loss_t, frame) writer.add_scalar("epsilon", epsilon, frame) writer.close() if self.messages_enabled: print("Player %i end training!" % player_n) torch.save(net.state_dict(), self.env_name + "end_of_training.dat") return np.mean(reward[-len(reward) // 2:]) # TODO: clean this function! def run(self, mode="play"): """ runs the simulation. :param mode: str, either "play" or "train" :return: mean reward over all episodes with eps_end """ if mode == "train": reward = self.train(self.net) return reward elif mode == "play": # Run play.py to see model in action pass else: raise Exception("Mode should be either play or train")
class Agent_DQN(): def __init__(self, env, test=False): self.cuda = torch.device('cuda') print("Using device: " + torch.cuda.get_device_name(self.cuda), flush=True) self.env = env self.state_shape = env.observation_space.shape self.n_actions = env.action_space.n self.memory = deque(maxlen=100000) self.batch_size = 32 self.mem_threshold = 50000 self.gamma = 0.99 self.learning_rate = 1e-4 self.epsilon = 1.0 self.epsilon_min = 0.05 self.epsilon_period = 10000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_period self.update_rate = 4 self.start_epoch = 1 self.epochs = 10 self.epoch = 10000 self.model = DQN(self.state_shape, self.n_actions).to(self.cuda) print("DQN parameters: {}".format(count_parameters(self.model))) self.target = DQN(self.state_shape, self.n_actions).to(self.cuda) self.target.eval() self.target_update = 10000 self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate) if test: self.model.load_state_dict(torch.load('model.pt')) def init_game_setting(self): pass def make_action(self, observation, test=False): epsilon = 0.01 if test else self.epsilon # turn action into tensor observation = torch.tensor(observation, device=self.cuda, dtype=torch.float) # turn off learning self.model.eval() # epsilon greedy policy if random.random() > epsilon: # no need to calculate gradient with torch.no_grad(): # choose highest value action b = self.model(observation) b = b.cpu().data.numpy() action = np.random.choice( np.flatnonzero(np.isclose(b, b.max()))) else: # random action action = random.choice(np.arange(self.n_actions)) # turn learning back on self.model.train() return action def replay_buffer(self): # Return tuple of sars transitions states, actions, rewards, next_states, dones = zip( *random.sample(self.memory, self.batch_size)) states = torch.tensor(np.vstack(states), device=self.cuda, dtype=torch.float) actions = torch.tensor(np.array(actions), device=self.cuda, dtype=torch.long) rewards = torch.tensor(np.array(rewards, dtype=np.float32), device=self.cuda, dtype=torch.float) next_states = torch.tensor(np.vstack(next_states), device=self.cuda, dtype=torch.float) dones = torch.tensor(np.array(dones, dtype=np.float32), device=self.cuda, dtype=torch.float) return states, actions, rewards, next_states, dones def experience_replay(self, n=0): # clamp gradient clamp = False # Reset gradient (because it accumulates by default) self.optimizer.zero_grad() # sample experience memory states, actions, rewards, next_states, dones = self.replay_buffer() # get Q(s,a) for sample Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) # get max_a' Q(s',a') Q_prime = self.target(next_states).detach().max(1)[0] # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states Y = rewards + (self.gamma * Q_prime) * (1 - dones) # Huber loss of Q and Y loss = F.smooth_l1_loss(Q, Y) # Compute dloss/dx loss.backward() # Clamp gradient if clamp: for param in self.model.parameters(): param.grad.data.clamp_(-1, 1) # Change the weights self.optimizer.step() def train(self): step = 0 learn_step = 0 print("Begin Training:", flush=True) learn_curve = [] last30 = deque(maxlen=30) for epoch in range(self.start_epoch, self.epochs + 1): durations = [] rewards = [] flag = [] # progress bar epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200) for episode in epoch_bar: # reset state state = self.env.reset() # decay epsilon if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay # run one episode done = False ep_duration = 0 ep_reward = 0 
while not done: step += 1 ep_duration += 1 # get epsilon-greedy action action = self.make_action(state) # do action next_state, reward, done, info = self.env.step(action) ep_reward += reward # add transition to replay memory self.memory.append( Transition(state, action, reward, next_state, done)) state = next_state # learn from experience, if available if step % self.update_rate == 0 and len( self.memory) > self.mem_threshold: self.experience_replay(learn_step) learn_step += 1 # update target network if step % self.target_update == 1: self.target.load_state_dict(self.model.state_dict()) durations.append(ep_duration) rewards.append(ep_reward) last30.append(ep_reward) learn_curve.append(np.mean(last30)) flag.append(info['flag_get']) epoch_bar.set_description( "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}" .format(epoch, self.epochs, np.mean(durations), np.mean(rewards), learn_curve[-1])) # save model every epoch plt.clf() plt.plot(learn_curve) plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward") plt.xlabel('Episodes') plt.ylabel('Moving Average Reward') if not os.path.exists(f"{save_prefix}_DQN"): os.mkdir(f"{save_prefix}_DQN") torch.save(self.model.state_dict(), f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt') pickle.dump( rewards, open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb')) pickle.dump(flag, open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb')) plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png") learn_curve = []
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. ###### model = DQN(in_channels=input_arg, num_actions=num_actions) target_Q = DQN(in_channels=input_arg, num_actions=num_actions) if USE_CUDA: target_Q = target_Q.cuda() model = model.cuda() ###### # Construct Q network optimizer function optimizer = optimizer_spec.constructor(model.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) 
into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### OUR CODE idx = replay_buffer.store_frame(last_obs) encoded_obs = replay_buffer.encode_recent_observation() if t > learning_starts: action = select_epilson_greedy_action(model, encoded_obs, t)[0] else: action = random.randrange(num_actions) obs, reward, done, info = env.step(action) reward = max(-1.0, min(reward, 1.0)) replay_buffer.store_effect(idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) act_batch = Variable( torch.Tensor(act_batch).type(torch.LongTensor)) rew_batch = Variable(torch.from_numpy(rew_batch)) done_mask = Variable( torch.Tensor([1. if val == 0 else 0. for val in done_mask])) if USE_CUDA: done_mask = done_mask.cuda() act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() obs_batch = obs_batch.cuda() next_obs_batch = next_obs_batch.cuda() # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. 
# Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # We choose Q based on action taken. current_Q_values = model(obs_batch).gather( 1, act_batch.unsqueeze(1)) #[0, act_batch] # 5. Obtain maxQ' and set our target value for chosen action using the bellman equation. next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = torch.mul(done_mask, next_max_q) target_Q_values = rew_batch + (gamma * next_Q_values) if USE_CUDA: target_Q_values = target_Q_values.cuda() d_error = target_Q_values.unsqueeze(1) - current_Q_values d_error = d_error.clamp(-1, 1) * -1 # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. optimizer.zero_grad() current_Q_values.backward(d_error) optimizer.step() num_param_updates += 1 # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(model.state_dict()) # YOUR CODE HERE ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if hasattr(exploration, 'add_reward'): exploration.add_reward(episode_rewards) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
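# The snippet above back-propagates the clipped TD error directly through the
# current Q-values (current_Q_values.backward(d_error)). A minimal sketch of the
# equivalent formulation with an explicit Huber loss, assuming the same batch
# tensors; the helper name and signature are illustrative, not the original code.
import torch.nn.functional as F

def huber_q_update(optimizer, current_q, target_q):
    """One gradient step minimizing the Huber (smooth L1) loss between Q(s,a) and the Bellman target."""
    optimizer.zero_grad()
    # Per-sample, smooth_l1_loss has gradient equal to the TD error clipped to [-1, 1],
    # mirroring the manual d_error.clamp(-1, 1) trick above (up to the 1/batch_size mean reduction).
    loss = F.smooth_l1_loss(current_q, target_q.detach())
    loss.backward()
    optimizer.step()
    return loss.item()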
def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # # Declare variables self.exp_id = uuid.uuid4().__str__().replace('-', '_') self.args = args self.env = env self.eps_threshold = None self.nA = env.action_space.n self.action_list = np.arange(self.nA) self.reward_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.max_q_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.loss_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.probability_list = np.zeros(env.action_space.n, np.float32) self.cur_eps = self.args.eps self.t = 0 self.ep_len = 0 self.mode = None if self.args.use_pri_buffer: self.replay_buffer = NaivePrioritizedBuffer( capacity=self.args.capacity, args=self.args) else: self.replay_buffer = ReplayBuffer(capacity=self.args.capacity, args=self.args) self.position = 0 self.args.save_dir += f'/{self.exp_id}/' os.system(f"mkdir -p {self.args.save_dir}") self.meta = MetaData(fp=open( os.path.join(self.args.save_dir, 'result.csv'), 'w'), args=self.args) self.eps_delta = (self.args.eps - self.args.eps_min) / self.args.eps_decay_window self.beta_by_frame = lambda frame_idx: min( 1.0, args.pri_beta_start + frame_idx * (1.0 - args.pri_beta_start) / args.pri_beta_decay) # Create Policy and Target Networks if self.args.use_dueling: print("Using dueling dqn . . .") self.policy_net = DuelingDQN(env, self.args).to(self.args.device) self.target_net = DuelingDQN(env, self.args).to(self.args.device) elif self.args.use_crnn: print("Using dueling crnn . . .") self.policy_net = CrnnDQN(env).to(self.args.device) self.target_net = CrnnDQN(env).to(self.args.device) else: self.policy_net = DQN(env, self.args).to(self.args.device) self.target_net = DQN(env, self.args).to(self.args.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.args.lr, eps=self.args.optimizer_eps) if self.args.lr_scheduler: print("Enabling LR Decay . . .") self.scheduler = optim.lr_scheduler.ExponentialLR( optimizer=self.optimizer, gamma=self.args.lr_decay) self.cur_lr = self.optimizer.param_groups[0]['lr'] # Compute Huber loss self.loss = F.smooth_l1_loss # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370 self.policy_net.share_memory() self.target_net.share_memory() # Set defaults for networks self.policy_net.train() self.target_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) if args.test_dqn: # you can load your model here ########################### # YOUR IMPLEMENTATION HERE # print('loading trained model') self.load_model() if args.use_pri_buffer: print('Using priority buffer . . .') if args.use_double_dqn: print('Using double dqn . . .') if args.use_bnorm: print("Using batch normalization . . .") print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # # Declare variables self.exp_id = uuid.uuid4().__str__().replace('-', '_') self.args = args self.env = env self.eps_threshold = None self.nA = env.action_space.n self.action_list = np.arange(self.nA) self.reward_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.max_q_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.loss_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.probability_list = np.zeros(env.action_space.n, np.float32) self.cur_eps = self.args.eps self.t = 0 self.ep_len = 0 self.mode = None if self.args.use_pri_buffer: self.replay_buffer = NaivePrioritizedBuffer( capacity=self.args.capacity, args=self.args) else: self.replay_buffer = ReplayBuffer(capacity=self.args.capacity, args=self.args) self.position = 0 self.args.save_dir += f'/{self.exp_id}/' os.system(f"mkdir -p {self.args.save_dir}") self.meta = MetaData(fp=open( os.path.join(self.args.save_dir, 'result.csv'), 'w'), args=self.args) self.eps_delta = (self.args.eps - self.args.eps_min) / self.args.eps_decay_window self.beta_by_frame = lambda frame_idx: min( 1.0, args.pri_beta_start + frame_idx * (1.0 - args.pri_beta_start) / args.pri_beta_decay) # Create Policy and Target Networks if self.args.use_dueling: print("Using dueling dqn . . .") self.policy_net = DuelingDQN(env, self.args).to(self.args.device) self.target_net = DuelingDQN(env, self.args).to(self.args.device) elif self.args.use_crnn: print("Using dueling crnn . . .") self.policy_net = CrnnDQN(env).to(self.args.device) self.target_net = CrnnDQN(env).to(self.args.device) else: self.policy_net = DQN(env, self.args).to(self.args.device) self.target_net = DQN(env, self.args).to(self.args.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.args.lr, eps=self.args.optimizer_eps) if self.args.lr_scheduler: print("Enabling LR Decay . . .") self.scheduler = optim.lr_scheduler.ExponentialLR( optimizer=self.optimizer, gamma=self.args.lr_decay) self.cur_lr = self.optimizer.param_groups[0]['lr'] # Compute Huber loss self.loss = F.smooth_l1_loss # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370 self.policy_net.share_memory() self.target_net.share_memory() # Set defaults for networks self.policy_net.train() self.target_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) if args.test_dqn: # you can load your model here ########################### # YOUR IMPLEMENTATION HERE # print('loading trained model') self.load_model() if args.use_pri_buffer: print('Using priority buffer . . .') if args.use_double_dqn: print('Using double dqn . . .') if args.use_bnorm: print("Using batch normalization . . 
.") print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n') def init_game_setting(self): pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # with torch.no_grad(): if self.args.test_dqn: q, argq = self.policy_net( Variable( self.channel_first(observation))).data.cpu().max(1) return self.action_list[argq] # Fill up probability list equal for all actions self.probability_list.fill(self.cur_eps / self.nA) # Fetch q from the model prediction q, argq = self.policy_net(Variable( self.channel_first(observation))).data.cpu().max(1) # Increase the probability for the selected best action self.probability_list[argq[0].item()] += 1 - self.cur_eps # Use random choice to decide between a random action / best action action = torch.tensor( [np.random.choice(self.action_list, p=self.probability_list)]) ########################### return action.item(), q.item() def optimize_model(self): """ Function to perform optimization on DL Network :return: Loss """ # Return if initial buffer is not filled. if len(self.replay_buffer.memory) < self.args.mem_init_size: return 0 if self.args.use_pri_buffer: batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample( self.args.batch_size, beta=self.beta_by_frame(self.t)) else: batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample( self.args.batch_size) batch_state = Variable( self.channel_first( torch.tensor(np.array(batch_state), dtype=torch.float32))) batch_action = Variable( torch.tensor(np.array(batch_action), dtype=torch.long)) batch_next_state = Variable( self.channel_first( torch.tensor(np.array(batch_next_state), dtype=torch.float32))) batch_reward = Variable( torch.tensor(np.array(batch_reward), dtype=torch.float32)) batch_done = Variable( torch.tensor(np.array(batch_done), dtype=torch.float32)) policy_max_q = self.policy_net(batch_state).gather( 1, batch_action.unsqueeze(1)).squeeze(1) if self.args.use_double_dqn: policy_ns_max_q = self.policy_net(batch_next_state) next_q_value = self.target_net(batch_next_state).gather( 1, torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1) target_max_q = next_q_value * self.args.gamma * (1 - batch_done) else: target_max_q = self.target_net(batch_next_state).detach().max( 1)[0].squeeze(0) * self.args.gamma * (1 - batch_done) # Compute Huber loss if self.args.use_pri_buffer: loss = (policy_max_q - (batch_reward + target_max_q.detach())).pow(2) * Variable( torch.tensor(weights, dtype=torch.float32)) prios = loss + 1e-5 loss = loss.mean() else: loss = self.loss(policy_max_q, batch_reward + target_max_q) # Optimize the model self.optimizer.zero_grad() loss.backward() # Clip gradients between -1 and 1 for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) if self.args.use_pri_buffer: self.replay_buffer.update_priorities(indices, prios.data.cpu().numpy()) self.optimizer.step() return loss.cpu().detach().numpy() def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # def train_fn(): self.t = 1 self.mode = "Random" train_start = time.time() if not self.args.load_dir == '': self.load_model() for i_episode in range(1, self.args.max_episodes + 1): # Initialize the environment and state start_time = 
time.time() state = self.env.reset() self.reward_list.append(0) self.loss_list.append(0) self.max_q_list.append(0) self.ep_len = 0 done = False # Save Model self.save_model(i_episode) # Collect garbage self.collect_garbage(i_episode) # Run the game while not done: # Update the target network, copying all weights and biases in DQN if self.t % self.args.target_update == 0: print("Updating target network . . .") self.target_net.load_state_dict( self.policy_net.state_dict()) # Select and perform an action self.cur_eps = max(self.args.eps_min, self.cur_eps - self.eps_delta) if self.cur_eps == self.args.eps_min: self.mode = 'Exploit' else: self.mode = "Explore" action, q = self.make_action(state) next_state, reward, done, _ = self.env.step(action) self.reward_list[-1] += reward self.max_q_list[-1] = max(self.max_q_list[-1], q) # Store the transition in memory self.replay_buffer.push(state, action, next_state, reward, done) self.meta.update_step(self.t, self.cur_eps, self.reward_list[-1], self.max_q_list[-1], self.loss_list[-1], self.cur_lr) # Increment step and Episode Length self.t += 1 self.ep_len += 1 # Move to the next state state = next_state # Perform one step of the optimization (on the target network) if self.ep_len % self.args.learn_freq == 0: loss = self.optimize_model() self.loss_list[-1] += loss self.loss_list[-1] /= self.ep_len # Decay Step: if self.args.lr_scheduler: self.cur_lr = self.scheduler.get_lr()[0] if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min: self.scheduler.step(i_episode) # Update meta self.meta.update_episode( i_episode, self.t, time.time() - start_time, time.time() - train_start, self.ep_len, len(self.replay_buffer.memory), self.cur_eps, self.reward_list[-1], np.mean(self.reward_list), self.max_q_list[-1], np.mean(self.max_q_list), self.loss_list[-1], np.mean(self.loss_list), self.mode, self.cur_lr) import multiprocessing as mp processes = [] for rank in range(4): p = mp.Process(target=train_fn) p.start() processes.append(p) for p in processes: p.join()