def __init__(self, num_episodes=NUM_EPISODES, trajectory_len=MAX_STEPS, clip_grad=True):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.num_episodes = num_episodes
    self.trajectory_len = trajectory_len
    self.replay = ReplayMemory(10000)
    self.steps_done = 0
    self.lr = 1e-3
    self.actor = Actor()
    self.optimizer = optim.Adam(self.actor.parameters(), lr=self.lr)
    self.clip_grad = clip_grad
    self.trajectory = []
    self.trajectories = []
    self.optimize_each = 5
    self.gamma = 0.99
    self.env = ChainAgent(
        inventory_level=10,
        fix_delay=1,
        max_num_steps=MAX_STEPS + 10,
        demand_generation_function=self.demand_generation_function)
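# NOTE (editor sketch): `Actor` is referenced in the __init__ above but not defined in this
# fragment. A minimal sketch of what it might look like, assuming a small MLP that maps a
# state vector to action scores; the layer sizes and the names state_dim / n_actions are
# assumptions, not taken from the original code.
import torch.nn as nn

class Actor(nn.Module):
    def __init__(self, state_dim=4, n_actions=2):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
        )

    def forward(self, x):
        # Return unnormalized action scores; the caller can apply a softmax if needed.
        return self.body(x)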
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name", default="deep_q_network", type=str, help="Type of model to use")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size")
    parser.add_argument("--map_shape", default=(15, 15), type=tuple, help="Map size")
    parser.add_argument("--num_actions", default=4, type=int, help="Number of pricing levels")
    parser.add_argument("--gamma", default=0.8, type=float, help="Discount factor")
    parser.add_argument("--alpha", default=0.0001, type=float, help="Learning rate")
    parser.add_argument("--epsilon", default=0.5, type=float, help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq", default=10000, type=int,
                        help="Frequency for copying weights to the target network")
    parser.add_argument("--num_iterations", default=5000000, type=int,
                        help="Number of overall interactions with the environment")
    parser.add_argument("--max_episode_length", default=200000, type=int,
                        help="Terminate an episode early after this many steps")
    parser.add_argument("--train_freq", default=4, type=int, help="Frequency for training")
    parser.add_argument("--num-burn-in", default=10000, type=int,
                        help="Number of memory samples collected before training")
    parser.add_argument("-o", "--output", default="ilocus-v0", type=str, help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train", default=True, type=bool, help="Train/Evaluate; set True to train the model")
    parser.add_argument("--model_path", default="atari-v0", type=str, help="Model path for evaluation")
    parser.add_argument("--max_grad", default=1.0, type=float, help="Parameter for the Huber loss")
    parser.add_argument("--log_dir", default="log", type=str, help="Log folder for evaluation results")
    parser.add_argument("--flip_coin", default=False, type=str, help="Whether to use double Q-learning")
    parser.add_argument("--eval_num", default=100, type=int, help="Number of evaluations to run")
    parser.add_argument("--save_freq", default=100000, type=int, help="Model save frequency")
    # memory related args
    parser.add_argument("--buffer_size", default=100000, type=int, help="Replay memory buffer size")
    parser.add_argument("--look_back_steps", default=4, type=int,
                        help="How many previous pricing tables will be fed into RL")
    args = parser.parse_args()

    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initiate the policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000, args.num_actions)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluating")
            exit(1)

        # specific log file to save the result
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            #     q_network_online = model_from_json(loaded_model_json)
            #     q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network, memory=memory, policy=policy, gamma=args.gamma,
                                 target_update_freq=args.target_update_freq, num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq, batch_size=args.batch_size)
        exit(0)

    '''Train the model'''
    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")

        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir; meant to raise an error when the dir exists, to avoid overwriting
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network, memory=memory, policy=policy, gamma=args.gamma,
                             target_update_freq=args.target_update_freq, num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq, batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)

        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)
        dqn_agent.fit(env=env, num_iterations=args.num_iterations, output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
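# NOTE (editor sketch): `mean_huber_loss`, used to compile the Q-network above, is not shown
# in this fragment. A minimal sketch of a mean Huber loss in TensorFlow 1.x style, assuming
# max_grad=1.0 as the quadratic/linear switch point (matching the --max_grad default); the
# exact signature used by the original project may differ.
import tensorflow as tf

def mean_huber_loss(y_true, y_pred, max_grad=1.0):
    """Mean Huber loss: quadratic for small errors, linear for large ones."""
    err = tf.abs(y_true - y_pred)
    quadratic = tf.minimum(err, max_grad)
    linear = err - quadratic
    return tf.reduce_mean(0.5 * tf.square(quadratic) + max_grad * linear)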
# initialize neural network to store policy
# load environment
env = gym.make('SpaceInvaders-v0')

# make the q_network
q_network_online = Sequential()
q_network_target = Sequential()

# make the preprocessors
history_preproc = HistoryPreprocessor(4)
atari_preproc = AtariPreprocessor()
preprocessor = PreprocessorSequence(atari_preproc, history_preproc)

# make the replay memory
memory = ReplayMemory()

# make the policy
policy = LinearDecayGreedyEpsilonPolicy(0, 0, 6, 0.8, 0.05, 100000)

# take the gamma, nicely
gamma = 0.99

# target_update_freq
target_update_freq = 10000

# num_burn_in : DUMMY
num_burn_in = 10

# train_freq : DUMMY
train_freq = 10
learning_rate = 1e-4
epsilon = 0.05
num_training_samples = int(5e6)
buffer_size = int(1e6)
target_update_freq = int(1e4)
batch_size = 32
num_burn_in = int(5e4)
train_freq = 1
nA = env.action_space.n

# create preprocessor class
preprocessor = AtariPreprocessor(84)
print('created preprocessor')

# create replay buffer
replay_buffer = ReplayMemory(buffer_size, history_length, 84)
print('created replay buffer')

# create DQN agent
agent = DQNAgent(DQN, preprocessor, replay_buffer, policy.GreedyEpsilonPolicy, gamma,
                 target_update_freq, num_burn_in, train_freq, batch_size,
                 history_length, nA, dtype, epsilon, model_name)
print('create DQN agent')

if mode == 'train':
    env = wrappers.Monitor(env, '/tmp/SpaceInvaders-DQN-expt-train.' + model_name, force=True)
    agent.fit(env, num_training_samples)
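# NOTE (editor sketch): the `policy.GreedyEpsilonPolicy` class passed to DQNAgent above is
# not shown in this fragment. A minimal sketch of an epsilon-greedy policy over Q-values;
# the constructor and select_action signatures are assumptions based on how it is used here.
import numpy as np

class GreedyEpsilonPolicy:
    def __init__(self, epsilon, num_actions):
        self.epsilon = epsilon
        self.num_actions = num_actions

    def select_action(self, q_values):
        # With probability epsilon pick a random action, otherwise the greedy one.
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(q_values))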
class LearnerDQN:
    '''
    Learner class - an abstraction which includes the configuration of the experiment,
    the necessary models, and all actions needed to conduct it.
    '''

    def __init__(self, clip_grad=True, num_episodes=50, trajectory_len=MAX_STEPS,
                 custom_func=None, custom_func_args=None):
        '''
        Initialization
        :param clip_grad: bool: flag for clipping gradients to the value 1
        :param num_episodes: number of episodes to run
        :param trajectory_len: maximal number of steps in each trajectory
        :param custom_func: custom reward function
        :param custom_func_args: custom reward function arguments
        '''
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build_nn()
        self.num_episodes = num_episodes
        self.trajectory_len = trajectory_len
        self.model = DQN()
        self.replay = ReplayMemory(10000)
        self.steps_done = 0
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.clip_grad = clip_grad
        self.rewards = []
        self.modules = []
        self.env = ChainAgent(inventory_level=10,
                              fix_delay=1,
                              max_num_steps=MAX_STEPS + 10,
                              demand_generation_function=self.demand_generation_function,
                              custom_func=custom_func,
                              custom_func_args=custom_func_args)

    def select_action(self, state):
        '''
        Implementation of the epsilon-greedy approach
        :param state: input state used to choose the appropriate action
        :return: Tensor: action
        '''
        state = torch.Tensor(state)[None, :]
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            # print('greedy')
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            # print('random')
            return torch.tensor([[random.randrange(N_ACTIONS)]], device=self.device, dtype=torch.long)

    def demand_generation_function(self):
        '''
        Default function to generate demand
        :return: int: demand level
        '''
        return np.random.randint(0, 10)

    def optimize_model(self):
        '''
        Method optimizing the parameters of the neural net
        :return:
        '''
        if len(self.replay) < BATCH_SIZE:
            return
        transitions = self.replay.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        # next_state_values = self.target_net(next_states).max(1)[0].detach()
        next_state_values = self.policy_net(next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        if self.clip_grad:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def build_nn(self):
        '''
        Building the torch graph
        :return:
        '''
        self.policy_net = DQN().to(self.device)
        # self.target_net = DQN().to(self.device)
        # self.target_net.load_state_dict(self.policy_net.state_dict())
        # self.target_net.eval()

    def get_stat(self, state, next, reward, action):
        '''
        Some visualisation
        :param state:
        :param next:
        :param reward:
        :param action:
        :return:
        '''
        print('=====')
        print('DEM: ', self.env.demand_next, 'ST: ', state, ' -> ', action)
        print('NXST: ', next, 'REW: ', reward)
        print('=====')
        print()

    def run(self):
        '''
        Main loop of training.
        Iterates over num_episodes * trajectory_len steps
        :return:
        '''
        for i_episode in range(self.num_episodes):
            state = self.env.reset()
            rewards = 0
            for step in range(self.trajectory_len):
                action = self.select_action(torch.Tensor(state))
                next_state, reward, done, _ = self.env.step(action.item())
                reward *= 1.
                rewards += reward
                reward = torch.tensor([reward], device=self.device)
                self.replay.push(torch.Tensor([state]), action, torch.Tensor([next_state]), reward)
                state = next_state
                self.optimize_model()
                if done:
                    break
            self.rewards.append(rewards)
            if i_episode % TARGET_UPDATE == 0:
                print(i_episode, ' : ', np.array(self.rewards[-100:]).mean())
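# NOTE (editor sketch): `Transition` and `ReplayMemory` are used by LearnerDQN but not
# defined in this fragment. A minimal sketch following the common PyTorch DQN pattern
# (a namedtuple plus a bounded cyclic buffer); the field names match how optimize_model()
# and run() use them, everything else is an assumption.
import random
from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition, evicting the oldest when the buffer is full.
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)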
class DQNAgent:
    """Class implementing DQN.

    This is a basic outline of the functions/parameters you will need
    in order to implement the DQNAgent. This is just to get you
    started. You may need to tweak the parameters, add new ones, etc.

    Parameters
    ----------
    q_network: keras.models.Model
      Your Q-network model.
    preprocessor: deeprl_hw2.core.Preprocessor
      The preprocessor class. See the associated classes for more details.
    memory: deeprl_hw2.core.Memory
      Your replay memory.
    gamma: float
      Discount factor.
    target_update_freq: float
      Frequency to update the target network. You can either provide a
      number representing a soft target update (see utils.py) or a hard
      target update (see utils.py and the Atari paper).
    num_burn_in: int
      Before you begin updating the Q-network your replay memory has to
      be filled up with some number of samples. This number says how many.
    train_freq: int
      How often you actually update your Q-network. Sometimes stability
      is improved if you collect a couple of samples for your replay
      memory for every Q-network update that you run.
    batch_size: int
      How many samples in each minibatch.
    """

    def __init__(self, q_network, q_network2, preprocessor: Preprocessor,
                 RLmemory: ReplayMemory, SLmemory: ReplayMemory, policy, gamma,
                 target_update_freq, num_burn_in, train_freq, batch_size,
                 algorithm='DoubleDQN', render=False):
        self.net = q_network
        self.net2 = q_network2
        self.pre = preprocessor
        self.rl_mem = RLmemory
        self.sl_mem = SLmemory
        self.policy = policy
        self.gamma = gamma
        self.renew = target_update_freq
        self.burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.algorithm = algorithm
        self.render = render

    def create_SL_model(self, state_shape, num_actions):
        model = Sequential()
        model.add(Dense(32, input_shape=(state_shape,), activation='relu'))
        model.add(Dense(num_actions, activation='softmax'))
        return model

    def compile(self, optimizer, loss_func):
        """Setup all of the TF graph variables/ops.

        This is inspired by the compile method on the keras.models.Model class.

        This is a good place to create the target network, set up your
        loss function and any placeholders you might need.

        You should use the mean_huber_loss function as your loss function.
        You can also experiment with MSE and other losses.

        The optimizer can be whatever class you want. We used the
        keras.optimizers.Optimizer class, specifically the Adam optimizer.
        """
        self.net.compile(optimizer=optimizer, loss=loss_func)
        self.net2.compile(optimizer=optimizer, loss=loss_func)

    def calc_q_values(self, state, net):
        """Given a state (or batch of states) calculate the Q-values.

        Basically run your network on these states.

        Return
        ------
        Q-values for the state(s)
        """
        # with tf.Session() as f:
        #     print(state.eval())
        q_value = net.predict(state, steps=32)
        return q_value

    def select_action(self, state, net, process='training'):
        """Select the action based on the current state.

        You will probably want to vary your behavior here based on which
        stage of training you're in. For example, if you're still
        collecting random samples you might want to use a
        UniformRandomPolicy.

        If you're testing, you might want to use a GreedyEpsilonPolicy
        with a low epsilon.

        If you're training, you might want to use the
        LinearDecayGreedyEpsilonPolicy.

        This would also be a good place to call process_state_for_network
        in your preprocessor.

        Returns
        --------
        selected action
        """
        assert process in ['sampling', 'testing', 'training'], 'Unsupported process.'

        epsilon = 0.1
        start_value = 1
        end_value = 0.1
        num_steps = 10 ** 6
        q_values = self.calc_q_values(state, net)
        if process == 'sampling':
            action = UniformRandomPolicy(len(q_values)).select_action()
        elif process == 'testing':
            action = GreedyEpsilonPolicy(epsilon).select_action(q_values)
        else:
            action = LinearDecayGreedyEpsilonPolicy(start_value, end_value, num_steps).select_action(q_values)
        return action

    def fit(self, env, num_iterations, max_episode_length=None):
        """Fit your model to the provided environment.

        It's a good idea to print out things like loss, average reward,
        Q-values, etc to see if your agent is actually improving.

        You should probably also periodically save your network weights
        and any other useful info.

        This is where you should sample actions from your network,
        collect experience samples and add them to your replay memory,
        and update your network parameters.

        Parameters
        ----------
        env: VehicleFollowingEnv
          This is your Atari environment. You should wrap the environment
          using the wrap_atari_env function in utils.py.
        num_iterations: int
          How many samples/updates to perform.
        max_episode_length: int
          How long a single episode should last before the agent resets.
          Can help exploration.
        """
        # state = self.pre.process_state_for_memory(env.reset())  # the preprocessor method is currently a no-op (pass)
        state = env.reset()  # get the initial state
        tmp = 0
        prev_action = np.zeros(4)  # initial previous action
        states = [state]
        state_ = np.zeros(4)
        for i in range(num_iterations):
            # env.render()
            if max_episode_length and i > max_episode_length:
                break
            if state_.all() <= 0:
                action = np.random.random(4)  # produce random weights for the initial state
            else:
                # state_ = tf.squeeze(state_)
                # state_ = tf.reshape(state_, 1)
                action = self.select_action(state_, net=self.net, process='testing')
            # print(action)
            # print('action', action)
            next_state, reward, done = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(i + 1))
                break
            # next_state = self.pre.process_state_for_memory(next_state)
            states.append(next_state)
            tmp += 1
            self.rl_mem.append(state, prev_action, reward, next_state, done)
            # if tmp >= 6:
            #     # frames = states[-5:-1]
            #     # frames2 = states[-4:]
            #     # state_ = tf.concat([tf.expand_dims(i, 2) for i in frames], 2)
            #     # next_state_ = tf.concat([tf.expand_dims(i, 2) for i in frames2], 2)
            #     print(state, next_state)
            #     # states = states[-5:]
            prev_action = action

            if i % self.renew == 0 and i != 0:
                # Hard target-network update: copy the online weights instead of
                # aliasing the model object (aliasing would make both nets identical forever).
                self.net2.set_weights(self.net.get_weights())

            if i != 0 and i % self.train_freq == 0:
                print('{}th iteration, {}th train starts.'.format(i, i // self.train_freq))
                batches = min(self.batch_size, len(self.rl_mem))
                current_states = []
                q_values = []
                for samples in self.rl_mem.sample(batches):
                    current_state, action, reward, next_state, is_done = [samples.state, samples.action,
                                                                          samples.reward, samples.next_state,
                                                                          samples.done]
                    # state = tf.reshape(tf.squeeze(current_state), 4)
                    # next_state = tf.reshape((tf.squeeze(current_state)), 4)
                    current_states.append(current_state)
                    target = reward
                    if not is_done:
                        if self.algorithm == 'NDQN':
                            target = reward + self.gamma * np.amax(self.net2.predict(next_state, steps=32)[0])
                        elif self.algorithm == 'DQN':
                            target = reward + self.gamma * np.amax(self.net.predict(next_state, steps=32)[0])
                        elif self.algorithm == 'DoubleDQN':
                            target = reward  # TODO
                        elif self.algorithm == 'DuelingDQN':
                            target = reward
                    print(state)
                    target_f = self.net.predict(states[-10:], steps=32)
                    print(len(target_f))
                    print(action)
                    target_f[action] = target
                    q_values.append(target_f)
                # current_states = tf.reshape(current_states, 4)
                # convert to arrays so .shape and net.fit below work (mirrors sample_from_Replay_Memory)
                current_states = np.reshape(current_states, (-1, DIM_STATES))
                q_values = np.reshape(q_values, (-1, 6))
                print(current_states.shape, q_values.shape)
                self.net.fit(current_states, q_values, steps_per_epoch=self.batch_size)

    def fit_nash(self, env: VehicleFollowingENV, num_iterations, episode, total_step, max_episode_length=None):
        """Fit with Nash Equilibrium."""
        # RL network: LSTM
        self.p1_net = self.net
        # target network
        self.p1_net2 = self.net2
        self.p2_net = deepcopy(self.net)
        self.p2_net2 = deepcopy(self.net2)

        # SL network: NN
        self.p1_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
        self.p2_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
        self.p1_policy.compile('Adam', categorical_crossentropy)
        self.p2_policy.compile('Adam', mse)

        # ReplayMemory
        self.p1_RL_mem = ReplayMemory(max_size=100000)
        self.p2_RL_mem = ReplayMemory(max_size=100000)
        self.p1_SL_mem = ReplayMemory(max_size=100000)
        self.p2_SL_mem = ReplayMemory(max_size=100000)

        # MainLoop
        state = env.reset()
        total_reward = 0
        done = False
        for i in range(num_iterations):
            total_step += 1
            # if self.render:
            #     env.render()
            if max_episode_length and i > max_episode_length:
                break
            if np.random.random() < ETA:
                best_response = True
            else:
                best_response = False
            if best_response:
                p1_action = self.select_action(state, net=self.p1_net)
                p2_action = self.select_action(state, net=self.p2_net)
            else:
                p1_action = self.select_action(state, net=self.p1_policy)
                p2_action = self.select_action(state, net=self.p2_policy)
            next_state, reward, done = env.step(action_weight=p1_action, action_attacker=p2_action)

            self.p1_RL_mem.append((state, p1_action, RC - reward, next_state, done))
            self.p2_RL_mem.append((state, p2_action, reward, next_state, done))
            self.p1_SL_mem.append((state, p1_action))
            self.p2_SL_mem.append((state, p2_action))

            total_reward += reward
            if done:
                with open(self.algorithm + 'total_reward.txt', 'a') as f:
                    f.write('Episode ({}), reward: ({})\n'.format(episode, total_reward))
                print("Episode finished after {} time steps, total_reward is {}...".format(i, total_reward))
                break

            if total_step % self.renew == 0 and total_step != 0:
                # Hard target-network update for both players (copy weights, do not alias).
                self.p1_net2.set_weights(self.p1_net.get_weights())
                self.p2_net2.set_weights(self.p2_net.get_weights())
            # if total_step % 100000 == 0:
            #     self.save(total_step)

            if total_step >= self.burn_in and total_step % self.train_freq == 0:
                batches = min(self.batch_size, len(self.p1_RL_mem))
                p1_states, p1_actions, p1_q_values = self.sample_from_Replay_Memory(batches, self.p1_RL_mem, self.p1_net)
                p2_states, p2_actions, p2_q_values = self.sample_from_Replay_Memory(batches, self.p2_RL_mem, self.p2_net)

                self.p1_net.fit(p1_states, p1_q_values)
                self.p2_net.fit(p2_states, p2_q_values)
                self.p1_policy.fit(p1_states, p1_actions)
                self.p2_policy.fit(p2_states, p2_actions)

            state = next_state
        return total_step, done

    def sample_from_Replay_Memory(self, batches, ReplayMemory, Net):
        current_states = []
        actions = []
        q_values = []
        for samples in ReplayMemory.sample(batches):
            state, action, reward, next_state, is_done = [samples.state, samples.action, samples.reward,
                                                          samples.next_state, samples.done]
            next_state = np.expand_dims(np.asarray(next_state).astype(np.float64), axis=0)
            current_states.append(state)
            actions.append(action)
            target = reward
            if not is_done:
                target = reward + self.gamma * np.amax(Net.predict(next_state)[0])
            target_f = Net.predict(state)[0]
            target_f[action] = target
            q_values.append(target_f)
        current_states = np.reshape(current_states, (-1, DIM_STATES))
        q_values = np.reshape(q_values, (-1, NUM_ACTIONS))
        return current_states, actions, q_values

    def evaluate(self, env, num_episodes, max_episode_length=None):
        """Test your agent with a provided environment.

        You shouldn't update your network parameters here. Also, if you
        have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test mode.

        Basically run your policy on the environment and collect stats
        like cumulative reward, average episode length, etc.

        You can also call the render function here if you want to
        visually inspect your policy.
        """
        for i in range(num_episodes):
            total = 0
            state = np.zeros(4)
            tmp = 0
            prev_action = 0
            states = [state]
            state_ = -1
            while True:
                if max_episode_length and i > max_episode_length:
                    break
                if state_ == -1:
                    action = np.random.randint(6)
                else:
                    action = self.select_action(state_, net=self.net)
                next_state, reward, done, _ = env.step(action)
                if tmp < 6:
                    # next_state = self.pre.process_state_for_memory(next_state)
                    states.append(next_state)
                    tmp += 1
                if tmp >= 6:
                    # frames = states[-5:-1]
                    # frames2 = states[-4:]
                    # state_ = tf.concat([tf.expand_dims(i, 2) for i in frames], 2)
                    # next_state_ = tf.concat([tf.expand_dims(i, 2) for i in frames2], 2)
                    self.rl_mem.append(state, prev_action, reward, next_state, done)
                    states.append(state)
                    states = states[-5:]
                prev_action = action
                state = next_state
                total += reward
            print('Episode {}, total reward is {}'.format(i, total))
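# NOTE (editor sketch): the 'DoubleDQN' branch in DQNAgent.fit() above is left as a TODO.
# The Double DQN target selects the argmax action with the online network and evaluates it
# with the target network. A minimal sketch of that computation, assuming `online_net` and
# `target_net` are Keras models like self.net / self.net2 used above:
import numpy as np

def double_dqn_target(reward, next_state, gamma, online_net, target_net):
    # a* = argmax_a Q_online(s', a);  target = r + gamma * Q_target(s', a*)
    online_q = online_net.predict(next_state)[0]
    target_q = target_net.predict(next_state)[0]
    best_action = int(np.argmax(online_q))
    return reward + gamma * target_q[best_action]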
        args.num_atoms, -10, 10)
else:
    if args.dueling:
        q_net_builder = model.build_dueling_network
    else:
        q_net_builder = model.build_basic_network

    q_net = q_net_builder(args.num_frames, args.frame_size, train_env.num_actions,
                          args.noisy_net, args.sigma0, args.net)

q_net.cuda()
agent = dqn.DQNAgent(q_net, args.double_dqn, train_env.num_actions)

if args.noisy_net:
    train_policy = GreedyEpsilonPolicy(0, agent)
else:
    train_policy = LinearDecayGreedyEpsilonPolicy(args.train_start_eps,
                                                  args.train_final_eps,
                                                  args.train_eps_num_steps,
                                                  agent)
eval_policy = GreedyEpsilonPolicy(args.eval_eps, agent)

replay_memory = ReplayMemory(args.replay_buffer_size)
replay_memory.burn_in(train_env, agent, args.burn_in_frames)

evaluator = lambda logger: evaluate(eval_env, eval_policy, 10, logger)
train(agent, train_env, train_policy, replay_memory,
      args.gamma, args.batch_size, args.num_iters,
      args.frames_per_update, args.frames_per_sync, args.frames_per_eval,
      evaluator, args.output)
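# NOTE (editor sketch): LinearDecayGreedyEpsilonPolicy appears in several fragments above
# with different constructor signatures, but its implementation is never shown. The core
# idea is a linearly decaying exploration rate; a minimal, standalone sketch of that
# schedule (function name and arguments are illustrative, not from the original code):
def linear_decay_epsilon(step, start_eps, final_eps, num_steps):
    """Interpolate epsilon linearly from start_eps to final_eps over num_steps, then hold."""
    frac = min(step / float(num_steps), 1.0)
    return start_eps + frac * (final_eps - start_eps)

# e.g. linear_decay_epsilon(0, 1.0, 0.1, 10**6) -> 1.0
#      linear_decay_epsilon(10**6, 1.0, 0.1, 10**6) -> 0.1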