def fitness(self, x):
    # Returns fitness of a given individual.
    # To be implemented in subclasses.
    N = math.floor(x[-4])
    env = self.CHSH(self.n_questions, self.game_type, self.max_gates,
                    reward_function=x[-2], anneal=True)

    if self.agent_type == BasicAgent:
        agent = BasicAgent(state_size=len(env.repr_state), action_size=len(self.ALL_POSSIBLE_ACTIONS),
                           gamma=x[0], eps=x[1], eps_min=x[2], eps_decay=x[3],
                           alpha=x[4], momentum=x[5],
                           ALL_POSSIBLE_ACTIONS=self.ALL_POSSIBLE_ACTIONS, model_type=LinearModel)
        scaler = get_scaler(env, N, ALL_POSSIBLE_ACTIONS=ALL_POSSIBLE_ACTIONS)
    else:
        # transform actions to a non-correlated one-hot encoding
        encoder = OneHotEncoder(drop='first', sparse=False)
        # transform data
        onehot = encoder.fit_transform(ALL_POSSIBLE_ACTIONS)
        onehot_to_action = dict()
        action_to_onehot = dict()
        for a, a_encoded in enumerate(onehot):
            onehot_to_action[str(a_encoded)] = a
            action_to_onehot[a] = str(a_encoded)

        HIDDEN_LAYERS = x[-3]
        agent = DQNAgent(state_size=env.state_size, action_size=len(ALL_POSSIBLE_ACTIONS),
                         gamma=x[0], eps=x[1], eps_min=x[2], eps_decay=x[3],
                         ALL_POSSIBLE_ACTIONS=self.ALL_POSSIBLE_ACTIONS, learning_rate=x[4],
                         hidden_layers=len(HIDDEN_LAYERS), hidden_dim=HIDDEN_LAYERS,
                         onehot_to_action=onehot_to_action, action_to_onehot=action_to_onehot)
        scaler = None

    game = Game(scaler, batch_size=x[-1])
    game.evaluate_train(N, agent, env)
    fitness_individual = game.evaluate_test(agent, env)
    return fitness_individual

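# A small, self-contained sketch of the one-hot action mapping built in the else-branch above,
# assuming ALL_POSSIBLE_ACTIONS is a 2-D list with one action label per row (the shape sklearn's
# OneHotEncoder expects). The action labels below are illustrative only; the two dicts mirror
# onehot_to_action / action_to_onehot by translating between an action index and the string form
# of its (drop='first') one-hot vector.
from sklearn.preprocessing import OneHotEncoder

ACTIONS = [['xxr0'], ['xxr90'], ['yyr0'], ['yyr90']]        # illustrative action labels
encoder = OneHotEncoder(drop='first', sparse_output=False)  # use sparse=False on older sklearn
onehot = encoder.fit_transform(ACTIONS)

onehot_to_action = {str(row): i for i, row in enumerate(onehot)}
action_to_onehot = {i: str(row) for i, row in enumerate(onehot)}

# With 4 actions and drop='first', each encoding has 3 columns, e.g.
# action_to_onehot[0] == '[0. 0. 0.]' and action_to_onehot[2] == '[0. 1. 0.]'
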
def __init__(self, env_name, dqn_variant='nature_dqn', mode='train'):
    """
    Classic Control class for training and testing the classic control problems of gym.

    :param env_name: gym environment name (e.g. CartPole)
    :param dqn_variant: which DQN variant to use (e.g. Nature, Double, Prioritized)
    :param mode: either 'train' or 'test'
    """
    self.env_name = env_name
    self.env = gym.make(self.env_name)
    self.action_size = self.env.action_space.n

    if dqn_variant == "nature_dqn":
        self.rl_agent = DQNAgent(self.action_size, environment_type='atari', mode=mode,
                                 min_replay_buffer_size=5000, update_target_network_after=5000)
    elif dqn_variant == "double_dqn":
        self.rl_agent = DoubleDQNAgent(self.action_size, environment_type='atari', mode=mode,
                                       min_replay_buffer_size=5000, update_target_network_after=5000)
    elif dqn_variant == "prioritized_dqn":
        self.rl_agent = PrioritizedDoubleDQNAgent(self.action_size, environment_type='atari', mode=mode,
                                                  min_replay_buffer_size=5000,
                                                  update_target_network_after=5000)

    self.save_model_frequency = 20
    self.total_episode_counter = 0
    self.total_action_counter = 0
    self.episode_reward = 0

    if LOGGING:
        reward_log_dir = 'logs/gradient_tape/' + dqn_variant + '_' + mode + '/' + current_time + 'reward'
        self.reward_writer = tf.summary.create_file_writer(reward_log_dir)
        self.reward_metric = DQNMetric()

    if mode == 'train':
        self.train()
    elif mode == 'test':
        self.test()

)

# agents
agents = {}
agents['DQN'] = DQNAgent(num_of_clusters, num_of_clusters * 6,
                         learning_rate=0.01,
                         reward_decay=0.9,

                         # Epsilon greedy
                         e_greedy_min=(0.0, 0.1),
                         e_greedy_max=(0.2, 0.8),
                         e_greedy_init=(0.1, 0.5),
                         e_greedy_increment=(0.005, 0.01),
                         e_greedy_decrement=(0.005, 0.001),

                         history_size=50,
                         dynamic_e_greedy_iter=25,
                         reward_threshold=3,
                         explore_mentor='LRU',

                         replace_target_iter=100,
                         memory_size=10000,
                         batch_size=128,

                         output_graph=False,
                         verbose=0
                         )

for (name, agent) in agents.items():
    print("-------------------- %s --------------------" % name)
    step = 0

epsi = [1.0]
epsi_decay = [0.05, 0.01, 0.005, 0.001]

reward = 0
XlearnRate = 0
Xdiscount = 0
Xepsi = 0
Xdec = 0
rewards = []

for j in learnRates:
    for k in discounts:
        for ep in epsi:
            for epdec in epsi_decay:
                params = [episode_count, j, k, ep, epdec]
                # params = [5000, 0.001, 0.95, 1.1, 0.005]
                # params = [5000, 0.0005, 0.99, 0.1]
                agent = DQNAgent(env.action_space, env.observation_space, params)
                agent._render = False
                rewardList, stepList = agent.train(env)
                rewards.append(rewardList)
                if reward < sum(rewards[-1]) / episode_count:
                    reward = sum(rewards[-1]) / episode_count
                    mytrainedAgent = agent
                    XlearnRate = j
                    Xdiscount = k
                    Xepsi = ep
                    Xdec = epdec
                plt.plot(low_pass(rewardList), label=("Test: " + str(params)))

## Train Agent

os.remove(f)

# Config
config = Config()
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Special Configuration
config.SIGMA_INIT = 0.0
config.N_STEPS = 3

# Env
env = PrepareAtariEnv(env_id, log_dir)

# Agent
agent = DQNAgent(config, env, log_dir, static_policy=False)

# Begin Interaction & Learning
episode_reward = 0
observation = env.reset()
for frame_idx in tqdm(range(1, config.MAX_FRAMES + 1)):
    # Prepare to explore
    eps = agent.epsilon_by_frame(frame_idx)
    # Explore or Exploit
    action = agent.get_action(observation, eps)
    agent.save_action(action, frame_idx)
    # Execute

class DQNController(object):

    def __init__(self, **kwargs):
        args = kwargs.get('args')
        # Number of steps of training before training network's weights are
        # copied to target network (C)
        self.copy_steps = 10000
        # Number of frames to be stacked for a state representation (m)
        self.stack_num = 4
        # Number of times actions are to be repeated (k)
        self.repeat_action = 1
        # Size of minibatch
        self.minibatch_size = 32
        # Lower than this, epsilon is kept constant
        self.min_epsilon = 0.1
        # Epsilon's starting value
        self.max_epsilon = 1.0
        self.epsilon = 1.0
        # Number of steps to anneal epsilon
        self.anneal_till = 1000000
        # Discount factor
        self.discount = 0.99
        # Variable that holds the current Environment
        self.environment = AtariEnvironment(args=args)
        self.action_space = self.environment.getPossibleActions()
        # For how long should the network observe before playing?
        self.observation_time_steps = 50000
        # The network
        self.network = DQNAgent(self.action_space, self.discount, args)
        self.train_frequency = 4
        self.record_frequency = 10000
        # The current state of the environment (stacked)
        self.current_state = deque(maxlen=self.stack_num)
        self.current_state.append(self.environment.getObservation())
        # Experience replay
        self.memory_limit = 50000
        self.experience_replay = ExperienceReplay(self.memory_limit, (84, 84),
                                                  self.minibatch_size, self.stack_num)
        # Maximum no-ops
        self.num_no_op = 0
        self.max_no_op = 30
        self.steps = 0
        self.num_epochs = 120
        self.train_steps_per_epoch = 250000
        self.num_test_epochs = 10
        self.test_steps_per_epoch = 1000

    def __anneal_epsilon__(self):
        self.epsilon = max(
            self.epsilon - ((self.max_epsilon - self.min_epsilon) / self.anneal_till),
            self.min_epsilon)
        return

    def __sample_epsilon_action__(self):
        action = None
        if random.random() < self.epsilon:
            action = self.environment.sampleRandomAction()
        else:
            # Use the current state of the emulator and predict an action which gets
            # added to replay memory (use playing_network)
            q_values = self.network.predict(self.experience_replay.getCurrentState())
            action = np.argmax(q_values, axis=1)[0]
        return action

    def __supply_action_to_environment__(self, action):
        self.environment.performAction(action)
        # Add current state, action, reward, consequent state to experience replay
        self.experience_replay.add((self.environment.getObservation(), action,
                                    self.environment.getReward(),
                                    self.environment.isTerminalState()))
        return

    def __observe__(self):
        observe_start = time.time()
        for _ in xrange(self.observation_time_steps):
            action = self.environment.sampleRandomAction()
            self.__supply_action_to_environment__(action)
        observe_duration = time.time() - observe_start
        logger.info('Finished observation. Steps=%d; Time taken=%.2f',
                    self.observation_time_steps, observe_duration)

    def isGameOver(self):
        if self.environment.isTerminalState():
            self.environment.reset()
            for _ in xrange(self.stack_num):
                action = self.environment.sampleRandomAction()
                self.__supply_action_to_environment__(action)

    def run(self):
        """This method will be called from the main() method."""
        # Observe the game by randomly sampling actions from the environment
        # and performing those actions
        self.__observe__()
        for i in xrange(self.num_epochs):
            self.environment.resetStatistics()
            time_now = time.time()
            for j in xrange(self.train_steps_per_epoch):
                # Get action using epsilon-greedy strategy
                action = self.__sample_epsilon_action__()
                # Perform action based on epsilon-greedy search and store the transitions
                # in experience replay
                self.__supply_action_to_environment__(action)
                # If the environment is in the terminal state, reset the environment, and
                # perform self.stack_num actions to reset the environment
                self.isGameOver()
                if j % self.train_frequency == 0:
                    # print "Started training"
                    # Sample minibatch of size self.minibatch_size from experience replay
                    minibatch = self.experience_replay.sample()
                    minibatch_states, minibatch_action, minibatch_reward, minibatch_next_states, \
                        minibatch_terminals = minibatch
                    cost = self.network.train_network(minibatch_states,
                                                      minibatch_action,
                                                      minibatch_reward,
                                                      minibatch_terminals,
                                                      minibatch_next_states)
                if j % self.record_frequency == 0:
                    total_score, num_games = self.environment.getStatistics()
                    avg_score = total_score / num_games
                    self.network.record_average_qvalue(
                        self.experience_replay.getCurrentState(),
                        i * self.train_steps_per_epoch + j,
                        self.epsilon, avg_score)
                # Epsilon annealing
                self.__anneal_epsilon__()
                # if self.time_step % 1000 == 0:
                #     print "Cost at iteration", self.time_step, " is", cost
                #     print "Value of epsilon is", self.epsilon
                self.steps += 1
                if j % self.copy_steps == 0:
                    self.network.copy_weights()
            total_score, num_games = self.environment.getStatistics()
            time_taken = (time.time() - time_now)
            logger.info("Finished epoch %d: Steps=%d; Time taken=%.2f", i, j, time_taken)
            logger.info("\tNumber of games: %d; Average reward: %.2f", num_games,
                        (total_score / num_games))
            logger.info("\tFinal epsilon value for epoch: %f", self.epsilon)
            self.network.create_checkpoint()

    def run_testing_stage(self):
        for i in xrange(self.num_test_epochs):
            self.environment.resetStatistics()
            for _ in xrange(self.stack_num):
                action = self.environment.sampleRandomAction()
                self.__supply_action_to_environment__(action)
            for j in xrange(self.test_steps_per_epoch):
                q_values = self.network.predict(self.experience_replay.getCurrentState())
                action = np.argmax(q_values, axis=1)[0]
                self.environment.performAction(action)

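# A minimal, standalone sketch of the linear epsilon-annealing schedule implemented by
# __anneal_epsilon__ above: epsilon drops from max_epsilon to min_epsilon over anneal_till
# decay steps and is then held constant. Function and parameter names here are illustrative,
# not taken from the original repo.
def linear_epsilon(step, max_epsilon=1.0, min_epsilon=0.1, anneal_till=1000000):
    """Return the value of epsilon after `step` decay updates."""
    decayed = max_epsilon - step * (max_epsilon - min_epsilon) / anneal_till
    return max(decayed, min_epsilon)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(500000) == 0.55, linear_epsilon(2000000) == 0.1
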
from agents.DQNAgent import DQNAgent
from utils.Environments import DiscEnv
import matplotlib.pyplot as plt
import numpy as np

num_act = 3
input_dim = 1
batch_size = 100

sender = DQNAgent(input_dim=input_dim, num_actions=num_act, batch_size=batch_size, lr=0.01,
                  eps_start=0.99, intermed_nodes=num_act, eps_min=0.01, eps_dec=5e-5,
                  capacity=7000)  # 5
receiver = DQNAgent(input_dim=input_dim, num_actions=num_act, batch_size=batch_size, lr=0.01,
                    eps_start=0.99, intermed_nodes=num_act, eps_min=0.01, eps_dec=5e-5,
                    capacity=7000)

env = DiscEnv(num_obs=num_act, num_actions=num_act)

returns = []
print("sender action probabilities")
for s in range(num_act):

eval_env = DoudizhuEnv(config)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# initialize random agent for evaluation
random_agent = RandomAgent(action_num=eval_env.action_num)
rule_agent = DouDizhuRuleAgentV1()

# initialize DQN agents
dqn_agents = []
for i in range(env.player_num):
    dqn_agents.append(DQNAgent(num_actions=env.action_num,
                               state_shape=env.state_shape,
                               lr=.000001,
                               use_conv=True,
                               dueling=False,
                               soft_update=True))

env.set_agents(dqn_agents)
eval_env.set_agents([dqn_agents[0], rule_agent, rule_agent])
print(dqn_agents[0].q_net)

eval_every = 500
eval_num = 1000
episode_num = 100_000

log_dir = './experiments/dqn_conv/'
logger = Logger(log_dir)
save_dir = './experiments/dqn_conv/models'

evaluation_tactic = [[1, 0, 0, 1],
                     [1, 0, 0, 1],
                     [1, 0, 0, 1],
                     [0, 1, 1, 0]]
max_gates = 10
round_to = 3
env = Environment(n_questions, evaluation_tactic, max_gates)

# (state_size, action_size, gamma, eps, eps_min, eps_decay, alpha, momentum)
# agent = BasicAgent(state_size=len(env.repr_state), action_size=len(ALL_POSSIBLE_ACTIONS), gamma=0.1,
#                    eps=1, eps_min=0.01, eps_decay=0.9998, alpha=0.001, momentum=0.9,
#                    ALL_POSSIBLE_ACTIONS=ALL_POSSIBLE_ACTIONS, model_type=LinearModel)

hidden_dim = [len(env.repr_state), len(env.repr_state) // 2]
# agent = DQNAgent(state_size=len(env.repr_state), action_size=len(ALL_POSSIBLE_ACTIONS), gamma=0.1,
#                  eps=1, eps_min=0.01, eps_decay=0.9998, ALL_POSSIBLE_ACTIONS=ALL_POSSIBLE_ACTIONS,
#                  learning_rate=0.001, hidden_layers=len(hidden_dim), hidden_dim=hidden_dim)
# scaler = get_scaler(env, N, ALL_POSSIBLE_ACTIONS, round_to=round_to)

batch_size = 128

# store the final value of the portfolio (end of episode)
game = Game(round_to=round_to)
portfolio_value, rewards = game.evaluate_train(N, agent, env)

# plot relevant information
NonLocalGame.show_plot_of(rewards, "reward")

if agent.model.losses is not None:
    NonLocalGame.show_plot_of(agent.model.losses, "loss")

def DQN_Exploration(args, log_dir, device, initial_state):
    env = NqubitEnvDiscrete(args.nbit, initial_state)
    # env.get_easy_T() remains to be done
    agent = DQNAgent(args, env, log_dir, device)
    writer = SummaryWriter(log_dir)
    Temp = args.Temp
    totalstep = 0
    epsilon = 1.0
    obs = env.reset()
    print('initial_reward{0}'.format(env.get_current_threshold(obs)))

    for episode in tqdm(range(args.num_episodes)):
        Temp = Temp * 10.0**(-0.1)
        obs = env.reset()
        for step in tqdm(range(args.episode_length)):
            # choose an action number with a large step size (action is an int)
            action = agent.get_action(obs, epsilon)
            # execute the large step size if it satisfies the strong constraint
            next_obs, reward, done, info = env.step(obs, action, args.action_delta)
            # agent.buffer.push((obs, action, reward, next_obs))
            # judge the effect of the large action step size
            # if ep == 0: the large step size is useless
            ep, action_delta = agent.prob(obs, next_obs, action)
            accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
            u = random.random()
            if u <= accept_probability:
                # take a small step size
                # agent.buffer.push((obs, action, reward, next_obs))
                next_obs, reward, done, info = env.step(obs, action, action_delta)
            else:
                # No operation, the transition will be (obs, 0, reward, obs)
                action = 0
                next_obs, reward, done, info = env.step(obs, action, action_delta)

            # record
            writer.add_scalar('threshold_rew', reward, totalstep)
            agent.buffer.push((obs, action, reward, next_obs))

            if (totalstep > args.learn_start_steps) and (totalstep % args.update_freq == 0):
                loss = agent.update()
                writer.add_scalar('loss', loss, totalstep)

            epsilon = agent.epsilon_by_step(totalstep)
            if epsilon < args.epsilon_min:
                epsilon = args.epsilon_min

            obs = next_obs
            totalstep += 1
            if reward >= -1.0:
                return reward, obs

            # Test_DQN_Agent
            if totalstep % args.test_freq == 0:
                test_epsilon = 0.0
                test_obs = env.reset()
                # T = env.get_easy_T(args.nbits)
                reward_recorder = -2.0
                obs_recorder = test_obs
                for step in range(args.test_step):
                    test_action = agent.get_action(test_obs, test_epsilon)
                    # execute the large step size
                    test_next_obs, reward, done, info = env.step(test_obs, test_action,
                                                                 args.action_delta)
                    # judge the effect of the large action step size
                    ep, action_delta = agent.prob(test_obs, test_next_obs, test_action)
                    accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
                    u = random.random()
                    if u <= accept_probability:
                        # take a small step size
                        test_next_obs, reward, done, info = env.step(test_obs, test_action,
                                                                     action_delta)
                    else:
                        action = 0
                        test_next_obs = test_obs
                        reward = env.get_current_threshold(test_obs)
                    if reward > reward_recorder:
                        reward_recorder = reward
                        obs_recorder = test_next_obs
                    if reward >= -1.0:
                        return reward, test_obs
                    agent.buffer.push((test_obs, action, reward, test_next_obs))
                    test_obs = test_next_obs

                writer.add_scalar('test_max_reward', reward_recorder, totalstep)
                writer.add_scalars('solution', {
                    's0': obs_recorder[0],
                    's1': obs_recorder[1],
                    's2': obs_recorder[2],
                    's3': obs_recorder[3],
                    's4': obs_recorder[4],
                    's5': obs_recorder[5]
                }, totalstep)

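# A minimal, self-contained sketch of the Metropolis-style acceptance rule used in the loop
# above: a proposed move is always accepted when its score difference `ep` is positive, and
# otherwise accepted with probability exp(ep / Temp), while Temp shrinks by a factor of
# 10**(-0.1) per episode. `score_diff` and `temperature` are illustrative names, not from the repo.
import math
import random

def accept_move(score_diff, temperature):
    """Return True if a proposed move should be accepted."""
    if score_diff > 0:
        return True
    return random.random() <= math.exp(score_diff / temperature)

# Example: a slightly worse move is accepted often at high temperature, rarely at low temperature.
# accept_move(-0.5, 10.0) -> True with probability ~0.95
# accept_move(-0.5, 0.1)  -> True with probability ~0.007
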
class NFSPAgent:
    """
    Parameters:
        num_actions (int)         : how many possible actions
        state_shape (list)        : tensor shape of state
        sl_hidden_layers (list)   : hidden layer sizes to use for average policy net for supervised learning
        rl_hidden_layers (list)   : hidden layer sizes to use for best response net for reinforcement learning
        sl_lr (float)             : learning rate to use for training average policy net
        rl_lr (float)             : learning rate to use for training action value net
        batch_size (int)          : batch sizes to use when training networks
        rl_memory_size (int)      : max number of experiences to store in reinforcement learning memory buffer
        sl_memory_size (int)      : max number of experiences to store in supervised learning memory buffer
        q_update_every (int)      : how often to copy parameters to target network
        epsilons (list)           : list of epsilon values to use over training period
        epsilon_decay_steps (int) : how often should we decay epsilon value
        eta (float)               : anticipatory parameter for NFSP
        gamma (float)             : discount parameter
        device (torch.device)     : device to put models on
    """

    def __init__(self,
                 # these are the parameters in the NFSP paper
                 scope,
                 num_actions,
                 state_shape,
                 sl_lr=.005,
                 rl_lr=.1,
                 batch_size=256,
                 train_every=128,
                 sl_memory_init_size=1000,
                 sl_memory_size=int(2e6),
                 q_train_every=128,
                 epsilon_decay_steps=int(1e5),
                 eta=.2,
                 gamma=.99,
                 device=None):
        if device is None:
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
        self.scope = scope
        self.num_actions = num_actions
        self.state_shape = state_shape
        self.rl_lr = rl_lr
        self.sl_lr = sl_lr
        self.batch_size = batch_size
        self.train_every = train_every
        self.discount_factor = gamma
        self.sl_memory_init_size = sl_memory_init_size
        self.q_train_every = q_train_every
        self.anticipatory_param = eta
        self.device = device
        self.use_raw = False
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilons = np.linspace(0.08, 0.0, epsilon_decay_steps)
        # self.epsilons = np.linspace(1.0, 0.1, epsilon_decay_steps)

        # average policy can be modeled as a Deep Q Network and we take softmax after final layer
        self.average_policy = AveragePolicyNet(state_shape=state_shape,
                                               num_actions=num_actions,
                                               use_conv=True).to(self.device)
        self.average_policy.eval()

        # action value and target network are Deep Q Networks
        self.rl_agent = DQNAgent(state_shape=self.state_shape,
                                 num_actions=self.num_actions,
                                 lr=self.rl_lr,
                                 batch_size=128,
                                 train_every=64,
                                 epsilons=self.epsilons)

        # initialize optimizers
        # In the paper: SGD optimizer, eta = 0.1, rl_lr = 0.1, sl_lr = 0.005,
        # epsilon decayed from 0.06 to 0.
        self.sl_optim = torch.optim.Adam(self.average_policy.parameters(), lr=self.sl_lr)

        # initialize memory buffers
        self.sl_buffer = ReservoirMemoryBuffer(sl_memory_size, batch_size)

        # current policy
        self.policy = None
        self.softmax = torch.nn.Softmax(dim=1)
        self.timestep = 0

        # for plotting
        self.loss = 0
        self.actions = []
        self.predictions = []

    def set_policy(self, policy=None):
        """
        Set policy parameter

        Input:
            policy (str) : policy to use. Set according to the anticipatory parameter by default.

        Output:
            None, sets policy parameter
        """
        # set policy according to string
        if policy and policy in ['average_policy', 'best_response', 'greedy_average_policy']:
            self.policy = policy
        else:
            self.policy = 'best_response' if np.random.uniform() <= self.anticipatory_param else 'average_policy'
        return self.policy

    def ap_pick_action(self, state):
        """
        Pick an action given a state using the average policy network

        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state

        Output:
            action (int) : integer representing action id
        """
        with torch.no_grad():
            state_obs = torch.FloatTensor(state['obs']).unsqueeze(0).to(self.device)
            q_values = self.average_policy(state_obs)[0].cpu().detach().numpy()
            probs = remove_illegal(q_values, state['legal_actions'])
            action = np.random.choice(self.num_actions, p=probs)
            # print('sl: ', action)
            # print(q_values, action)
            return action, probs

    def greedy_ap_pick_action(self, state):
        """
        Pick an action greedily given a state using the average policy network

        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state

        Output:
            action (int) : integer representing action id
        """
        with torch.no_grad():
            state_obs = torch.FloatTensor(state['obs']).unsqueeze(0).to(self.device)
            q_values = self.average_policy(state_obs)[0].cpu().detach().numpy()
            probs = remove_illegal(q_values, state['legal_actions'])
            action = np.argmax(probs)
            return action, probs

    def step(self, state):
        """
        Given a state, produce an action to generate training data.
        Chooses the action according to the set policy parameter.

        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state

        Output:
            action (int) : integer representing action id
        """
        if self.policy == 'average_policy':
            action = self.ap_pick_action(state)[0]
        elif self.policy == 'best_response':
            action = self.rl_agent.step(state)
        return action

    def eval_step(self, state):
        """
        Pick an action given a state according to the set policy. This is used during
        evaluation, so there is no epsilon greedy. Calls the greedy or average policy
        pick-action methods, or the RL agent, to actually select the action.

        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state

        Output:
            action (int) : integer representing action id
            probs (np.array) : softmax distribution over the actions
        """
        if self.policy == 'average_policy':
            action, probs = self.ap_pick_action(state)
        elif self.policy == 'greedy_average_policy':
            action, probs = self.greedy_ap_pick_action(state)
        elif self.policy == 'best_response':
            action, probs = self.rl_agent.eval_step(state)
        self.actions.append(action)
        return action, probs

    def add_transition(self, transition):
        """
        Add transition to our memory buffers and train the networks one batch.

        Input:
            transition (tuple) : tuple representation of a transition --> (state, action, reward, next state, done)

        Output:
            Nothing. Stores transition in the buffers, updates networks using memory buffers,
            and updates target network depending on what timestep we're at.
        """
        state, action, reward, next_state, done = transition
        self.rl_agent.add_transition(transition)
        self.timestep += 1

        if self.policy == 'best_response':
            # this version saves the predicted action from dqn_eval instead of the action
            # that was taken by the agent.
            self.sl_buffer.add_sa(state['obs'], action)

        if len(self.sl_buffer.memory) >= self.sl_memory_init_size and self.timestep % self.train_every == 0:
            sl_loss = self.train_sl()
            print(f'\rAgent {self.scope}, step: {self.timestep}, sl_loss on batch: {sl_loss}', end='')
            # print(f'step: {self.timestep} average policy updated')

    def train_sl(self):
        """
        Samples from the supervised learning memory buffer and trains the average policy
        network one step.

        Input:
            Nothing. Draws a sample from the sl buffer to train the network.

        Output:
            loss (float) : loss on training batch
        """
        samples = self.sl_buffer.sample()
        states = [s[0] for s in samples]
        actions = [s[1] for s in samples]

        self.average_policy.train()
        self.sl_optim.zero_grad()

        # [batch, state_shape(450)]
        states = torch.FloatTensor(states).to(self.device)
        # [batch, 1]
        actions = torch.LongTensor(actions).to(self.device)

        # optimizing the log-prob of past actions taken, as in the NFSP paper
        # [batch, action_num(309)]
        probs = self.average_policy(states)
        # [batch, 1]
        prob = probs.gather(1, actions.unsqueeze(1)).squeeze(1)

        # add a small eps to torch.log() to avoid nan in the log_prob
        eps = 1e-7
        log_prob = torch.log(prob + eps)
        # look into torch.nll_loss
        loss = -log_prob.mean()
        loss.backward()
        self.sl_optim.step()

        self.average_policy.eval()
        return loss.item()

    def save_state_dict(self, file_path):
        """
        Save state dict for networks of NFSP agent

        Input:
            file_path (str) : string filepath to save agent at
        """
        state_dict = dict()
        state_dict['average_policy'] = self.average_policy.state_dict()
        state_dict['dqn_net'] = self.rl_agent.q_net.state_dict()
        state_dict['dqn_target'] = self.rl_agent.target_net.state_dict()
        torch.save(state_dict, file_path)

    def load_from_state_dict(self, filepath):
        """
        Load agent parameters from filepath

        Input:
            filepath (str) : string filepath to load parameters from
        """
        state_dict = torch.load(filepath, map_location=self.device)
        self.average_policy.load_state_dict(state_dict['average_policy'])
        self.rl_agent.q_net.load_state_dict(state_dict['dqn_net'])
        self.rl_agent.target_net.load_state_dict(state_dict['dqn_target'])

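# The supervised-learning buffer used above (ReservoirMemoryBuffer, implementation not shown here)
# is assumed, as in the NFSP paper, to keep a uniform sample of all (state, action) pairs ever
# offered. A minimal sketch of reservoir sampling (Algorithm R) illustrating that idea; the class
# and method names below are illustrative and not taken from the repo.
import random

class ReservoirSketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.seen = 0  # total number of items offered so far

    def add(self, item):
        self.seen += 1
        if len(self.memory) < self.capacity:
            self.memory.append(item)
        else:
            # each of the `seen` items ends up stored with probability capacity / seen
            idx = random.randrange(self.seen)
            if idx < self.capacity:
                self.memory[idx] = item

    def sample(self, batch_size):
        return random.sample(self.memory, min(batch_size, len(self.memory)))
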
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", choices=["Atari", "Classic"],
                        help="Select the Type of Game from OpenAI gym", required=True)
    parser.add_argument("--name", help="Select the Name of Game eg. Breakout-v0", required=True)
    parser.add_argument("--mode", choices=["train", "test"],
                        help="Choose to Train or Test", default="train", required=False)
    args = parser.parse_args()

    if args.type == "Classic":
        environment = BaseGymEnvironment(args.type, args.name)
    elif args.type == "Atari":
        environment = AtariGymEnvironment(args.type, args.name)

    input_shape = environment.observation_shape()
    nb_actions = environment.nb_actions()
    agent = DQNAgent(args.type, args.name, input_shape, nb_actions)

    if args.mode == "train":
        agent.learn(environment)
    else:
        agent.play(environment)
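
# Example invocations matching the argparse flags above (the script name is illustrative;
# substitute the actual entry-point file of the repo):
#   python main.py --type Atari --name Breakout-v0 --mode train
#   python main.py --type Classic --name CartPole-v0 --mode test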