def __init__(self, action_set, reward_function, prior_variance,
             noise_variance, num_iterations, feature_extractor):
    """Tabular agent keeping one Gaussian-initialized Q-value per
    (feature, action) pair.

    Args:
        action_set: available actions (forwarded to Agent).
        reward_function: reward function (forwarded to Agent).
        prior_variance: variance of the zero-mean Gaussian prior on Q.
        noise_variance: assumed observation-noise variance.
        num_iterations: number of iterations to run.
        feature_extractor: exposes `feature_space`, the set of features.
    """
    Agent.__init__(self, action_set, reward_function)
    self.prior_variance = prior_variance
    self.noise_variance = noise_variance
    self.num_iterations = num_iterations
    self.feature_extractor = feature_extractor

    # One observation buffer per (feature, action) key.
    feature_action_pairs = [
        (feature, action)
        for feature in self.feature_extractor.feature_space
        for action in self.action_set
    ]
    self.buffer = {pair: [] for pair in feature_action_pairs}

    # Q-values start as independent draws from N(0, prior_variance).
    prior_std = np.sqrt(self.prior_variance)
    self.Q = {
        pair: prior_std * np.random.randn()
        for pair in feature_action_pairs
    }
def __init__(self, sim, brain, name="QLearn", train_every_nth=5,
             train_batch_size=32, max_experience=300000,
             exploration_period=10000, epsilon_final=0.015,
             discount_factor=0.99):
    """Q-learning agent driven by an external function approximator.

    Args:
        sim: simulator the agent interacts with.
        brain: model used to estimate Q-values.
        name: agent name (forwarded to Agent).
        train_every_nth: train once per this many executed actions.
        train_batch_size: minibatch size sampled from memory.
        max_experience: cap on the replay memory length.
        exploration_period: steps over which exploration is annealed.
        epsilon_final: final exploration probability.
        discount_factor: reward discount (gamma).
    """
    Agent.__init__(self, name)

    # Environment and learned model.
    self.sim = sim
    self.brain = brain

    # Training schedule and hyperparameters.
    self.train_every_nth = train_every_nth
    self.train_batch_size = train_batch_size
    self.discount_factor = discount_factor

    # Exploration settings.
    self.exploration_period = exploration_period
    self.epsilon_final = epsilon_final

    # Replay memory and step counter.
    self.max_experience = max_experience
    self.actions_executed = 0
    self.memory = []
def __init__(self, action_set, reward_function, prior_variance, noise_variance,
             feature_extractor, prior_network, num_ensemble,
             hidden_dims=None, learning_rate=5e-4, buffer_size=50000,
             batch_size=64, num_batches=100, starts_learning=5000,
             discount=0.99, target_freq=10, verbose=False, print_every=1,
             test_model_path=None):
    """Ensemble DQN agent, optionally with randomized prior networks.

    Builds `num_ensemble` Q-networks (MLPs, or MLPs paired with fixed
    random prior networks when `prior_network` is true) plus matching
    target networks and optimizers. If `test_model_path` is given, a
    single model is instead loaded from disk and put in eval mode.

    Args:
        action_set: available actions (forwarded to Agent).
        reward_function: reward function (forwarded to Agent).
        prior_variance: variance used to scale the fixed prior network.
        noise_variance: assumed observation-noise variance.
        feature_extractor: exposes `dimension`, the input feature size.
        prior_network: if true, use DQNWithPrior instead of a plain MLP.
        num_ensemble: number of ensemble members.
        hidden_dims: hidden layer sizes; defaults to [10, 10].
        learning_rate: Adam learning rate.
        buffer_size: replay buffer capacity.
        batch_size: minibatch size per training batch.
        num_batches: batches per training call.
        starts_learning: timesteps before learning begins.
        discount: reward discount (gamma).
        target_freq: target networks refreshed every this many episodes.
        verbose: enable debug printing.
        print_every: print frequency when verbose.
        test_model_path: if set, load this model and run in test mode.
    """
    Agent.__init__(self, action_set, reward_function)

    # Fix: the original default was the mutable literal [10, 10], which
    # is shared across all calls; use None as the sentinel instead.
    if hidden_dims is None:
        hidden_dims = [10, 10]

    self.prior_variance = prior_variance
    self.noise_variance = noise_variance
    self.feature_extractor = feature_extractor
    self.feature_dim = self.feature_extractor.dimension

    # MLP layer sizes: input features -> hidden layers -> one Q per action.
    dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]

    self.prior_network = prior_network
    self.num_ensemble = num_ensemble  # number of models in ensemble
    # Ensemble member currently driving action selection.
    self.index = np.random.randint(self.num_ensemble)

    if test_model_path is None:
        self.test_mode = False
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.starts_learning = starts_learning
        self.discount = discount
        self.timestep = 0
        self.buffer = Buffer(self.buffer_size)

        # Build the ensemble of Q-networks.
        self.models = []
        for i in range(self.num_ensemble):
            if self.prior_network:
                # DQNWithPrior pairs a trainable "difference" network
                # with a second, fixed-weight prior network; only the
                # difference network's weights are learned.
                self.models.append(
                    DQNWithPrior(
                        dims, scale=np.sqrt(self.prior_variance)).to(device))
            else:
                self.models.append(MLP(dims).to(device))
            self.models[i].initialize()

        # Target networks: the prior network's weights are immutable, so
        # copying the difference network's state dict is sufficient.
        self.target_nets = []
        for i in range(self.num_ensemble):
            if self.prior_network:
                self.target_nets.append(
                    DQNWithPrior(
                        dims, scale=np.sqrt(self.prior_variance)).to(device))
            else:
                self.target_nets.append(MLP(dims).to(device))
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].eval()

        self.target_freq = target_freq  # target nn updated every target_freq episodes
        self.num_episodes = 0

        # One Adam optimizer per ensemble member.
        self.optimizer = []
        for i in range(self.num_ensemble):
            self.optimizer.append(
                torch.optim.Adam(self.models[i].parameters(),
                                 lr=self.learning_rate))

        # for debugging purposes
        self.verbose = verbose
        self.running_loss = 1.
        self.print_every = print_every
    else:
        self.models = []
        self.test_mode = True
        if self.prior_network:
            # Fix: training mode constructs the prior with
            # scale=np.sqrt(prior_variance); the original test branch
            # passed the raw variance, rebuilding the fixed prior network
            # at a different scale than the one the model was trained with.
            self.models.append(
                DQNWithPrior(dims, scale=np.sqrt(self.prior_variance)))
        else:
            self.models.append(MLP(dims))
        # NOTE(review): unlike training mode, the loaded model is not moved
        # to `device` here — confirm CPU inference is intentional.
        self.models[0].load_state_dict(torch.load(test_model_path))
        self.models[0].eval()
        self.index = 0