Example #1
    def __init__(self, action_set, reward_function, prior_variance,
                 noise_variance, num_iterations, feature_extractor):
        Agent.__init__(self, action_set, reward_function)
        self.prior_variance = prior_variance
        self.noise_variance = noise_variance
        self.num_iterations = num_iterations
        self.feature_extractor = feature_extractor

        # buffer is a dictionary of lists
        # the key is a feature-action pair
        self.buffer = {(f, a): []
                       for f in self.feature_extractor.feature_space
                       for a in self.action_set}
        self.Q = {
            key: np.sqrt(self.prior_variance) * np.random.randn()
            for key in self.buffer.keys()
        }
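
The constructor above only draws initial Q-values from the prior; noise_variance and the per-(feature, action) buffers suggest a conjugate Gaussian posterior update, but that step is not part of this snippet. A minimal sketch of such an update, assuming each buffer entry is an observed target for its (feature, action) pair (the helper name _resample_q is hypothetical):

    def _resample_q(self):
        # Hypothetical helper: conjugate Gaussian update, treating each buffer
        # entry for (f, a) as an observation with variance noise_variance.
        for key, observations in self.buffer.items():
            n = len(observations)
            # posterior precision = prior precision + n * observation precision
            precision = 1.0 / self.prior_variance + n / self.noise_variance
            posterior_variance = 1.0 / precision
            posterior_mean = posterior_variance * (sum(observations) / self.noise_variance)
            # Thompson-style draw of the Q-value from its posterior
            self.Q[key] = posterior_mean + np.sqrt(posterior_variance) * np.random.randn()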
Example #2
    def __init__(self,
                 sim,
                 brain,
                 name="QLearn",
                 train_every_nth=5,
                 train_batch_size=32,
                 max_experience=300000,
                 exploration_period=10000,
                 epsilon_final=0.015,
                 discount_factor=0.99):
        Agent.__init__(self, name)
        self.sim = sim
        self.brain = brain
        self.train_every_nth = train_every_nth
        self.train_batch_size = train_batch_size
        self.epsilon_final = epsilon_final
        self.discount_factor = discount_factor
        self.max_experience = max_experience
        self.exploration_period = exploration_period
        self.actions_executed = 0
        self.memory = []
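
The constructor stores exploration_period, epsilon_final, max_experience, and an empty memory list, but the corresponding schedule and replay logic are not shown here. A minimal sketch under the common assumptions of linear epsilon annealing and a FIFO-capped replay memory (both method names below are hypothetical):

    def current_epsilon(self):
        # Hypothetical helper: anneal epsilon linearly from 1.0 down to
        # epsilon_final over the first exploration_period executed actions.
        if self.actions_executed >= self.exploration_period:
            return self.epsilon_final
        fraction = self.actions_executed / float(self.exploration_period)
        return 1.0 + fraction * (self.epsilon_final - 1.0)

    def remember(self, experience):
        # Hypothetical helper: append a transition and cap the replay memory
        # at max_experience by discarding the oldest entries.
        self.memory.append(experience)
        if len(self.memory) > self.max_experience:
            self.memory.pop(0)

Under this reading, train_every_nth and train_batch_size would then govern how often, and on how many sampled transitions, the brain is trained.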
Example #3
    def __init__(self,
                 action_set,
                 reward_function,
                 prior_variance,
                 noise_variance,
                 feature_extractor,
                 prior_network,
                 num_ensemble,
                 hidden_dims=[10, 10],
                 learning_rate=5e-4,
                 buffer_size=50000,
                 batch_size=64,
                 num_batches=100,
                 starts_learning=5000,
                 discount=0.99,
                 target_freq=10,
                 verbose=False,
                 print_every=1,
                 test_model_path=None):
        Agent.__init__(self, action_set, reward_function)

        self.prior_variance = prior_variance
        self.noise_variance = noise_variance

        self.feature_extractor = feature_extractor
        self.feature_dim = self.feature_extractor.dimension

        dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]

        self.prior_network = prior_network
        self.num_ensemble = num_ensemble  # number of models in ensemble

        self.index = np.random.randint(self.num_ensemble)

        # build Q network
        # we use a multilayer perceptron

        if test_model_path is None:
            self.test_mode = False
            self.learning_rate = learning_rate
            self.buffer_size = buffer_size
            self.batch_size = batch_size
            self.num_batches = num_batches
            self.starts_learning = starts_learning
            self.discount = discount
            self.timestep = 0

            self.buffer = Buffer(self.buffer_size)
            self.models = []
            for i in range(self.num_ensemble):
                if self.prior_network:
                    '''
                    The second network is a prior network whose weights are fixed;
                    the first network is the learned difference network, i.e. its
                    weights are trainable.
                    '''
                    self.models.append(
                        DQNWithPrior(dims, scale=np.sqrt(
                            self.prior_variance)).to(device))
                else:
                    self.models.append(MLP(dims).to(device))
                self.models[i].initialize()
            '''
            The prior network's weights are immutable, so it is enough to keep
            (and train) only the difference network.
            '''
            self.target_nets = []
            for i in range(self.num_ensemble):
                if self.prior_network:
                    self.target_nets.append(
                        DQNWithPrior(dims, scale=np.sqrt(
                            self.prior_variance)).to(device))
                else:
                    self.target_nets.append(MLP(dims).to(device))
                # each target network starts as a copy of its online network
                self.target_nets[i].load_state_dict(
                    self.models[i].state_dict())
                self.target_nets[i].eval()

            self.target_freq = target_freq  # target network updated every target_freq episodes
            self.num_episodes = 0

            self.optimizer = []
            for i in range(self.num_ensemble):
                self.optimizer.append(
                    torch.optim.Adam(self.models[i].parameters(),
                                     lr=self.learning_rate))

            # for debugging purposes
            self.verbose = verbose
            self.running_loss = 1.
            self.print_every = print_every

        else:
            self.models = []
            self.test_mode = True
            if self.prior_network:
                self.models.append(
                    DQNWithPrior(dims, scale=np.sqrt(self.prior_variance)))
            else:
                self.models.append(MLP(dims))
            self.models[0].load_state_dict(torch.load(test_model_path))
            self.models[0].eval()
            self.index = 0
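
DQNWithPrior is used above but not defined in this snippet. A common formulation consistent with the surrounding comments is a trainable difference network added to a frozen, randomly initialized prior network, Q(x) = f_theta(x) + scale * p(x), with scale set to sqrt(prior_variance). A self-contained sketch under that assumption (the class below is illustrative, not the original implementation):

import torch
import torch.nn as nn

class DQNWithPriorSketch(nn.Module):
    # Illustrative sketch: a trainable "difference" MLP plus a frozen,
    # randomly initialized prior MLP, combined as Q(x) = f(x) + scale * p(x).
    def __init__(self, dims, scale=1.0):
        super().__init__()

        def make_mlp(dims):
            layers = []
            for d_in, d_out in zip(dims[:-1], dims[1:]):
                layers += [nn.Linear(d_in, d_out), nn.ReLU()]
            return nn.Sequential(*layers[:-1])  # no activation on the output layer

        self.difference = make_mlp(dims)
        self.prior = make_mlp(dims)
        for p in self.prior.parameters():
            p.requires_grad = False  # the prior's weights stay fixed
        self.scale = scale

    def forward(self, x):
        return self.difference(x) + self.scale * self.prior(x)

Because the prior's parameters are frozen, only the difference network's weights change during training, which matches the comment above that it is enough to keep the difference network.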