Example 1
    def test_empty_test(self):
        dataset = Dataset_Counts(state_shape=[2],
                                 nb_actions=3,
                                 count_param=0.2)
        dataset.add(*transitions[0])
        dataset.add(*transitions[1])

        dataset_train, dataset_test = dataset.train_validation_split(
            test_size=0.5)
        s_train, a_train, p_train, r_train, s2_train, t_train, _, _, _ = dataset_train._get_transition(
            0)
        s_test, a_test, p_test, r_test, s2_test, t_test, _, _, _ = dataset_test._get_transition(
            0)

        if a_train == transitions[0].a:
            trans_train = transitions[0]
            trans_test = transitions[1]
        else:
            trans_train = transitions[1]
            trans_test = transitions[0]

        assert_sequence_almost_equal(self, s_train, trans_train.s)
        self.assertEqual(a_train, trans_train.a)
        assert_sequence_almost_equal(self, p_train, trans_train.p)
        self.assertAlmostEqual(r_train, trans_train.r)
        self.assertEqual(t_train, trans_train.t)

        assert_sequence_almost_equal(self, s_test, trans_test.s)
        self.assertEqual(a_test, trans_test.a)
        assert_sequence_almost_equal(self, p_test, trans_test.p)
        self.assertAlmostEqual(r_test, trans_test.r)
        self.assertEqual(t_test, trans_test.t)
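These tests rely on a module-level transitions fixture that is not part of this excerpt. A minimal sketch of what it could look like, assuming a namedtuple whose fields match the attribute accesses in the tests (s, a, p, r, t) and whose field order matches the positional signature that dataset.add(*transitions[i]) unpacks into; the field order and all values below are illustrative assumptions, not taken from the repository:

# Hypothetical fixture; field order assumed to match Dataset_Counts.add(s, a, r, t, p),
# the keyword order used in the add() calls later in these examples.
from collections import namedtuple

Transition = namedtuple('Transition', ['s', 'a', 'r', 't', 'p'])

transitions = [
    Transition(s=[2.0, 2.0], a=1, r=1.0, t=False, p=[0.2, 0.8, 0.0]),
    Transition(s=[5.0, 5.0], a=2, r=0.0, t=False, p=[0.1, 0.1, 0.8]),
]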
Example 2
def compute_counts(dataset, overwrite=False, count_param=0.2):
    """ Compute the pseudo-counts for each state-action pair present in the dataset following the methodology described in the paper.

    Args:
      dataset: the dataset instance for which to computed counts
      overwrite: whether to overwrite an existing counts file of the same size, generated with the same seed and same noise_factor.
    Returns:
      Saves the dataset augmented with the counts in /dataset/{dataset_size}/{seed}/{noise_factor}/counts_dataset.pkl.
    """

    full_path = os.path.join(dataset.path, dataset.dataset_folder, COUNTS_SUFFIX)
    if os.path.isfile(full_path):
        if overwrite:
            print("Found existing counts file. Overwriting.", flush=True)
            os.remove(full_path)
        else:
            print("Found existing counts file. Aborting.", flush=True)
            return

    t = time.time()
    print("Computing counts. The dataset contains {} transitions.".format(len(dataset.states)), flush=True)
    d = Dataset_Counts.from_dataset(dataset, count_param)
    print("Saving data with counts to {}".format(full_path), flush=True)
    d.save_dataset(full_path)
    print("Data with counts saved, {} samples".format(d.size), flush=True)
    print("Counts computed in " + str(time.time() - t) + " seconds", flush=True)
Example 3
class TestDatasetCounts(TestCase):
    def setUp(self):
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)
        for t in transitions:
            self.dataset.add(*t)

    def test_transition_to_same_state(self):
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(0)
        self.assertSequenceAlmostEqual(s, transitions[0].s)
        self.assertEqual(a, transitions[0].a)
        self.assertAlmostEqual(r, transitions[0].r)
        self.assertSequenceAlmostEqual(s2, transitions[1].s)
        self.assertEqual(t, transitions[0].t)
        self.assertSequenceAlmostEqual(c, self.dataset.c[1])
        self.assertSequenceAlmostEqual(p, transitions[1].p)
        self.assertAlmostEqual(c1, 1.75)

    def test_transition_to_different_state(self):
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(1)
        self.assertSequenceAlmostEqual(s, transitions[1].s)
        self.assertEqual(a, transitions[1].a)
        self.assertAlmostEqual(r, transitions[1].r)
        self.assertSequenceAlmostEqual(s2, transitions[2].s)
        self.assertEqual(t, transitions[1].t)
        self.assertSequenceAlmostEqual(c, self.dataset.c[2])
        self.assertSequenceAlmostEqual(c, [1, 0, 0])
        self.assertSequenceAlmostEqual(p, transitions[2].p)
        self.assertAlmostEqual(c1, 1)

    def test_terminal_transition(self):
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(3)
        self.assertSequenceAlmostEqual(s, transitions[3].s)
        self.assertEqual(a, transitions[3].a)
        self.assertAlmostEqual(r, transitions[3].r)
        self.assertSequenceAlmostEqual(s2, [0, 0])
        self.assertEqual(t, True)
        self.assertSequenceAlmostEqual(c, [0, 0, 0])
        self.assertSequenceAlmostEqual(p, [0, 0, 0])
        self.assertAlmostEqual(c1, 1.75)

    def test_save_and_load(self):
        self.dataset.save_dataset('tmp.pickle')
        new_dataset = Dataset_Counts.load_dataset('tmp.pickle')
        self.assertSequenceAlmostEqual(self.dataset.a[0:self.dataset.size],
                                       new_dataset.a[0:self.dataset.size])
        self.assertSequenceAlmostEqual(self.dataset.t[0:self.dataset.size],
                                       new_dataset.t[0:self.dataset.size])
        self.assertSequenceAlmostEqual(self.dataset.r[0:self.dataset.size],
                                       new_dataset.r[0:self.dataset.size])
        for i in range(self.dataset.size):
            self.assertSequenceAlmostEqual(self.dataset.s[i], new_dataset.s[i])
            self.assertSequenceAlmostEqual(self.dataset.c[i], new_dataset.c[i])
        new_dataset.add(*transitions[0])
        import os
        os.remove('tmp.pickle')

    def assertSequenceAlmostEqual(self, it1, it2):
        assert_sequence_almost_equal(self, it1, it2)
Example 4
class TestSplittingDataset(TestCase):
    def setUp(self):
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)
        self.dataset_size = 100
        for i in np.random.randint(0, len(transitions), self.dataset_size):
            self.dataset.add(*transitions[i])

    def test_empty_test(self):
        dataset_train, dataset_test = self.dataset.train_validation_split(
            test_size=0)
        self.assertEqual(dataset_test.size, 0)
        self.assertEqual(dataset_train.size, self.dataset_size)

    def test_default(self):
        dataset_train, dataset_test = self.dataset.train_validation_split()
        self.assertEqual(dataset_test.size, 20)
        self.assertEqual(dataset_train.size, 80)

    def test_empty_train(self):
        dataset_train, dataset_test = self.dataset.train_validation_split(1)
        self.assertEqual(dataset_test.size, self.dataset_size)
        self.assertEqual(dataset_train.size, 0)

    def test_original_data_set_does_not_change(self):
        random_ind = np.random.randint(0, self.dataset_size)
        s, a, p1, r, s2, t, c, p2, c1 = self.dataset._get_transition(
            random_ind)
        _, _ = self.dataset.train_validation_split(np.random.rand())
        new_s, new_a, new_p1, new_r, new_s2, new_t, new_c, new_p2, new_c1 = self.dataset._get_transition(
            random_ind)

        assert_sequence_almost_equal(self, new_s, s)
        self.assertEqual(new_a, a)
        assert_sequence_almost_equal(self, new_p1, p1)
        self.assertAlmostEqual(new_r, r)
        assert_sequence_almost_equal(self, new_s2, s2)
        self.assertEqual(new_t, t)
        assert_sequence_almost_equal(self, new_c, c)
        assert_sequence_almost_equal(self, new_p2, p2)
        self.assertAlmostEqual(new_c1, c1)
Example 5
class TestDatasetCountsAdd(TestCase):
    def setUp(self):
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)

    def test_populate(self):
        self.assertEqual(self.dataset.size, 0)
        self.dataset.add(*transitions[0])
        self.assertEqual(self.dataset.size, 1)
        self.assertAlmostEqual(self.dataset.c[0][1], 1)

        self.dataset.add(s=[2, 2], a=2, r=0, t=False, p=[.2, .8, 0])
        self.assertEqual(self.dataset.size, 2)
        self.assertAlmostEqual(self.dataset.c[0][1], 1)
        self.assertAlmostEqual(self.dataset.c[0][2], 1)
        self.assertAlmostEqual(self.dataset.c[1][1], 1)
        self.assertAlmostEqual(self.dataset.c[1][2], 1)

        self.dataset.add(s=[10, 10], a=0, r=0, t=False, p=[.2, .8, 0])
        self.assertAlmostEqual(self.dataset.c[0][0], 0)
        self.assertAlmostEqual(self.dataset.c[0][1], 1)
        self.assertAlmostEqual(self.dataset.c[0][2], 1)
        self.assertAlmostEqual(self.dataset.c[1][1], 1)
        self.assertAlmostEqual(self.dataset.c[1][2], 1)
        self.assertAlmostEqual(self.dataset.c[2][0], 1)

        self.dataset.add(s=[2.03, 2.04], a=1, r=0, t=True, p=[.2, .8, 0])
        self.assertAlmostEqual(self.dataset.c[0][0], 0)
        self.assertAlmostEqual(self.dataset.c[0][1], 1.75)
        self.assertAlmostEqual(self.dataset.c[0][2], 1)
        self.assertAlmostEqual(self.dataset.c[1][0], 0)
        self.assertAlmostEqual(self.dataset.c[1][1], 1.75)
        self.assertAlmostEqual(self.dataset.c[1][2], 1)
        self.assertAlmostEqual(self.dataset.c[3][0], 0)
        self.assertAlmostEqual(self.dataset.c[3][1], 1.75)
        self.assertAlmostEqual(self.dataset.c[3][2], 0.75)
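The count increments asserted above are consistent with a truncated linear kernel over Euclidean distance: [2, 2] and [2.03, 2.04] are 0.05 apart, and 1 - 0.05/0.2 = 0.75, exactly the increment that turns a count of 1 into 1.75. The sketch below is inferred from these asserted values, not taken from the library source; the real Dataset_Counts.similarite (see the next example) also takes mean and std arguments, so it may additionally normalize each state dimension:

import numpy as np

def truncated_similarity(s1, s2, count_param=0.2):
    # Inferred kernel: 1 at distance 0, decaying linearly to 0 at
    # Euclidean distance count_param, clipped at 0 beyond that.
    dist = np.linalg.norm(np.asarray(s1, float) - np.asarray(s2, float))
    return max(0.0, 1.0 - dist / count_param)

# Reproduces the 0.75 increment asserted in test_populate:
assert abs(truncated_similarity([2, 2], [2.03, 2.04]) - 0.75) < 1e-9
# Distant states contribute nothing, matching c[0][0] == 0 after adding s=[10, 10]:
assert truncated_similarity([2, 2], [10, 10]) == 0.0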
Example 6
def compute_counts(dataset, overwrite=False, param=0.2):
    """ Compute the pseudo-counts for each state-action pair present in the dataset following the methodology described in the paper.

  Args:
    dataset: the dataset instance for which to computed counts
    overwrite: whether to overwrite an existing counts file of the same size, generated with the same seed and same noise_factor.
  Returns:
    Saves the dataset augmented with the counts in /dataset/{dataset_size}/{seed}/{noise_factor}/counts_dataset.pkl.
  """

    full_path = os.path.join(dataset.path, dataset.dataset_folder,
                             COUNTS_SUFFIX)
    if os.path.isfile(full_path):
        if overwrite:
            print("Found existing counts file. Overwriting.", flush=True)
            os.remove(full_path)
        else:
            print("Found existing counts file. Aborting.", flush=True)
            return

    t = time.time()
    print("Computing counts. The dataset contains {} transitions.".format(
        len(dataset.states)),
          flush=True)

    data = {}
    data['s'] = np.zeros([len(dataset.states) - 1] + list(dataset.state_shape),
                         dtype='float32')
    data['s2'] = np.zeros([len(dataset.states) - 1] +
                          list(dataset.state_shape),
                          dtype='float32')
    data['a'] = np.zeros((len(dataset.states) - 1), dtype='int32')
    data['r'] = np.zeros((len(dataset.states) - 1), dtype='float32')
    data['t'] = np.zeros((len(dataset.states) - 1), dtype='bool')
    data['c'] = np.zeros((len(dataset.states) - 1, dataset.nb_actions),
                         dtype='float32')
    data['c1'] = np.zeros((len(dataset.states) - 1), dtype='float32')
    data['p'] = np.zeros((len(dataset.states) - 1, dataset.nb_actions),
                         dtype='float32')
    data['q'] = np.zeros((len(dataset.states) - 1, dataset.nb_actions),
                         dtype='float32')

    mean, std = dataset.counts_weights()
    for i in range(len(dataset.states) - 1):
        if i % 1000 == 999:
            print('{} samples processed'.format(i + 1))
        data['s'][i] = dataset.states[i]
        data['a'][i] = dataset.actions[i]
        data['r'][i] = dataset.rewards[i]
        for j in range(len(dataset.states) - 1):
            if dataset.actions[i] == dataset.actions[j]:
                s = Dataset_Counts.similarite(dataset.states[i],
                                              dataset.states[j], param, mean,
                                              std)
                data['c1'][i] += s
        if dataset.terms[i]:
            data['t'][i] = True
        else:
            data['s2'][i] = dataset.states[i + 1]
            data['p'][i] = dataset.policy[i + 1]
            data['q'][i] = dataset.qfunction[i + 1]
            for j in range(len(dataset.states) - 1):
                s = Dataset_Counts.similarite(dataset.states[i + 1],
                                              dataset.states[j], param, mean,
                                              std)
                data['c'][i, dataset.actions[j]] += s

    print("Saving data with counts to {}".format(full_path), flush=True)
    with open(full_path, "wb") as f:
        pickle.dump(data, f)
    print("Data with counts saved, {} samples".format(len(data['s'])),
          flush=True)
    print("Counts computed in " + str(time.time() - t) + " seconds",
          flush=True)
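The nested loops above make the computation quadratic in the number of transitions, with a full inner pass per sample. Under the same kernel assumption as the sketch after Example 5 (and ignoring the mean/std normalization and the next-state indexing details), the per-action accumulation can be vectorized; this is a hypothetical alternative, not the repository's code:

import numpy as np

def pseudo_counts(states, actions, nb_actions, count_param=0.2):
    # states: (n, d) float array; actions: (n,) int array.
    states = np.asarray(states, dtype='float32')
    actions = np.asarray(actions)
    # Pairwise Euclidean distances, then the truncated linear kernel.
    dists = np.linalg.norm(states[:, None, :] - states[None, :, :], axis=-1)
    sim = np.maximum(0.0, 1.0 - dists / count_param)
    # c[i, a] accumulates the similarity of state i to every stored state
    # whose action is a, as the inner loops above do one pair at a time.
    c = np.zeros((len(states), nb_actions), dtype='float32')
    for a in range(nb_actions):
        c[:, a] = sim[:, actions == a].sum(axis=1)
    return c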
Example 7
def run(domain, config, options):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    if not config:
        config = 'config_' + domain
    cfg_file = os.path.join(dir_path, config + '.yaml')
    params = yaml.safe_load(open(cfg_file, 'r'))

    # replacing params with command line options
    for opt in options:
        assert opt[0] in params
        dtype = type(params[opt[0]])
        if dtype == bool:
            new_opt = False if opt[1] != 'True' else True
        else:
            new_opt = dtype(opt[1])
        params[opt[0]] = new_opt

    print('\n')
    print('Parameters ')
    for key in params:
        print(key, params[key])
    print('\n')

    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    random_state = np.random.RandomState(params['seed'])
    device = torch.device(params["device"])

    DATA_DIR = os.path.join(params['folder_location'], params['folder_name'])

    env = environment.Environment(domain, params, random_state)

    if params['batch']:
        from baseline import Baseline
        baseline_path = os.path.join(DATA_DIR, params['baseline_path'])
        baseline = Baseline(baseline_path,
                            params['network_size'],
                            state_shape=params['state_shape'],
                            nb_actions=params['nb_actions'],
                            seed=params['seed'],
                            temperature=params['baseline_temp'],
                            device=params['device'],
                            normalize=params['normalize'])

        dataset_path = os.path.join(DATA_DIR, params['dataset_path'])
        print("\nLoading dataset from file {}".format(dataset_path),
              flush=True)
        if not os.path.exists(dataset_path):
            raise ValueError("The dataset file does not exist")
        with open(dataset_path, "rb") as f:
            data = pickle.load(f)
        dataset = Dataset_Counts(data, params['count_param'])
        print("Data with counts loaded: {} samples".format(len(data['s'])),
              flush=True)
        folder_name = os.path.dirname(dataset_path)
        expt = BatchExperiment(
            dataset=dataset,
            env=env,
            folder_name=folder_name,
            episode_max_len=params['episode_max_len'],
            minimum_count=params['minimum_count'],
            extra_stochasticity=params['extra_stochasticity'],
            history_len=params['history_len'],
            max_start_nullops=params['max_start_nullops'])

    else:
        # Create experiment folder
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)

        baseline = None
        expt = DQNExperiment(env=env,
                             ai=None,
                             episode_max_len=params['episode_max_len'],
                             annealing=params['annealing'],
                             history_len=params['history_len'],
                             max_start_nullops=params['max_start_nullops'],
                             replay_min_size=params['replay_min_size'],
                             test_epsilon=params['test_epsilon'],
                             folder_name=DATA_DIR,
                             network_path=params['network_path'],
                             extra_stochasticity=params['extra_stochasticity'],
                             score_window_size=100)

    for ex in range(params['num_experiments']):
        print('\n')
        print('>>>>> Experiment ',
              ex,
              ' >>>>> ',
              params['learning_type'],
              ' >>>>> Epsilon >>>>> ',
              params['epsilon_soft'],
              ' >>>>> Minimum Count >>>>> ',
              params['minimum_count'],
              ' >>>>> Kappa >>>>> ',
              params['kappa'],
              ' >>>>> ',
              flush=True)
        print('\n')
        ai = AI(baseline,
                state_shape=env.state_shape,
                nb_actions=env.nb_actions,
                action_dim=params['action_dim'],
                reward_dim=params['reward_dim'],
                history_len=params['history_len'],
                gamma=params['gamma'],
                learning_rate=params['learning_rate'],
                epsilon=params['epsilon'],
                final_epsilon=params['final_epsilon'],
                test_epsilon=params['test_epsilon'],
                annealing_steps=params['annealing_steps'],
                minibatch_size=params['minibatch_size'],
                replay_max_size=params['replay_max_size'],
                update_freq=params['update_freq'],
                learning_frequency=params['learning_frequency'],
                ddqn=params['ddqn'],
                learning_type=params['learning_type'],
                network_size=params['network_size'],
                normalize=params['normalize'],
                device=device,
                kappa=params['kappa'],
                minimum_count=params['minimum_count'],
                epsilon_soft=params['epsilon_soft'])
        expt.ai = ai

        env.reset()
        with open(expt.folder_name + '/config.yaml', 'w') as y:
            yaml.safe_dump(params, y)  # saving params for reference
        expt.do_epochs(number_of_epochs=params['num_epochs'],
                       is_learning=params['is_learning'],
                       steps_per_epoch=params['steps_per_epoch'],
                       is_testing=params['is_testing'],
                       steps_per_test=params['steps_per_test'],
                       passes_on_dataset=params['passes_on_dataset'],
                       exp_id=ex)
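A note on the option-override loop near the top of run: booleans are special-cased because coercing the string 'False' with bool() would yield True. A two-line illustration of the pitfall the code guards against:

# Any non-empty string is truthy, so bool() cannot parse command-line flags.
assert bool('False') is True
# The loop therefore compares against the literal string 'True' instead:
new_opt = False if 'False' != 'True' else True
assert new_opt is False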
Example 8
    def __init__(self, training_steps, validation_steps, validation_size,
                 mini_batch_size, learning_rate, number_of_epochs,
                 network_size, folder_location, dataset_file,
                 cloned_network_path, sample_from_env, entropy_coefficient,
                 device, seed, experiment_name, config_file,
                 update_learning_rate):

        self.sample_from_env = sample_from_env
        self.smaller_validation_loss = None
        self.seed = seed
        try:
            self.params = yaml.safe_load(open(config_file, 'r'))
        except FileNotFoundError as e:
            print(
                "Configuration file not found; define config_file to be able to sample from the environment"
            )
            raise e

        # initialize seeds for reproducibility
        np.random.seed(seed)
        torch.manual_seed(seed)

        # set paths for data and output path
        log_path = os.path.join('./logs', experiment_name)
        data_dir = folder_location
        dataset_path = dataset_file
        self.output_folder = os.path.dirname(dataset_path)
        self.cloned_network_path = os.path.join(os.path.dirname(dataset_path),
                                                cloned_network_path)

        # start
        self.logger = SummaryWriter(log_path)

        # import data
        full_dataset = Dataset_Counts.load_dataset(dataset_path)
        self.dataset_train, self.dataset_validation = full_dataset.train_validation_split(
            test_size=validation_size)

        # set training parameters
        self.mini_batch_size = mini_batch_size
        self.number_of_epochs = number_of_epochs
        self.network_size = network_size
        self.entropy_coefficient = entropy_coefficient
        self.device = device
        self.learning_rate = learning_rate
        self.update_learning_rate = update_learning_rate

        if training_steps != 0:
            self.training_steps = training_steps
        else:
            self.training_steps = int(self.dataset_train.size /
                                      self.mini_batch_size)
        if validation_steps != 0:
            self.validation_steps = validation_steps
        else:
            self.validation_steps = int(self.dataset_validation.size /
                                        self.mini_batch_size)
        self.log_frequency = int(self.training_steps / 10)
        print(
            "Training with {} training steps and {} validation steps ".format(
                self.training_steps, self.validation_steps))

        # create model
        self.cloned_baseline_policy = ClonedBaseline(
            network_size=network_size,
            network_path=None,
            state_shape=self.params['state_shape'],
            nb_actions=self.params['nb_actions'],
            device=device,
            seed=seed,
            temperature=0)
        self.best_policy = ClonedBaseline(
            network_size=network_size,
            network_path=None,
            state_shape=self.params['state_shape'],
            nb_actions=self.params['nb_actions'],
            device=device,
            seed=seed,
            temperature=0,
            results_folder=self.output_folder)
        self.best_policy._copy_weight_from(
            self.cloned_baseline_policy.network.state_dict())

        # define loss and optimizer
        self.nll_loss_function = nn.NLLLoss()
        self.optimizer = torch.optim.SGD(
            self.cloned_baseline_policy.network.parameters(), lr=learning_rate)
        # optimizer = torch.optim.RMSprop(network.parameters(), lr=learning_rate, alpha=0.95, eps=1e-07)

        # instantiate environment for policy evaluation
        self.env = environment.Environment(self.params['domain'], self.params)

        if sample_from_env:
            print("sampling from environment")
            baseline_network_path = os.path.join(data_dir,
                                                 self.params["network_path"])
            self.baseline = Baseline(self.params['network_size'],
                                     network_path=baseline_network_path,
                                     state_shape=self.params['state_shape'],
                                     nb_actions=self.params['nb_actions'],
                                     device=device,
                                     seed=seed,
                                     temperature=self.params.get(
                                         "baseline_temp", 0.1),
                                     normalize=self.params['normalize'])
        else:
            self.baseline = None
Example 9
    def train(self, current_epoch=0):
        for step in range(self.training_steps):
            # clear gradients
            self.optimizer.zero_grad()

            # sample mini_batch
            if not self.sample_from_env:
                s, a, behavior_policy, _, _, _, _, _, _ = self.dataset_train.sample(
                    mini_batch_size=self.mini_batch_size, full_batch=True)
            else:
                # sanity check: train on new samples instead of fixed dataset
                mini_batch = Dataset_Counts(
                    state_shape=self.params['state_shape'],
                    nb_actions=self.params['nb_actions'],
                    count_param=0.2)
                while mini_batch.size < self.mini_batch_size:
                    state = self.env.reset()
                    action, _, policy, _ = self.baseline.inference(state)
                    _, new_reward, term, _ = self.env.step(action)
                    mini_batch.add(s=state.astype('float32'),
                                   a=action,
                                   r=new_reward,
                                   t=term,
                                   p=policy)
                s, a, behavior_policy, _, _, _, _, _, _ = mini_batch.get_all_data(
                )

            # prepare tensors
            batch_states = torch.FloatTensor(s).to(self.device)
            batch_states = torch.squeeze(batch_states)
            target = torch.LongTensor(a).to(
                self.device
            )  # NLLLoss gets the indexes of the correct class as input

            # get predictions
            cloned_policy_on_s = self.cloned_baseline_policy.policy(
                batch_states)

            # computing losses
            # negative loglikelihood
            nll_loss = self.nll_loss_function(torch.log(cloned_policy_on_s),
                                              target)

            # policy entropy
            cloned_policy_entropy = torch.mean(
                distributions.Categorical(cloned_policy_on_s).entropy())
            # regularize entropy
            entropy_bonus = self.entropy_coefficient * cloned_policy_entropy

            total_loss = nll_loss - entropy_bonus

            if step % self.log_frequency == 0:
                total_steps = current_epoch * self.training_steps + step
                self.test_and_log_stats(a, behavior_policy, entropy_bonus,
                                        cloned_policy_on_s,
                                        cloned_policy_entropy, nll_loss,
                                        total_loss, total_steps)

            # update weights
            total_loss.backward()
            self.optimizer.step()
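The objective here is entropy-regularized behavior cloning: total_loss = NLL - entropy_coefficient * H(cloned policy), so the bonus discourages the cloned policy from collapsing onto a single action. A self-contained toy check of that arithmetic, using the same torch calls as the method above; all values are illustrative:

import torch
from torch import nn, distributions

probs = torch.tensor([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1]])       # cloned policy on two states
target = torch.tensor([0, 1])                 # actions taken by the behavior policy
nll = nn.NLLLoss()(torch.log(probs), target)  # mean of -log p(a_i | s_i)
entropy = distributions.Categorical(probs=probs).entropy().mean()
total_loss = nll - 0.01 * entropy             # 0.01 stands in for entropy_coefficient
expected_nll = -(torch.log(torch.tensor(0.7)) + torch.log(torch.tensor(0.8))) / 2
assert torch.isclose(nll, expected_nll)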
Example 10
def run(config_file, options):
    try:
        params = yaml.safe_load(open(config_file, 'r'))
    except FileNotFoundError as e:
        print("Configuration file not found")
        raise e


    # replacing params with command line options
    for opt in options:
        assert opt[0] in params
        dtype = type(params[opt[0]])
        if dtype == bool:
            new_opt = False if opt[1] != 'True' else True
        else:
            new_opt = dtype(opt[1])
        params[opt[0]] = new_opt

    print('\n')
    print('Parameters ')
    for key in params:
        print(key, params[key])
    print('\n')

    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    random_state = np.random.RandomState(params['seed'])
    device = torch.device(params["device"])

    DATA_DIR = os.path.join(params['folder_location'], params['folder_name'])

    env = environment.Environment(params["domain"], params, random_state)

    if params['batch']:
        dataset_path = params['dataset_path']
        print("\nLoading dataset from file {}".format(dataset_path), flush=True)
        if not os.path.exists(dataset_path):
            raise ValueError("The dataset file does not exist")
        dataset = Dataset_Counts.load_dataset(dataset_path)

        baseline_path = os.path.join(DATA_DIR, params['baseline_path'])
        if 'behavior_cloning' in params['learning_type']:
            baseline_path = os.path.join(os.path.dirname(dataset_path), 'cloned_network_weights.pt')
            baseline = ClonedBaseline(
                params['network_size'], network_path=baseline_path, state_shape=params['state_shape'],
                nb_actions=params['nb_actions'], device=device, seed=params['seed'],
                temperature=params['baseline_temp'], normalize=params['normalize'])
        elif params['learning_type'] in ['pi_b', 'soft_sort']:
            baseline = Baseline(params['network_size'], network_path=baseline_path, state_shape=params['state_shape'],
                                nb_actions=params['nb_actions'], device=device, seed=params['seed'],
                                temperature=params['baseline_temp'], normalize=params['normalize'])
        elif 'count_based' in params['learning_type']:
            baseline = SimilarityBaseline(dataset=dataset, seed=params['seed'], nb_actions=params['nb_actions'],
                                          results_folder=os.path.dirname(dataset_path))
            baseline.evaluate_baseline(env, number_of_steps=100000, number_of_epochs=1,
                                       verbose=True, save_results=True)
        else:
            # no baseline, should use counters to estimate policy
            baseline = None

        folder_name = os.path.dirname(dataset_path)
        print("Data with counts loaded: {} samples".format(dataset.size), flush=True)
        expt = BatchExperiment(dataset=dataset, env=env, folder_name=folder_name, episode_max_len=params['episode_max_len'],
                               minimum_count=params['minimum_count'], extra_stochasticity=params['extra_stochasticity'],
                               history_len=params['history_len'], max_start_nullops=params['max_start_nullops'],
                               keep_all_logs=False)
    else:
        # Create experiment folder
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)

        folder_name = DATA_DIR
        baseline = None
        expt = DQNExperiment(env=env, ai=None, episode_max_len=params['episode_max_len'], annealing=params['annealing'],
                             history_len=params['history_len'], max_start_nullops=params['max_start_nullops'],
                             replay_min_size=params['replay_min_size'], test_epsilon=params['test_epsilon'],
                             folder_name=folder_name, network_path=params['network_path'],
                             extra_stochasticity=params['extra_stochasticity'], score_window_size=100,
                             keep_all_logs=False)

    for ex in range(params['num_experiments']):
        print('\n')
        print('>>>>> Experiment ', ex, ' >>>>> ',
              params['learning_type'], ' >>>>> Epsilon >>>>> ',
              params['epsilon_soft'], ' >>>>> Minimum Count >>>>> ',
              params['minimum_count'], ' >>>>> Kappa >>>>> ',
              params['kappa'], ' >>>>> ', flush=True)
        print('\n')
        print("\nPROGRESS: {0:02.2f}%\n".format(ex / params['num_experiments'] * 100), flush=True)
        ai = AI(baseline, state_shape=env.state_shape, nb_actions=env.nb_actions, action_dim=params['action_dim'],
                reward_dim=params['reward_dim'], history_len=params['history_len'], gamma=params['gamma'],
                learning_rate=params['learning_rate'], epsilon=params['epsilon'], final_epsilon=params['final_epsilon'],
                test_epsilon=params['test_epsilon'], annealing_steps=params['annealing_steps'], minibatch_size=params['minibatch_size'],
                replay_max_size=params['replay_max_size'], update_freq=params['update_freq'],
                learning_frequency=params['learning_frequency'], ddqn=params['ddqn'], learning_type=params['learning_type'],
                network_size=params['network_size'], normalize=params['normalize'], device=device,
                kappa=params['kappa'], minimum_count=params['minimum_count'], epsilon_soft=params['epsilon_soft'])
        expt.ai = ai
        if not params['batch']:
            # resets dataset for online experiment
            expt.dataset_counter = Dataset_Counts(count_param=params['count_param'],
                                                  state_shape=env.state_shape,
                                                  nb_actions=env.nb_actions,
                                                  replay_max_size=params['replay_max_size'],
                                                  is_counting=ai.needs_state_action_counter())

        env.reset()
        with open(expt.folder_name + '/config.yaml', 'w') as y:
            yaml.safe_dump(params, y)  # saving params for reference
        expt.do_epochs(number_of_epochs=params['num_epochs'], is_learning=params['is_learning'],
                       steps_per_epoch=params['steps_per_epoch'], is_testing=params['is_testing'],
                       steps_per_test=params['steps_per_test'],
                       passes_on_dataset=params['passes_on_dataset'], exp_id=ex)