def test_empty_test(self):
    """Split a two-transition dataset in half and check both halves.

    Each side of the ``test_size=0.5`` split is read at index 0. Which source
    transition ended up on which side is not assumed: it is identified from
    the action of the training sample.
    """
    dataset = Dataset_Counts(state_shape=[2], nb_actions=3, count_param=0.2)
    for k in (0, 1):
        dataset.add(*transitions[k])

    train_split, test_split = dataset.train_validation_split(test_size=0.5)
    s_train, a_train, p_train, r_train, _, t_train, _, _, _ = \
        train_split._get_transition(0)
    s_test, a_test, p_test, r_test, _, t_test, _, _, _ = \
        test_split._get_transition(0)

    # Match each half of the split with the transition it should contain.
    train_idx = 0 if a_train == transitions[0].a else 1
    trans_train = transitions[train_idx]
    trans_test = transitions[1 - train_idx]

    observed = (
        (s_train, a_train, p_train, r_train, t_train, trans_train),
        (s_test, a_test, p_test, r_test, t_test, trans_test),
    )
    for s, a, p, r, t, expected in observed:
        assert_sequence_almost_equal(self, s, expected.s)
        self.assertEqual(a, expected.a)
        assert_sequence_almost_equal(self, p, expected.p)
        self.assertAlmostEqual(r, expected.r)
        self.assertEqual(t, expected.t)
def setUp(self):
    """Fill a fresh dataset with 100 transitions drawn at random (with
    replacement) from the module-level ``transitions``."""
    self.dataset_size = 100
    self.dataset = Dataset_Counts(state_shape=[2],
                                  nb_actions=3,
                                  count_param=0.2)
    picks = np.random.randint(0, len(transitions), self.dataset_size)
    for idx in picks:
        self.dataset.add(*transitions[idx])
def compute_counts(dataset, overwrite=False, count_param=0.2):
    """
    Build a counts-augmented version of ``dataset`` and save it to disk.

    The output path is ``COUNTS_SUFFIX`` joined onto ``dataset.path`` and
    ``dataset.dataset_folder``.

    Args:
        dataset: the dataset instance for which to compute counts; its
            ``path``, ``dataset_folder`` and ``states`` attributes are read.
        overwrite: when a file already exists at the output path, remove it
            and recompute if True; otherwise return without doing anything.
        count_param: forwarded unchanged to ``Dataset_Counts.from_dataset``.

    Returns:
        None. The result is written with ``save_dataset``.
    """
    full_path = os.path.join(dataset.path, dataset.dataset_folder,
                             COUNTS_SUFFIX)

    if os.path.isfile(full_path):
        if not overwrite:
            print("Found existing counts file. Aborting.", flush=True)
            return
        print("Found existing counts file. Overwriting.", flush=True)
        os.remove(full_path)

    started = time.time()
    nb_transitions = len(dataset.states)
    print("Computing counts. The dataset contains {} transitions.".format(
        nb_transitions), flush=True)
    counted = Dataset_Counts.from_dataset(dataset, count_param)

    print("Saving data with counts to {}".format(full_path), flush=True)
    counted.save_dataset(full_path)
    print("Data with counts saved, {} samples".format(counted.size),
          flush=True)

    elapsed = time.time() - started
    print("Counts computed in " + str(elapsed) + " seconds", flush=True)
class TestDatasetCounts(TestCase):
    """Checks on a ``Dataset_Counts`` filled with every entry of the
    module-level ``transitions``."""

    def setUp(self):
        """Create the dataset and add all transitions in order."""
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)
        for t in transitions:
            self.dataset.add(*t)

    def test_transition_to_same_state(self):
        """Transition 0: the next-state fields must come from transition 1."""
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(0)
        self.assertSequenceAlmostEqual(s, transitions[0].s)
        self.assertEqual(a, transitions[0].a)
        self.assertAlmostEqual(r, transitions[0].r)
        self.assertSequenceAlmostEqual(s2, transitions[1].s)
        self.assertEqual(t, transitions[0].t)
        self.assertSequenceAlmostEqual(c, self.dataset.c[1])
        self.assertSequenceAlmostEqual(p, transitions[1].p)
        self.assertAlmostEqual(c1, 1.75)

    def test_transition_to_different_state(self):
        """Transition 1: the next-state fields must come from transition 2."""
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(1)
        self.assertSequenceAlmostEqual(s, transitions[1].s)
        self.assertEqual(a, transitions[1].a)
        self.assertAlmostEqual(r, transitions[1].r)
        self.assertSequenceAlmostEqual(s2, transitions[2].s)
        self.assertEqual(t, transitions[1].t)
        self.assertSequenceAlmostEqual(c, self.dataset.c[2])
        self.assertSequenceAlmostEqual(c, [1, 0, 0])
        self.assertSequenceAlmostEqual(p, transitions[2].p)
        self.assertAlmostEqual(c1, 1)

    def test_terminal_transition(self):
        """Transition 3 is terminal: next state, counts and policy are zero."""
        s, a, _, r, s2, t, c, p, c1 = self.dataset._get_transition(3)
        self.assertSequenceAlmostEqual(s, transitions[3].s)
        self.assertEqual(a, transitions[3].a)
        self.assertAlmostEqual(r, transitions[3].r)
        self.assertSequenceAlmostEqual(s2, [0, 0])
        self.assertEqual(t, True)
        self.assertSequenceAlmostEqual(c, [0, 0, 0])
        self.assertSequenceAlmostEqual(p, [0, 0, 0])
        self.assertAlmostEqual(c1, 1.75)

    def test_save_and_load(self):
        """Save the dataset, reload it and compare the stored arrays.

        The temporary file is removed in a ``finally`` clause so that a
        failing assertion does not leave ``tmp.pickle`` behind in the working
        directory.
        """
        import os

        path = 'tmp.pickle'
        try:
            self.dataset.save_dataset(path)
            new_dataset = Dataset_Counts.load_dataset(path)
            size = self.dataset.size
            self.assertSequenceAlmostEqual(self.dataset.a[0:size],
                                           new_dataset.a[0:size])
            self.assertSequenceAlmostEqual(self.dataset.t[0:size],
                                           new_dataset.t[0:size])
            self.assertSequenceAlmostEqual(self.dataset.r[0:size],
                                           new_dataset.r[0:size])
            for i in range(size):
                self.assertSequenceAlmostEqual(self.dataset.s[i],
                                               new_dataset.s[i])
                self.assertSequenceAlmostEqual(self.dataset.c[i],
                                               new_dataset.c[i])
            # The reloaded dataset must still accept new transitions.
            new_dataset.add(*transitions[0])
        finally:
            # Clean up whether or not the checks above passed.
            if os.path.exists(path):
                os.remove(path)

    def assertSequenceAlmostEqual(self, it1, it2):
        """Method-style wrapper around ``assert_sequence_almost_equal``."""
        assert_sequence_almost_equal(self, it1, it2)
def test_save_and_load(self):
    """Save the dataset, reload it and compare the stored arrays.

    The temporary file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``tmp.pickle`` behind in the working directory.
    """
    import os

    path = 'tmp.pickle'
    try:
        self.dataset.save_dataset(path)
        new_dataset = Dataset_Counts.load_dataset(path)
        size = self.dataset.size
        self.assertSequenceAlmostEqual(self.dataset.a[0:size],
                                       new_dataset.a[0:size])
        self.assertSequenceAlmostEqual(self.dataset.t[0:size],
                                       new_dataset.t[0:size])
        self.assertSequenceAlmostEqual(self.dataset.r[0:size],
                                       new_dataset.r[0:size])
        for i in range(size):
            self.assertSequenceAlmostEqual(self.dataset.s[i],
                                           new_dataset.s[i])
            self.assertSequenceAlmostEqual(self.dataset.c[i],
                                           new_dataset.c[i])
        # The reloaded dataset must still accept new transitions.
        new_dataset.add(*transitions[0])
    finally:
        # Clean up whether or not the checks above passed.
        if os.path.exists(path):
            os.remove(path)
class TestSplittingDataset(TestCase):
    """Checks on ``train_validation_split`` for a 100-transition dataset."""

    def setUp(self):
        """Fill the dataset with 100 randomly chosen transitions."""
        self.dataset_size = 100
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)
        picks = np.random.randint(0, len(transitions), self.dataset_size)
        for idx in picks:
            self.dataset.add(*transitions[idx])

    def test_empty_test(self):
        """``test_size=0`` puts every sample in the training part."""
        train_part, test_part = self.dataset.train_validation_split(
            test_size=0)
        self.assertEqual(test_part.size, 0)
        self.assertEqual(train_part.size, self.dataset_size)

    def test_default(self):
        """Without arguments the split is 80 train / 20 test samples."""
        train_part, test_part = self.dataset.train_validation_split()
        self.assertEqual(test_part.size, 20)
        self.assertEqual(train_part.size, 80)

    def test_empty_train(self):
        """A test size of 1 puts every sample in the test part."""
        train_part, test_part = self.dataset.train_validation_split(1)
        self.assertEqual(test_part.size, self.dataset_size)
        self.assertEqual(train_part.size, 0)

    def test_original_data_set_does_not_change(self):
        """A randomly chosen transition reads the same before and after a
        split with a random test size."""
        random_ind = np.random.randint(0, self.dataset_size)
        s, a, p1, r, s2, t, c, p2, c1 = \
            self.dataset._get_transition(random_ind)

        _, _ = self.dataset.train_validation_split(np.random.rand())

        new_s, new_a, new_p1, new_r, new_s2, new_t, new_c, new_p2, new_c1 = \
            self.dataset._get_transition(random_ind)

        def same_sequence(fresh, old):
            assert_sequence_almost_equal(self, fresh, old)

        # One (checker, value after split, value before split) row per field.
        checks = (
            (same_sequence, new_s, s),
            (self.assertEqual, new_a, a),
            (same_sequence, new_p1, p1),
            (self.assertAlmostEqual, new_r, r),
            (same_sequence, new_s2, s2),
            (self.assertEqual, new_t, t),
            (same_sequence, new_c, c),
            (same_sequence, new_p2, p2),
            (self.assertAlmostEqual, new_c1, c1),
        )
        for check, fresh, old in checks:
            check(fresh, old)
class TestDatasetCountsAdd(TestCase):
    """Checks how the count table ``c`` evolves as transitions are added."""

    def setUp(self):
        """Start every test from an empty dataset."""
        self.dataset = Dataset_Counts(state_shape=[2],
                                      nb_actions=3,
                                      count_param=0.2)

    def _check_counts(self, expected):
        """Assert ``self.dataset.c[row][action]`` for each listed entry.

        ``expected`` is a sequence of ``((row, action), value)`` pairs that
        are checked in the order given.
        """
        for (row, action), value in expected:
            self.assertAlmostEqual(self.dataset.c[row][action], value)

    def test_populate(self):
        """Add four transitions one by one and check sizes and counts."""
        self.assertEqual(self.dataset.size, 0)

        self.dataset.add(*transitions[0])
        self.assertEqual(self.dataset.size, 1)
        self._check_counts([((0, 1), 1)])

        self.dataset.add(s=[2, 2], a=2, r=0, t=False, p=[.2, .8, 0])
        self.assertEqual(self.dataset.size, 2)
        self._check_counts([
            ((0, 1), 1),
            ((0, 2), 1),
            ((1, 1), 1),
            ((1, 2), 1),
        ])

        self.dataset.add(s=[10, 10], a=0, r=0, t=False, p=[.2, .8, 0])
        self._check_counts([
            ((0, 0), 0),
            ((0, 1), 1),
            ((0, 2), 1),
            ((1, 1), 1),
            ((1, 2), 1),
            ((2, 0), 1),
        ])

        self.dataset.add(s=[2.03, 2.04], a=1, r=0, t=True, p=[.2, .8, 0])
        self._check_counts([
            ((0, 0), 0),
            ((0, 1), 1.75),
            ((0, 2), 1),
            ((1, 0), 0),
            ((1, 1), 1.75),
            ((1, 2), 1),
            ((3, 0), 0),
            ((3, 1), 1.75),
            ((3, 2), 0.75),
        ])
def compute_counts(dataset, overwrite=False, param=0.2):
    """
    Compute similarity-based pseudo-counts for a dataset and pickle them.

    The output path is ``COUNTS_SUFFIX`` joined onto ``dataset.path`` and
    ``dataset.dataset_folder``. All but the last entry of ``dataset.states``
    are processed.

    Args:
        dataset: the dataset instance for which to compute counts.
        overwrite: when a file already exists at the output path, remove it
            and recompute if True; otherwise return without doing anything.
        param: forwarded to ``Dataset_Counts.similarite`` together with the
            values returned by ``dataset.counts_weights()``.

    Returns:
        None. A dict with the keys 's', 's2', 'a', 'r', 't', 'c', 'c1', 'p'
        and 'q' is written with ``pickle.dump``.
    """
    full_path = os.path.join(dataset.path, dataset.dataset_folder,
                             COUNTS_SUFFIX)

    if os.path.isfile(full_path):
        if not overwrite:
            print("Found existing counts file. Aborting.", flush=True)
            return
        print("Found existing counts file. Overwriting.", flush=True)
        os.remove(full_path)

    t = time.time()
    print("Computing counts. The dataset contains {} transitions.".format(
        len(dataset.states)), flush=True)

    nb_samples = len(dataset.states) - 1
    state_dims = list(dataset.state_shape)
    nb_actions = dataset.nb_actions
    data = {
        's': np.zeros([nb_samples] + state_dims, dtype='float32'),
        's2': np.zeros([nb_samples] + state_dims, dtype='float32'),
        'a': np.zeros(nb_samples, dtype='int32'),
        'r': np.zeros(nb_samples, dtype='float32'),
        't': np.zeros(nb_samples, dtype='bool'),
        'c': np.zeros((nb_samples, nb_actions), dtype='float32'),
        'c1': np.zeros(nb_samples, dtype='float32'),
        'p': np.zeros((nb_samples, nb_actions), dtype='float32'),
        'q': np.zeros((nb_samples, nb_actions), dtype='float32'),
    }

    mean, std = dataset.counts_weights()
    for i in range(nb_samples):
        if i % 1000 == 999:
            print('{} samples processed'.format(i))

        data['s'][i] = dataset.states[i]
        data['a'][i] = dataset.actions[i]
        data['r'][i] = dataset.rewards[i]

        # c1: similarity mass of (state i, action i) over samples that share
        # the same action.
        for j in range(nb_samples):
            if dataset.actions[i] == dataset.actions[j]:
                weight = Dataset_Counts.similarite(dataset.states[i],
                                                   dataset.states[j],
                                                   param, mean, std)
                data['c1'][i] += weight

        if dataset.terms[i]:
            # Terminal sample: next-state fields keep their zero defaults.
            data['t'][i] = True
            continue

        data['s2'][i] = dataset.states[i + 1]
        data['p'][i] = dataset.policy[i + 1]
        data['q'][i] = dataset.qfunction[i + 1]
        # c: per-action similarity mass of the next state over all samples.
        for j in range(nb_samples):
            weight = Dataset_Counts.similarite(dataset.states[i + 1],
                                               dataset.states[j],
                                               param, mean, std)
            data['c'][i, dataset.actions[j]] += weight

    print("Saving data with counts to {}".format(full_path), flush=True)
    with open(full_path, "wb") as f:
        pickle.dump(data, f)
    print("Data with counts saved, {} samples".format(len(data['s'])),
          flush=True)
    print("Counts computed in " + str(time.time() - t) + " seconds",
          flush=True)
def run(domain, config, options):
    """Load a YAML configuration and run the configured experiments.

    Args:
        domain: passed to ``environment.Environment``; also used to derive
            the default config name ``'config_' + domain``.
        config: base name (without ``.yaml``) of a config file located next
            to this source file. A falsy value selects the default name.
        options: iterable of ``(key, value)`` pairs overriding entries of the
            loaded configuration. Each key must already exist in the config;
            the value is converted to the type of the existing entry
            (booleans are True only for the exact string ``'True'``).

    Raises:
        AssertionError: if an override key is not present in the config.
        ValueError: in batch mode, if the dataset file does not exist.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    if not config:
        config = 'config_' + domain
    cfg_file = os.path.join(dir_path, config + '.yaml')
    # Use a context manager so the config file handle is always closed.
    with open(cfg_file, 'r') as cfg:
        params = yaml.safe_load(cfg)

    # replacing params with command line options
    for opt in options:
        assert opt[0] in params
        dtype = type(params[opt[0]])
        if dtype is bool:
            # bool('False') would be True, so compare against the literal.
            new_opt = opt[1] == 'True'
        else:
            new_opt = dtype(opt[1])
        params[opt[0]] = new_opt

    print('\n')
    print('Parameters ')
    for key in params:
        print(key, params[key])
    print('\n')

    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    random_state = np.random.RandomState(params['seed'])
    device = torch.device(params["device"])

    DATA_DIR = os.path.join(params['folder_location'], params['folder_name'])

    env = environment.Environment(domain, params, random_state)

    if params['batch']:
        from baseline import Baseline
        baseline_path = os.path.join(DATA_DIR, params['baseline_path'])
        baseline = Baseline(baseline_path,
                            params['network_size'],
                            state_shape=params['state_shape'],
                            nb_actions=params['nb_actions'],
                            seed=params['seed'],
                            temperature=params['baseline_temp'],
                            device=params['device'],
                            normalize=params['normalize'])

        dataset_path = os.path.join(DATA_DIR, params['dataset_path'])
        print("\nLoading dataset from file {}".format(dataset_path),
              flush=True)
        if not os.path.exists(dataset_path):
            raise ValueError("The dataset file does not exist")
        # NOTE: pickle.load can execute arbitrary code; only load dataset
        # files that come from a trusted source.
        with open(dataset_path, "rb") as f:
            data = pickle.load(f)
        dataset = Dataset_Counts(data, params['count_param'])
        print("Data with counts loaded: {} samples".format(len(data['s'])),
              flush=True)
        folder_name = os.path.dirname(dataset_path)
        expt = BatchExperiment(
            dataset=dataset,
            env=env,
            folder_name=folder_name,
            episode_max_len=params['episode_max_len'],
            minimum_count=params['minimum_count'],
            extra_stochasticity=params['extra_stochasticity'],
            history_len=params['history_len'],
            max_start_nullops=params['max_start_nullops'])
    else:
        # Create experiment folder
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)
        baseline = None
        expt = DQNExperiment(
            env=env,
            ai=None,
            episode_max_len=params['episode_max_len'],
            annealing=params['annealing'],
            history_len=params['history_len'],
            max_start_nullops=params['max_start_nullops'],
            replay_min_size=params['replay_min_size'],
            test_epsilon=params['test_epsilon'],
            folder_name=DATA_DIR,
            network_path=params['network_path'],
            extra_stochasticity=params['extra_stochasticity'],
            score_window_size=100)

    for ex in range(params['num_experiments']):
        print('\n')
        print('>>>>> Experiment ', ex, ' >>>>> ', params['learning_type'],
              ' >>>>> Epsilon >>>>> ', params['epsilon_soft'],
              ' >>>>> Minimum Count >>>>> ', params['minimum_count'],
              ' >>>>> Kappa >>>>> ', params['kappa'], ' >>>>> ', flush=True)
        print('\n')
        # A new agent is built for every experiment; the experiment object
        # and the environment are reused.
        ai = AI(baseline,
                state_shape=env.state_shape,
                nb_actions=env.nb_actions,
                action_dim=params['action_dim'],
                reward_dim=params['reward_dim'],
                history_len=params['history_len'],
                gamma=params['gamma'],
                learning_rate=params['learning_rate'],
                epsilon=params['epsilon'],
                final_epsilon=params['final_epsilon'],
                test_epsilon=params['test_epsilon'],
                annealing_steps=params['annealing_steps'],
                minibatch_size=params['minibatch_size'],
                replay_max_size=params['replay_max_size'],
                update_freq=params['update_freq'],
                learning_frequency=params['learning_frequency'],
                ddqn=params['ddqn'],
                learning_type=params['learning_type'],
                network_size=params['network_size'],
                normalize=params['normalize'],
                device=device,
                kappa=params['kappa'],
                minimum_count=params['minimum_count'],
                epsilon_soft=params['epsilon_soft'])
        expt.ai = ai

        env.reset()
        with open(expt.folder_name + '/config.yaml', 'w') as y:
            yaml.safe_dump(params, y)  # saving params for reference

        expt.do_epochs(number_of_epochs=params['num_epochs'],
                       is_learning=params['is_learning'],
                       steps_per_epoch=params['steps_per_epoch'],
                       is_testing=params['is_testing'],
                       steps_per_test=params['steps_per_test'],
                       passes_on_dataset=params['passes_on_dataset'],
                       exp_id=ex)
def __init__(self, training_steps, validation_steps, validation_size,
             mini_batch_size, learning_rate, number_of_epochs, network_size,
             folder_location, dataset_file, cloned_network_path,
             sample_from_env, entropy_coefficient, device, seed,
             experiment_name, config_file, update_learning_rate):
    """Set up data, models, optimizer and environment for policy cloning.

    Args:
        training_steps: steps per training epoch; 0 means
            ``dataset_train.size / mini_batch_size`` (truncated).
        validation_steps: steps per validation pass; 0 means
            ``dataset_validation.size / mini_batch_size`` (truncated).
        validation_size: ``test_size`` given to ``train_validation_split``.
        mini_batch_size: number of samples per mini-batch.
        learning_rate: learning rate of the SGD optimizer.
        number_of_epochs: stored on the instance.
        network_size: network size given to both ``ClonedBaseline`` models.
        folder_location: directory joined with ``params['network_path']`` to
            locate the baseline weights when ``sample_from_env`` is true.
        dataset_file: path of the dataset to load; its directory is also used
            as the output folder.
        cloned_network_path: file name joined onto the dataset directory.
        sample_from_env: when true, a ``Baseline`` is created so that samples
            can be drawn from the environment.
        entropy_coefficient: stored on the instance.
        device: device given to the models.
        seed: seed for numpy, torch and the models.
        experiment_name: suffix of the ``./logs/`` directory for the writer.
        config_file: YAML file providing ``state_shape``, ``nb_actions``,
            ``domain`` and the baseline settings.
        update_learning_rate: stored on the instance.

    Raises:
        FileNotFoundError: if ``config_file`` does not exist.
    """
    self.sample_from_env = sample_from_env
    self.smaller_validation_loss = None
    self.seed = seed
    try:
        # Context manager so the config file handle is always closed.
        with open(config_file, 'r') as cfg:
            self.params = yaml.safe_load(cfg)
    except FileNotFoundError:
        print(
            "Configuration file not found; Define a config_file to be able to sample from environment"
        )
        raise

    # initialize seeds for reproducibility
    np.random.seed(seed)
    torch.manual_seed(seed)

    # set paths for data and output path
    log_path = os.path.join('./logs/' + experiment_name)
    data_dir = folder_location
    dataset_path = dataset_file
    self.output_folder = os.path.dirname(dataset_path)
    self.cloned_network_path = os.path.join(os.path.dirname(dataset_path),
                                            cloned_network_path)

    # start
    self.logger = SummaryWriter(log_path)

    # import data
    full_dataset = Dataset_Counts.load_dataset(dataset_path)
    self.dataset_train, self.dataset_validation = \
        full_dataset.train_validation_split(test_size=validation_size)

    # set training parameters
    self.mini_batch_size = mini_batch_size
    self.number_of_epochs = number_of_epochs
    self.network_size = network_size
    self.entropy_coefficient = entropy_coefficient
    self.device = device
    self.learning_rate = learning_rate
    self.update_learning_rate = update_learning_rate
    if training_steps != 0:
        self.training_steps = training_steps
    else:
        self.training_steps = int(self.dataset_train.size /
                                  self.mini_batch_size)
    if validation_steps != 0:
        self.validation_steps = validation_steps
    else:
        self.validation_steps = int(self.dataset_validation.size /
                                    self.mini_batch_size)
    # Clamp to at least 1: with fewer than 10 training steps the plain
    # division gives 0, and ``step % self.log_frequency`` in ``train`` would
    # raise ZeroDivisionError.
    self.log_frequency = max(1, int(self.training_steps / 10))
    print(
        "Training with {} training steps and {} validation steps ".format(
            self.training_steps, self.validation_steps))

    # create model
    self.cloned_baseline_policy = ClonedBaseline(
        network_size=network_size,
        network_path=None,
        state_shape=self.params['state_shape'],
        nb_actions=self.params['nb_actions'],
        device=device,
        seed=seed,
        temperature=0)
    self.best_policy = ClonedBaseline(
        network_size=network_size,
        network_path=None,
        state_shape=self.params['state_shape'],
        nb_actions=self.params['nb_actions'],
        device=device,
        seed=seed,
        temperature=0,
        results_folder=self.output_folder)
    # NOTE(review): this copies best_policy's own weights into itself;
    # presumably cloned_baseline_policy was intended as the source. Left
    # unchanged -- confirm before altering.
    self.best_policy._copy_weight_from(
        self.best_policy.network.state_dict())

    # define loss and optimizer
    self.nll_loss_function = nn.NLLLoss()
    self.optimizer = torch.optim.SGD(
        self.cloned_baseline_policy.network.parameters(), lr=learning_rate)
    # optimizer = torch.optim.RMSprop(network.parameters(), lr=learning_rate, alpha=0.95, eps=1e-07)

    # instantiate environment for policy evaluation
    self.env = environment.Environment(self.params['domain'], self.params)

    if sample_from_env:
        print("sampling from environment")
        baseline_network_path = os.path.join(data_dir,
                                             self.params["network_path"])
        self.baseline = Baseline(self.params['network_size'],
                                 network_path=baseline_network_path,
                                 state_shape=self.params['state_shape'],
                                 nb_actions=self.params['nb_actions'],
                                 device=device,
                                 seed=seed,
                                 temperature=self.params.get(
                                     "baseline_temp", 0.1),
                                 normalize=self.params['normalize'])
    else:
        self.baseline = None
def train(self, current_epoch=0):
    """Run ``self.training_steps`` optimisation steps on the cloned policy.

    Each step draws a mini-batch (from ``self.dataset_train``, or from the
    environment when ``self.sample_from_env`` is true), computes the negative
    log-likelihood of the recorded actions minus an entropy bonus, and
    applies one optimizer update. Statistics are logged through
    ``self.test_and_log_stats`` every ``self.log_frequency`` steps.

    Args:
        current_epoch: index of the current epoch; only used to compute the
            global step number passed to the logger.
    """
    # A log frequency of 0 (e.g. derived from fewer than 10 training steps)
    # would make the modulo below raise ZeroDivisionError; log every step in
    # that case instead.
    log_every = max(1, self.log_frequency)

    for step in range(self.training_steps):
        # clear gradients
        self.optimizer.zero_grad()

        # sample mini_batch
        if not self.sample_from_env:
            s, a, behavior_policy, _, _, _, _, _, _ = \
                self.dataset_train.sample(
                    mini_batch_size=self.mini_batch_size, full_batch=True)
        else:
            # sanity check: train on new samples instead of fixed dataset
            mini_batch = Dataset_Counts(
                state_shape=self.params['state_shape'],
                nb_actions=self.params['nb_actions'],
                count_param=0.2)
            while mini_batch.size < self.mini_batch_size:
                state = self.env.reset()
                action, _, policy, _ = self.baseline.inference(state)
                _, new_reward, term, _ = self.env.step(action)
                mini_batch.add(s=state.astype('float32'),
                               a=action,
                               r=new_reward,
                               t=term,
                               p=policy)
            s, a, behavior_policy, _, _, _, _, _, _ = \
                mini_batch.get_all_data()

        # prepare tensors
        batch_states = torch.FloatTensor(s).to(self.device)
        # NOTE(review): squeeze drops every size-1 dimension, including the
        # batch dimension for a mini-batch of one -- confirm this is intended.
        batch_states = torch.squeeze(batch_states)
        # NLLLoss gets the indexes of the correct class as input
        target = torch.LongTensor(a).to(self.device)

        # get predictions
        cloned_policy_on_s = self.cloned_baseline_policy.policy(batch_states)

        # computing losses
        # negative loglikelihood
        nll_loss = self.nll_loss_function(torch.log(cloned_policy_on_s),
                                          target)
        # policy entropy
        cloned_policy_entropy = torch.mean(
            distributions.Categorical(cloned_policy_on_s).entropy())
        # regularize entropy
        entropy_bonus = self.entropy_coefficient * cloned_policy_entropy
        total_loss = nll_loss - entropy_bonus

        if step % log_every == 0:
            total_steps = current_epoch * self.training_steps + step
            self.test_and_log_stats(a, behavior_policy, entropy_bonus,
                                    cloned_policy_on_s,
                                    cloned_policy_entropy, nll_loss,
                                    total_loss, total_steps)

        # update weights
        total_loss.backward()
        self.optimizer.step()
def run(config_file, options):
    """Load a YAML configuration and run the configured experiments.

    Args:
        config_file: path of the YAML configuration file.
        options: iterable of ``(key, value)`` pairs overriding entries of the
            loaded configuration. Each key must already exist in the config;
            the value is converted to the type of the existing entry
            (booleans are True only for the exact string ``'True'``).

    Raises:
        FileNotFoundError: if ``config_file`` does not exist.
        AssertionError: if an override key is not present in the config.
        ValueError: in batch mode, if the dataset file does not exist.
    """
    try:
        # Context manager so the config file handle is always closed.
        with open(config_file, 'r') as cfg:
            params = yaml.safe_load(cfg)
    except FileNotFoundError:
        print("Configuration file not found")
        raise

    # replacing params with command line options
    for opt in options:
        assert opt[0] in params
        dtype = type(params[opt[0]])
        if dtype is bool:
            # bool('False') would be True, so compare against the literal.
            new_opt = opt[1] == 'True'
        else:
            new_opt = dtype(opt[1])
        params[opt[0]] = new_opt

    print('\n')
    print('Parameters ')
    for key in params:
        print(key, params[key])
    print('\n')

    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    random_state = np.random.RandomState(params['seed'])
    device = torch.device(params["device"])

    DATA_DIR = os.path.join(params['folder_location'], params['folder_name'])

    env = environment.Environment(params["domain"], params, random_state)

    if params['batch']:
        dataset_path = params['dataset_path']
        print("\nLoading dataset from file {}".format(dataset_path),
              flush=True)
        if not os.path.exists(dataset_path):
            raise ValueError("The dataset file does not exist")
        dataset = Dataset_Counts.load_dataset(dataset_path)

        # The baseline implementation depends on the learning type.
        baseline_path = os.path.join(DATA_DIR, params['baseline_path'])
        if 'behavior_cloning' in params['learning_type']:
            # Cloned weights are read from the dataset's own folder.
            baseline_path = os.path.join(os.path.dirname(dataset_path),
                                         'cloned_network_weights.pt')
            baseline = ClonedBaseline(
                params['network_size'],
                network_path=baseline_path,
                state_shape=params['state_shape'],
                nb_actions=params['nb_actions'],
                device=device,
                seed=params['seed'],
                temperature=params['baseline_temp'],
                normalize=params['normalize'])
        elif params['learning_type'] in ['pi_b', 'soft_sort']:
            baseline = Baseline(params['network_size'],
                                network_path=baseline_path,
                                state_shape=params['state_shape'],
                                nb_actions=params['nb_actions'],
                                device=device,
                                seed=params['seed'],
                                temperature=params['baseline_temp'],
                                normalize=params['normalize'])
        elif 'count_based' in params['learning_type']:
            baseline = SimilarityBaseline(
                dataset=dataset,
                seed=params['seed'],
                nb_actions=params['nb_actions'],
                results_folder=os.path.dirname(dataset_path))
            baseline.evaluate_baseline(env,
                                       number_of_steps=100000,
                                       number_of_epochs=1,
                                       verbose=True,
                                       save_results=True)
        else:
            # no baseline, should use counters to estimate policy
            baseline = None

        folder_name = os.path.dirname(dataset_path)
        print("Data with counts loaded: {} samples".format(dataset.size),
              flush=True)
        expt = BatchExperiment(
            dataset=dataset,
            env=env,
            folder_name=folder_name,
            episode_max_len=params['episode_max_len'],
            minimum_count=params['minimum_count'],
            extra_stochasticity=params['extra_stochasticity'],
            history_len=params['history_len'],
            max_start_nullops=params['max_start_nullops'],
            keep_all_logs=False)
    else:
        # Create experiment folder
        if not os.path.exists(DATA_DIR):
            os.makedirs(DATA_DIR)
        folder_name = DATA_DIR
        baseline = None
        expt = DQNExperiment(
            env=env,
            ai=None,
            episode_max_len=params['episode_max_len'],
            annealing=params['annealing'],
            history_len=params['history_len'],
            max_start_nullops=params['max_start_nullops'],
            replay_min_size=params['replay_min_size'],
            test_epsilon=params['test_epsilon'],
            folder_name=folder_name,
            network_path=params['network_path'],
            extra_stochasticity=params['extra_stochasticity'],
            score_window_size=100,
            keep_all_logs=False)

    for ex in range(params['num_experiments']):
        print('\n')
        print('>>>>> Experiment ', ex, ' >>>>> ', params['learning_type'],
              ' >>>>> Epsilon >>>>> ', params['epsilon_soft'],
              ' >>>>> Minimum Count >>>>> ', params['minimum_count'],
              ' >>>>> Kappa >>>>> ', params['kappa'], ' >>>>> ', flush=True)
        print('\n')
        print("\nPROGRESS: {0:02.2f}%\n".format(
            ex / params['num_experiments'] * 100), flush=True)
        # A new agent is built for every experiment; the experiment object
        # and the environment are reused.
        ai = AI(baseline,
                state_shape=env.state_shape,
                nb_actions=env.nb_actions,
                action_dim=params['action_dim'],
                reward_dim=params['reward_dim'],
                history_len=params['history_len'],
                gamma=params['gamma'],
                learning_rate=params['learning_rate'],
                epsilon=params['epsilon'],
                final_epsilon=params['final_epsilon'],
                test_epsilon=params['test_epsilon'],
                annealing_steps=params['annealing_steps'],
                minibatch_size=params['minibatch_size'],
                replay_max_size=params['replay_max_size'],
                update_freq=params['update_freq'],
                learning_frequency=params['learning_frequency'],
                ddqn=params['ddqn'],
                learning_type=params['learning_type'],
                network_size=params['network_size'],
                normalize=params['normalize'],
                device=device,
                kappa=params['kappa'],
                minimum_count=params['minimum_count'],
                epsilon_soft=params['epsilon_soft'])
        expt.ai = ai
        if not params['batch']:
            # resets dataset for online experiment
            expt.dataset_counter = Dataset_Counts(
                count_param=params['count_param'],
                state_shape=env.state_shape,
                nb_actions=env.nb_actions,
                replay_max_size=params['replay_max_size'],
                is_counting=ai.needs_state_action_counter())

        env.reset()
        with open(expt.folder_name + '/config.yaml', 'w') as y:
            yaml.safe_dump(params, y)  # saving params for reference

        expt.do_epochs(number_of_epochs=params['num_epochs'],
                       is_learning=params['is_learning'],
                       steps_per_epoch=params['steps_per_epoch'],
                       is_testing=params['is_testing'],
                       steps_per_test=params['steps_per_test'],
                       passes_on_dataset=params['passes_on_dataset'],
                       exp_id=ex)
def setUp(self):
    """Create a dataset holding every module-level transition, in order."""
    dataset = Dataset_Counts(state_shape=[2], nb_actions=3, count_param=0.2)
    self.dataset = dataset
    for transition in transitions:
        dataset.add(*transition)
def setUp(self):
    """Start every test from an empty ``Dataset_Counts``."""
    settings = {'state_shape': [2], 'nb_actions': 3, 'count_param': 0.2}
    self.dataset = Dataset_Counts(**settings)