def test_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_ablation = config['ablation']
    use_pi_e = config_ablation['use_pi_e']

    phase = config_main['phase']
    assert (phase == 'validation' or phase == 'test')

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("test_ablation.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_domain['N_test_instances']
    N_episodes = config_domain['N_test_episodes']
    test_steps = config_domain['test_steps']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain, phase)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    assert (n_probe_steps < test_steps)
    if use_pi_e:
        pi_e = probe.Probe(config_probe, n_state, n_action)
    else:
        # initial z
        z_avg = pickle.load(open('../results/%s/z_avg.p' % dir_name, 'rb'))

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state, n_action, n_probe_steps, seed=seed, **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], config_VAE['n_latent'])

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    reward_total = 0
    cumulative_reward = np.zeros((test_steps, N_instances))
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)

        # N_episodes should be 1, but we let it be flexible in case needed
        for idx_episode in range(1, N_episodes + 1):

            reward_episode = 0
            collected_probe_traj = False
            while not collected_probe_traj:
                # list of (state, action) pairs
                traj_probe = []
                state = hpmdp.reset()
                episode_step = 0
                done = False
                probe_finished_early = False
                # Generate probe trajectory
                for step in range(1, n_probe_steps + 1):
                    if use_pi_e:
                        action = pi_e.run_actor(state, sess)
                    else:
                        action = pi_c.run_actor(state, z_avg, sess, epsilon=0)
                    # print("Probe step %d action %d" % (step, action))
                    action_1hot = np.zeros(n_action)
                    action_1hot[action] = 1
                    traj_probe.append((state, action_1hot))
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step, idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                    if done and step < n_probe_steps:
                        probe_finished_early = True
                        print("test_ablation.py : done is True while generating probe trajectory")
                        break

                if not probe_finished_early:
                    collected_probe_traj = True

            # Use VAE to estimate hidden parameter
            z = vae.encode(sess, traj_probe)
            print(z)

            if config_DDQN['activate']:
                # Start control policy
                while not done and episode_step < test_steps:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon=0)
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step, idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                print(reward_episode)
                # If episode ended earlier than test_steps, fill in the
                # rest of the cumulative rewards with the last value
                if episode_step < test_steps:
                    remaining = np.ones(test_steps - episode_step) * reward_episode
                    cumulative_reward[episode_step:, idx_instance - 1] = remaining

            reward_total += reward_episode

    header = 'Step'
    for idx in range(1, N_instances + 1):
        header += ',R_%d' % idx
    indices = np.arange(1, test_steps + 1).reshape(test_steps, 1)
    concated = np.concatenate([indices, cumulative_reward], axis=1)
    save_loc = '_'.join(dir_name.split('_')[:-1])
    os.makedirs('../results/%s' % save_loc, exist_ok=True)
    run_number = dir_name.split('_')[-1]
    np.savetxt('../results/%s/test_%s.csv' % (save_loc, run_number), concated, delimiter=',', fmt='%.3e', header=header)

    print("Avg episode reward", reward_total / float(N_instances * N_episodes))
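
# Illustrative sketch only (not part of the original script): one plausible way to
# drive test_function from a top-level config file. The 'config.json' file name
# follows the commented-out config-loading lines in the train_function below; the
# '__main__' guard itself is an assumption about how the script is launched.
if __name__ == '__main__':
    import json
    with open('config.json') as f:
        config = json.load(f)
    test_function(config)
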
def train_function(config, config_suffix=None):

    # with open('config.json') as f:
    #     config = json.load(f)
    config_main = config['main']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_baseline = config['baseline']
    real_z_input = config_baseline['real_z_input']
    if real_z_input:
        # If use real hidden param as input, then of course DDQN must accept z
        assert (config_DDQN['z_input'] == True)

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'lander':
        domain_name = "config_lander.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train_baseline.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    n_hidden = config_domain['n_hidden']  # dimension of real hidden param
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate control policy
    pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], n_hidden)
    epsilon_start = config_DDQN['epsilon_start']
    epsilon_end = config_DDQN['epsilon_end']
    epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_instances * N_episodes))
    # epsilon_decay = config_DDQN['epsilon_decay']
    steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    sess.run(pi_c.list_initialize_target_ops)
    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }
    buf = ExperienceReplay.ExperienceReplay(exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    reward_period = 0
    epsilon = epsilon_start
    control_step = 0
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)
        if real_z_input:
            z = hpmdp.get_real_hidden_param()

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1
            # print("Episode", idx_episode)
            state = hpmdp.reset()
            done = False
            summarized = False
            reward_episode = 0

            # Start control policy
            while not done:
                # Use DDQN with prioritized replay for this
                if real_z_input:
                    action = pi_c.run_actor(state, z, sess, epsilon)
                else:
                    action = pi_c.run_actor(state, None, sess, epsilon)
                state_next, reward, done = hpmdp.step(action)
                control_step += 1
                reward_episode += reward

                if real_z_input:
                    buf.add(np.reshape(np.array([state, action, reward, state_next, done, z]), (1, 6)))
                else:
                    buf.add(np.reshape(np.array([state, action, reward, state_next, done]), (1, 5)))
                state = state_next

                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    batch, IS_weights, indices = buf.sample(control_step)
                    if (total_episodes % period == 0) and not summarized:
                        # Write TF summary at first train step of the last episode of every instance
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize=True, writer=writer)
                        summarized = True
                    else:
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize=False, writer=writer)
                    train_count_control += 1

                    if config_PER['activate']:
                        buf.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))

            reward_period += reward_episode

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Logging
            if total_episodes % period == 0:
                s = "%d,%.2f\n" % (total_episodes, reward_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                reward_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
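
# Sketch (not in the original file): numeric check of the exponential epsilon
# schedule used above. Multiplying epsilon_start by
# exp(log(epsilon_end / epsilon_start) / (N_instances * N_episodes)) once per
# episode lands on epsilon_end after N_instances * N_episodes episodes.
# The example values below are made up for illustration.
import numpy as np

epsilon_start, epsilon_end = 1.0, 0.05
N_instances, N_episodes = 10, 50
epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_instances * N_episodes))

epsilon = epsilon_start
for _ in range(N_instances * N_episodes):
    epsilon *= epsilon_decay
print(epsilon)  # ~0.05 == epsilon_end, up to floating point error
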
def test_function(config, config_suffix=None):

    config_main = config['main']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']

    phase = config_main['phase']
    assert (phase == 'validation' or phase == 'test')

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_domain['N_test_instances']
    N_episodes = config_domain['N_test_episodes']
    test_steps = config_domain['test_steps']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain, phase)

    # Length of trajectory for input to VAE
    n_vae_steps = config_domain['traj_length']
    n_latent = config_VAE['n_latent']
    z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
    with open('../results/%s/std_max.pkl' % dir_name, 'rb') as f:
        std_max = pickle.load(f)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state, n_action, n_vae_steps, seed=seed, **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], config_VAE['n_latent'])

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    reward_total = 0
    cumulative_reward = np.zeros((test_steps, N_instances))
    list_times = []
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)
        t_start = time.time()

        for idx_episode in range(1, N_episodes + 1):

            # rolling window of (state, action) pairs
            traj_for_vae = []
            eta = 1.0  # range [0,1]; 1 means the policy should act to maximize probe reward
            z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
            reward_episode = 0
            state = hpmdp.reset()
            episode_step = 0
            done = False

            while not done and episode_step < test_steps:
                action = pi_c.run_actor(state, z, sess, epsilon=0, eta=eta)
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_for_vae.append((state, action_1hot))
                if len(traj_for_vae) == n_vae_steps + 1:
                    traj_for_vae = traj_for_vae[1:]

                state_next, reward, done = hpmdp.step(action)
                reward_episode += reward
                cumulative_reward[episode_step, idx_instance - 1] = reward_episode

                # Get z_next and eta_next, because they are considered part of the augmented MDP state
                if len(traj_for_vae) == n_vae_steps:
                    std = vae.get_std(sess, traj_for_vae)
                    std = std / std_max  # element-wise normalization, now each element is between [0,1]
                    eta_next = np.sum(std) / n_latent  # scalar between [0,1]
                    eta_next = min(1.0, eta_next)  # in case std_max during training isn't large enough
                    # Use VAE to update hidden parameter
                    z_next = vae.encode(sess, traj_for_vae)
                else:
                    z_next = z
                    eta_next = eta

                state = state_next
                eta = eta_next
                z = z_next
                episode_step += 1

            # If episode ended earlier than test_steps, fill in the
            # rest of the cumulative rewards with the last value
            if episode_step < test_steps:
                remaining = np.ones(test_steps - episode_step) * reward_episode
                cumulative_reward[episode_step:, idx_instance - 1] = remaining

            reward_total += reward_episode

        list_times.append(time.time() - t_start)

    header = 'Step'
    for idx in range(1, N_instances + 1):
        header += ',R_%d' % idx
    indices = np.arange(1, test_steps + 1).reshape(test_steps, 1)
    concated = np.concatenate([indices, cumulative_reward], axis=1)
    save_loc = '_'.join(dir_name.split('_')[:-1])
    os.makedirs('../results/%s' % save_loc, exist_ok=True)
    run_number = dir_name.split('_')[-1]
    np.savetxt('../results/%s/test_%s.csv' % (save_loc, run_number), concated, delimiter=',', fmt='%.3e', header=header)
    with open('../results/%s/test_time_%s.pkl' % (save_loc, run_number), 'wb') as f:
        pickle.dump(list_times, f)

    print("Avg episode reward", reward_total / float(N_instances * N_episodes))
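
# Sketch (not part of the original script): how eta is derived from the
# per-dimension posterior std returned by the VAE, as in the test loop above.
# std_max here is a placeholder array; in the real script it is loaded from
# ../results/<dir_name>/std_max.pkl, and std comes from vae.get_std.
import numpy as np

n_latent = 8
std_max = np.ones(n_latent, dtype=np.float32) * 0.5   # placeholder training-time maxima
std = np.random.uniform(0.0, 0.6, size=n_latent)      # placeholder current posterior stds

std_normalized = std / std_max                # element-wise, roughly in [0, 1]
eta_next = np.sum(std_normalized) / n_latent  # average normalized uncertainty
eta_next = min(1.0, eta_next)                 # clip in case std exceeds the training maximum
print(eta_next)
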
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    autoencoder = config_main['autoencoder']
    if autoencoder == 'VAE':
        config_VAE = config['VAE']
    else:
        raise ValueError("Other autoencoders not supported")
    config_DDQN = config['DDQN']
    config_PER = config['PER']

    phase = config_main['phase']
    assert (phase == 'train')

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'lander':
        domain_name = "config_lander.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    pi_e = probe.Probe(config_probe, n_state, n_action)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    if autoencoder == 'VAE':
        vae = vae_import.VAE(n_state, n_action, n_probe_steps, seed=seed, **config_VAE)
    else:
        raise ValueError('Other autoencoders not supported')

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], config_VAE['n_latent'])
        epsilon_start = config_DDQN['epsilon_start']
        epsilon_end = config_DDQN['epsilon_end']
        epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_instances * N_episodes))
        steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    if config_DDQN['activate']:
        sess.run(pi_c.list_initialize_target_ops)
        epsilon = epsilon_start
    if config_VAE['dual']:
        sess.run(vae.list_equate_dual_ops)
    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }
    buf = ExperienceReplay.ExperienceReplay(exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg,R_p\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    reward_period = 0
    reward_p_period = 0
    list_trajs = []  # circular buffer to store probe trajectories for VAE
    idx_traj = 0  # counter for list_trajs
    control_step = 0
    train_count_probe = 1
    train_count_vae = 1
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1
            # list of (state, action) pairs
            traj_probe = []
            state = hpmdp.reset()
            done = False
            reward_episode = 0

            # Generate probe trajectory
            probe_finished_early = False
            for step in range(1, n_probe_steps + 1):
                action = pi_e.run_actor(state, sess)
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_probe.append((state, action_1hot))
                state_next, reward, done = hpmdp.step(action)
                state = state_next
                reward_episode += reward
                if done and step < n_probe_steps:
                    probe_finished_early = True
                    print("train.py : done is True while generating probe trajectory")
                    break

            if probe_finished_early:
                # Skip over pi_e and VAE training if probe finished early
                continue

            if idx_traj >= len(list_trajs):
                list_trajs.append(traj_probe)
            else:
                list_trajs[idx_traj] = traj_probe
            idx_traj = (idx_traj + 1) % buffer_size_vae

            # Compute probe reward using VAE
            if config_probe['reward'] == 'vae':
                reward_e = vae.compute_lower_bound(traj_probe, sess)
            elif config_probe['reward'] == 'total_variation':
                reward_e = pi_e.compute_reward(traj_probe)
            elif config_probe['reward'] == 'negvae':
                # this reward encourages maximizing entropy
                reward_e = -vae.compute_lower_bound(traj_probe, sess)

            # Write Tensorboard at the final episode of every instance
            if total_episodes % period == 0:
                summarize = True
            else:
                summarize = False

            # Train probe policy
            pi_e.train_step(sess, traj_probe, reward_e, train_count_probe, summarize, writer)
            train_count_probe += 1

            # Train VAE
            if len(list_trajs) >= batch_size_vae:
                vae.train_step(sess, list_trajs, train_count_vae, summarize, writer)
                train_count_vae += 1

            # Use VAE to estimate hidden parameter
            z = vae.encode(sess, traj_probe)

            if config_DDQN['activate']:
                # Start control policy
                summarized = False
                while not done:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon)
                    state_next, reward, done = hpmdp.step(action)
                    control_step += 1
                    reward_episode += reward

                    buf.add(np.reshape(np.array([state, action, reward, state_next, done, z]), (1, 6)))
                    state = state_next

                    if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                        batch, IS_weights, indices = buf.sample(control_step)
                        if not summarized:
                            # Write TF summary at first train step of the last episode of every instance
                            td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize, writer)
                            summarized = True
                        else:
                            td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, False, writer)
                        train_count_control += 1

                        if config_PER['activate']:
                            buf.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))

            reward_period += reward_episode
            reward_p_period += reward_e

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Logging
            if total_episodes % period == 0:
                s = "%d,%.2f,%.2f\n" % (total_episodes, reward_period / float(period), reward_p_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                if config_domain['save_threshold'] and reward_period / float(period) > config_domain['save_threshold']:
                    saver.save(sess, '../results/%s/%s.%d' % (dir_name, model_name, total_episodes))
                reward_period = 0
                reward_p_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
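
# Sketch (not in the original file): the fixed-size circular buffer pattern used
# for list_trajs above -- append until the buffer is full, then overwrite the
# oldest slot via the wrap-around index. Values here are placeholder stand-ins
# for probe trajectories.
buffer_size_vae = 4
list_trajs = []
idx_traj = 0
for traj in ['t0', 't1', 't2', 't3', 't4', 't5']:
    if idx_traj >= len(list_trajs):
        list_trajs.append(traj)
    else:
        list_trajs[idx_traj] = traj
    idx_traj = (idx_traj + 1) % buffer_size_vae
print(list_trajs)  # ['t4', 't5', 't2', 't3'] -- oldest entries overwritten first
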
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_ablation = config['ablation']
    eq_rew = config_ablation['equalize_reward']

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Length of trajectory for input to VAE
    n_vae_steps = config_domain['traj_length']
    n_latent = config_VAE['n_latent']
    z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
    eta = 1.0  # range [0,1]; 1 means the policy should act to maximize probe reward
    std_max = -np.inf * np.ones(config_VAE['n_latent'], dtype=np.float32)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state, n_action, n_vae_steps, seed=seed, **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], config_VAE['n_latent'])
        epsilon_start = config_DDQN['epsilon_start']
        epsilon_end = config_DDQN['epsilon_end']
        epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_episodes * N_instances))
        steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    if config_DDQN['activate']:
        sess.run(pi_c.list_initialize_target_ops)
        epsilon = epsilon_start
    if config_VAE['dual']:
        sess.run(vae.list_equate_dual_ops)
    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }
    buf = ExperienceReplay.ExperienceReplay(exp_replay_param, buffer_size=config_PER['buffer_size'])

    # running mean and variance of MDP reward and VAE lowerbound
    if eq_rew:
        stat_counter = 0
        r_mdp_mean = 0
        r_mdp_var = 0
        r_probe_mean = 0
        r_probe_var = 0

    # Logging
    header = "Episode,R_avg,R_e_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    reward_period = 0
    reward_e_period = 0
    list_trajs = []  # circular buffer to store probe trajectories for VAE
    idx_traj = 0  # counter for list_trajs
    control_step = 0
    train_count_vae = 1
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1
            eta = 1.0
            z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
            if total_episodes % period == 0:
                list_eta = [eta]

            # rolling window of (state, action) pairs
            traj_for_vae = []
            state = hpmdp.reset()
            done = False
            reward_episode = 0
            reward_e_episode = 0
            step_episode = 0

            if total_episodes % period == 0:
                summarize = True
            else:
                summarize = False
            summarized = False

            while not done:
                action = pi_c.run_actor(state, z, sess, epsilon, eta)
                control_step += 1
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_for_vae.append((state, action_1hot))
                if len(traj_for_vae) == n_vae_steps + 1:
                    traj_for_vae = traj_for_vae[1:]

                state_next, reward, done = hpmdp.step(action)
                step_episode += 1

                if eq_rew:
                    stat_counter += 1
                    # update MDP reward mean and var
                    r_mdp_mean_prev = r_mdp_mean
                    r_mdp_mean = 1 / float(stat_counter) * reward + (stat_counter - 1) / float(stat_counter) * r_mdp_mean
                    r_mdp_var = r_mdp_var + (reward - r_mdp_mean_prev) * (reward - r_mdp_mean)

                if len(traj_for_vae) == n_vae_steps:
                    # Compute probe reward using VAE
                    reward_e = vae.compute_lower_bound(traj_for_vae, sess)[0]
                    if eq_rew:
                        # Update probe reward mean and var
                        r_probe_mean_prev = r_probe_mean
                        r_probe_mean = 1 / float(stat_counter) * reward_e + (stat_counter - 1) / float(stat_counter) * r_probe_mean
                        r_probe_var = r_probe_var + (reward_e - r_probe_mean_prev) * (reward_e - r_probe_mean)
                        # Scale probe reward into MDP reward
                        reward_e = ((reward_e - r_probe_mean) / np.sqrt(r_probe_var / stat_counter) + r_mdp_mean) * np.sqrt(r_mdp_var / stat_counter)
                    reward_total = eta * reward_e + (1 - eta) * reward
                else:
                    reward_e = 0.0
                    reward_total = reward

                # Get z_next and eta_next, because they are considered part of the augmented MDP state
                if len(traj_for_vae) == n_vae_steps:
                    std = vae.get_std(sess, traj_for_vae)
                    # Update max
                    for idx in range(n_latent):
                        if std[idx] >= std_max[idx]:
                            std_max[idx] = std[idx]
                    std = std / std_max  # element-wise normalization, now each element is between [0,1]
                    eta_next = np.sum(std) / n_latent  # scalar between [0,1]
                    # Use VAE to update hidden parameter
                    z_next = vae.encode(sess, traj_for_vae)
                else:
                    z_next = z
                    eta_next = eta

                if total_episodes % period == 0:
                    list_eta.append(eta_next)

                # Use total reward to train policy
                buf.add(np.reshape(np.array([state, z, eta, action, reward_total, state_next, z_next, eta_next, done]), (1, 9)))
                state = state_next
                eta = eta_next
                z = z_next
                # Note that for evaluation purpose we record the MDP reward separately
                reward_episode += reward
                reward_e_episode += reward_e

                # Store non-overlapping trajectories for training VAE
                # if len(traj_for_vae) == n_vae_steps:
                if step_episode % n_vae_steps == 0:
                    if idx_traj >= len(list_trajs):
                        list_trajs.append(list(traj_for_vae))  # must make a new list
                    else:
                        list_trajs[idx_traj] = list(traj_for_vae)
                    idx_traj = (idx_traj + 1) % buffer_size_vae

                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    batch, IS_weights, indices = buf.sample(control_step)
                    if not summarized:
                        # Write TF summary at first train step of the last episode of every instance
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize, writer)
                        summarized = True
                    else:
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, False, writer)
                    train_count_control += 1

                    if config_PER['activate']:
                        buf.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))

            reward_period += reward_episode
            reward_e_period += reward_e_episode

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Train VAE at the end of each episode
            if len(list_trajs) >= batch_size_vae:
                vae.train_step(sess, list_trajs, train_count_vae, summarize, writer)
                train_count_vae += 1

            # Logging
            if total_episodes % period == 0:
                s = "%d,%.2f,%.2f\n" % (total_episodes, reward_period / float(period), reward_e_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                with open("../results/%s/eta.csv" % dir_name, 'a') as f:
                    eta_string = ','.join(['%.2f' % x for x in list_eta])
                    eta_string += '\n'
                    f.write(eta_string)
                if config_domain['save_threshold'] and reward_period / float(period) > config_domain['save_threshold']:
                    saver.save(sess, '../results/%s/%s.%d' % (dir_name, model_name, total_episodes))
                reward_period = 0
                reward_e_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    with open('../results/%s/std_max.pkl' % dir_name, 'wb') as f:
        pickle.dump(std_max, f)

    if eq_rew:
        reward_scaling = np.array([r_mdp_mean, np.sqrt(r_mdp_var / stat_counter), r_probe_mean, np.sqrt(r_probe_var / stat_counter)])
        with open('../results/%s/reward_scaling.pkl' % dir_name, 'wb') as f:
            pickle.dump(reward_scaling, f)

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
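
# Sketch (not in the original file): the running-statistics update used above
# when equalize_reward is enabled. The mean update is the incremental form
# mean_new = x/n + (n-1)/n * mean_old, and r_mdp_var accumulates the sum of
# squared deviations (Welford-style), so the standard deviation is
# sqrt(var / count). The rewards below are placeholder values.
import numpy as np

stat_counter = 0
r_mdp_mean, r_mdp_var = 0.0, 0.0
for reward in [1.0, 2.0, 4.0, 8.0]:  # placeholder MDP rewards
    stat_counter += 1
    r_mdp_mean_prev = r_mdp_mean
    r_mdp_mean = reward / float(stat_counter) + (stat_counter - 1) / float(stat_counter) * r_mdp_mean
    r_mdp_var += (reward - r_mdp_mean_prev) * (reward - r_mdp_mean)

print(r_mdp_mean, np.sqrt(r_mdp_var / stat_counter))  # 3.75 and ~2.68 (population std)
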
        s = env.reset()
    return r, np.array(s_), fin


def reward2(s, a):
    global i_epi
    env.render()
    i_epi += 1
    s_, r, fin, info = env.step(a)
    r += (1 - abs(s[2]))
    r += 1 - abs(s[0])
    # r += float(i_epi)/200
    r = (r - 3.0) * 5
    # print(np.array(s_))
    if fin == 1:
        if i_epi < 100:
            r += -1
        i_epi = 0
        plt.pause(0.01)
        s = env.reset()
    return r, np.array(s_), fin


Qnet = ddqn.Net(4, 2)
td = ddqn.DDQN(4, np.array([0, 1]), np.array(env.reset()), reward2, 0.2, 200, Qnet)
td.learn()
print(td.Q)
env.close()

def train_function(config, config_suffix=None):

    config_main = config['main']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    assert (config['baseline']['epopt'] == True)
    assert (config_DDQN['activate'] == True)
    n_epopt = config['epopt']['n_epopt']
    epsilon_epopt = config['epopt']['epsilon']

    domain = config_main['domain']
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes_per_instance = config_main['N_episodes']
    # Give EPOpt the same number of total experiences as other methods
    N_episodes = N_instances * N_episodes_per_instance
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate control policy
    pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], 0)
    epsilon_start = config_DDQN['epsilon_start']
    epsilon_end = config_DDQN['epsilon_end']
    epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_episodes))
    steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    sess.run(pi_c.list_initialize_target_ops)
    epsilon = epsilon_start
    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver()

    # number of episodes that will be stored into replay buffer,
    # accounting for the epsilon-percentile filtering
    effective_num_episodes = int(N_episodes / 2.0 + N_episodes / 2.0 * epsilon_epopt)

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': effective_num_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }
    buf = ExperienceReplay.ExperienceReplay(exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    reward_period = 0
    control_step = 0
    train_count_control = 1
    idx_episode = 0
    summarize = False
    t_start = time.time()
    # Each iteration is one EPOpt iteration
    while idx_episode < N_episodes:

        instance_rollouts = []
        instance_total_rewards = []
        # Number of training steps that would have been done by online DDQN
        # during all of the episodes experienced by EPOpt
        expected_train_steps = 0

        # Collect many episodes, HPs are reset every episode
        for idx_rollout in range(1, n_epopt + 1):
            # This increment of the counter for the outer while loop is intentional.
            # This ensures EPOpt experiences the same number of episodes as other methods
            idx_episode += 1
            hpmdp.switch_instance()
            state = hpmdp.reset()
            done = False
            reward_episode = 0
            traj = []
            count_train_steps = 0

            while not done:
                # Use DDQN with prioritized replay for this
                action = pi_c.run_actor(state, None, sess, epsilon)
                state_next, reward, done = hpmdp.step(action)
                control_step += 1
                reward_episode += reward
                traj.append(np.reshape(np.array([state, action, reward, state_next, done]), (1, 5)))
                state = state_next

                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    count_train_steps += 1

            instance_rollouts.append(traj)
            instance_total_rewards.append(reward_episode)
            expected_train_steps += count_train_steps
            reward_period += reward_episode

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Logging
            if idx_episode % period == 0:
                s = "%d,%.2f\n" % (idx_episode, reward_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                reward_period = 0
                summarize = True

        if idx_episode < int(N_episodes / 2.0):
            # transitions from all trajectories will be stored
            percentile = 1.0
        else:
            percentile = epsilon_epopt

        # Compute epsilon-percentile of cumulative reward
        # Only store transitions from trajectories in the lowest epsilon-percentile
        sorted_indices = np.argsort(instance_total_rewards)
        indices_selected = sorted_indices[0:int(n_epopt * percentile)]
        for idx_selected in indices_selected:
            for transition in instance_rollouts[idx_selected]:
                buf.add(transition)

        # Start control policy
        for idx_train in range(expected_train_steps):
            batch, IS_weights, indices = buf.sample(control_step)
            # Write TF summary at first train step after generating rollouts in which period was crossed
            td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize, writer)
            summarize = False
            train_count_control += 1

            if config_PER['activate']:
                buf.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
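
# Sketch (not in the original file): the percentile filtering step used above.
# Rollouts are sorted by total reward and, after the first half of training,
# only the lowest epsilon_epopt fraction of each batch of n_epopt rollouts is
# added to the replay buffer. Values below are placeholders.
import numpy as np

n_epopt = 5
epsilon_epopt = 0.4
instance_total_rewards = [10.0, -3.0, 7.0, 0.5, -8.0]  # placeholder episode returns

sorted_indices = np.argsort(instance_total_rewards)            # ascending by return
indices_selected = sorted_indices[0:int(n_epopt * epsilon_epopt)]
print(indices_selected)  # [4, 1] -- the two lowest-return rollouts
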