Example #1
def test_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_ablation = config['ablation']
    use_pi_e = config_ablation['use_pi_e']
    phase = config_main['phase']
    assert (phase == 'validation' or phase == 'test')

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("test_ablation.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_domain['N_test_instances']
    N_episodes = config_domain['N_test_episodes']
    test_steps = config_domain['test_steps']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain, phase)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    assert (n_probe_steps < test_steps)
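    # The probe phase consumes part of the test_steps evaluation budget:
    # episode_step keeps counting through both the probing and control phases
    # below, so the probe trajectory must be strictly shorter than the horizon.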
    if use_pi_e:
        pi_e = probe.Probe(config_probe, n_state, n_action)
    else:
        # initial z
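        # Ablation without the probe policy pi_e: a fixed latent z_avg loaded
        # from the training results is fed to the control policy while the
        # probe trajectory is collected.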
        with open('../results/%s/z_avg.p' % dir_name, 'rb') as f:
            z_avg = pickle.load(f)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state,
                         n_action,
                         n_probe_steps,
                         seed=seed,
                         **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    reward_total = 0
    cumulative_reward = np.zeros((test_steps, N_instances))
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        # N_episodes should be 1, but we let it be flexible in case needed
        for idx_episode in range(1, N_episodes + 1):

            reward_episode = 0

            collected_probe_traj = False
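            # Retry until a full-length probe trajectory is collected: if the
            # environment terminates before n_probe_steps, the partial
            # trajectory is discarded and a fresh episode is started, so the
            # VAE always receives a fixed-length input.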
            while not collected_probe_traj:

                # list of (state, action) pairs
                traj_probe = []
                state = hpmdp.reset()
                episode_step = 0
                done = False

                probe_finished_early = False
                # Generate probe trajectory
                for step in range(1, n_probe_steps + 1):

                    if use_pi_e:
                        action = pi_e.run_actor(state, sess)
                    else:
                        action = pi_c.run_actor(state, z_avg, sess, epsilon=0)
                    # print("Probe step %d action %d" % (step, action))
                    action_1hot = np.zeros(n_action)
                    action_1hot[action] = 1
                    traj_probe.append((state, action_1hot))
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step,
                                      idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                    if done and step < n_probe_steps:
                        probe_finished_early = True
                        print(
                            "test_ablation.py : done is True while generating probe trajectory"
                        )
                        break

                if not probe_finished_early:
                    collected_probe_traj = True

            # Use VAE to estimate hidden parameter
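            # The encoder maps the fixed-length probe trajectory to a latent
            # estimate z, which conditions the control policy below.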
            z = vae.encode(sess, traj_probe)

            print(z)

            if config_DDQN['activate']:
                # Start control policy
                while not done and episode_step < test_steps:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon=0)
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step,
                                      idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                print(reward_episode)
                # If episode ended earlier than test_steps, fill in the
                # rest of the cumulative rewards with the last value
                if episode_step < test_steps:
                    remaining = np.ones(test_steps -
                                        episode_step) * reward_episode
                    cumulative_reward[episode_step:,
                                      idx_instance - 1] = remaining

                reward_total += reward_episode

    header = 'Step'
    for idx in range(1, N_instances + 1):
        header += ',R_%d' % idx
    indices = np.arange(1, test_steps + 1).reshape(test_steps, 1)
    concated = np.concatenate([indices, cumulative_reward], axis=1)
    save_loc = '_'.join(dir_name.split('_')[:-1])
    os.makedirs('../results/%s' % save_loc, exist_ok=True)
    run_number = dir_name.split('_')[-1]
    np.savetxt('../results/%s/test_%s.csv' % (save_loc, run_number),
               concated,
               delimiter=',',
               fmt='%.3e',
               header=header)

    print("Avg episode reward", reward_total / float(N_instances * N_episodes))
Example #2
def train_function(config, config_suffix=None):

    # with open('config.json') as f:
    #     config = json.load(f)
    config_main = config['main']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_baseline = config['baseline']
    
    real_z_input = config_baseline['real_z_input']
    if real_z_input:
        # If the real hidden param is used as input, then the DDQN must accept z
        assert config_DDQN['z_input'] == True
    
    domain = config_main['domain']
    
    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'lander':
        domain_name = "config_lander.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train_baseline.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)
    
    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    n_hidden = config_domain['n_hidden'] # dimension of real hidden param
    
    min_samples_before_train = config_domain['min_samples_before_train']
    
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    
    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    
    os.makedirs('../results/%s'%dir_name, exist_ok=True)
    
    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)
    
    # Instantiate control policy
    pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], n_hidden)
    epsilon_start = config_DDQN['epsilon_start']
    epsilon_end = config_DDQN['epsilon_end']
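    # Geometric decay chosen so that epsilon_start * epsilon_decay**(N_instances*N_episodes)
    # equals epsilon_end, i.e. epsilon reaches its final value on the last episode.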
    epsilon_decay = np.exp(np.log(epsilon_end/epsilon_start)/(N_instances*N_episodes))
    # epsilon_decay = config_DDQN['epsilon_decay']
    steps_per_train = config_DDQN['steps_per_train']
    
    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    
    sess.run(pi_c.list_initialize_target_ops)
    
    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    
    saver = tf.train.Saver()
    
    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {'episode_count':N_instances*N_episodes,
                        'instance_count':0, 'max_task_examples':hpmdp.max_steps_per_episode,
                        'ddqn_batch_size':config_DDQN['batch_size'],
                        'num_strata_samples':config_PER['num_strata_samples'],
                        'PER_alpha':config_PER['alpha'],
                        'PER_beta_zero':config_PER['beta_zero'],
                        'bnn_batch_size':0, 'bnn_start':0,
                        'dqn_start':min_samples_before_train}
                        
    buf = ExperienceReplay.ExperienceReplay(exp_replay_param, buffer_size=config_PER['buffer_size'])
    
    # Logging
    header = "Episode,R_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    reward_period = 0
    
    epsilon = epsilon_start
    control_step = 0
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances+1):
    
        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to", hpmdp.instance_param_set)
        if real_z_input:
            z = hpmdp.get_real_hidden_param()
    
        # Iterate through many episodes
        for idx_episode in range(1, N_episodes+1):
    
            total_episodes += 1
    
            # print("Episode", idx_episode)
            state = hpmdp.reset()
            done = False
            summarized = False
            reward_episode = 0
    
            # Start control policy
            while not done:
                # Use DDQN with prioritized replay for this
                if real_z_input:
                    action = pi_c.run_actor(state, z, sess, epsilon)
                else:
                    action = pi_c.run_actor(state, None, sess, epsilon)
                state_next, reward, done = hpmdp.step(action)
                control_step += 1
                reward_episode += reward
    
                if real_z_input:
                    buf.add(np.reshape(np.array([state,action,reward,state_next,done,z]), (1,6)))
                else:
                    buf.add(np.reshape(np.array([state,action,reward,state_next,done]), (1,5)))
                state = state_next
    
                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    batch, IS_weights, indices = buf.sample(control_step)

                    if (total_episodes % period == 0) and not summarized:
                        # Write TF summary at first train step of the last episode of every instance
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize=True, writer=writer)
                        summarized = True
                    else:
                        td_loss = pi_c.train_step(sess, batch, IS_weights, indices, train_count_control, summarize=False, writer=writer)
                    train_count_control += 1

                    if config_PER['activate']:
                        buf.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))
    
            reward_period += reward_episode
    
            if epsilon > epsilon_end:
                epsilon *= epsilon_decay
    
            # Logging
            if total_episodes % period == 0:
                s = "%d,%.2f\n" % (total_episodes, reward_period/float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                reward_period = 0
                
    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))
    
    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
Example #3
def test_function(config, config_suffix=None):

    config_main = config['main']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    phase = config_main['phase']
    assert (phase == 'validation' or phase == 'test')

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_domain['N_test_instances']
    N_episodes = config_domain['N_test_episodes']
    test_steps = config_domain['test_steps']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain, phase)

    # Length of trajectory for input to VAE
    n_vae_steps = config_domain['traj_length']
    n_latent = config_VAE['n_latent']
    z = np.zeros(config_VAE['n_latent'], dtype=np.float32)

    with open('../results/%s/std_max.pkl' % dir_name, 'rb') as f:
        std_max = pickle.load(f)
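    # std_max holds the per-dimension maximum of the VAE posterior std observed
    # during training; it normalizes the test-time std so that eta stays in [0, 1].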

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state,
                         n_action,
                         n_vae_steps,
                         seed=seed,
                         **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    reward_total = 0
    cumulative_reward = np.zeros((test_steps, N_instances))
    list_times = []
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        t_start = time.time()
        for idx_episode in range(1, N_episodes + 1):

            # rolling window of (state, action) pairs
            traj_for_vae = []
            eta = 1.0  # in [0, 1]; eta = 1 means the policy should act to maximize the probe reward
            z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
            reward_episode = 0
            state = hpmdp.reset()
            episode_step = 0
            done = False

            while not done and episode_step < test_steps:

                action = pi_c.run_actor(state, z, sess, epsilon=0, eta=eta)
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_for_vae.append((state, action_1hot))
                if len(traj_for_vae) == n_vae_steps + 1:
                    traj_for_vae = traj_for_vae[1:]

                state_next, reward, done = hpmdp.step(action)

                reward_episode += reward
                cumulative_reward[episode_step,
                                  idx_instance - 1] = reward_episode

                # Get z_next and eta_next, because they are considered part of the augmented MDP state
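                # eta_next is the mean normalized posterior std, a proxy for the
                # remaining uncertainty about the hidden parameter; it is capped
                # at 1 in case the test-time std exceeds the training-time max.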
                if len(traj_for_vae) == n_vae_steps:
                    std = vae.get_std(sess, traj_for_vae)
                    std = std / std_max  # element-wise normalization: each element now lies in [0, 1]
                    eta_next = np.sum(std) / n_latent  # scalar in [0, 1]
                    eta_next = min(
                        1.0, eta_next
                    )  # in case std_max during training isn't large enough
                    # Use VAE to update hidden parameter
                    z_next = vae.encode(sess, traj_for_vae)
                else:
                    z_next = z
                    eta_next = eta

                state = state_next
                eta = eta_next
                z = z_next
                episode_step += 1

            # If episode ended earlier than test_steps, fill in the
            # rest of the cumulative rewards with the last value
            if episode_step < test_steps:
                remaining = np.ones(test_steps - episode_step) * reward_episode
                cumulative_reward[episode_step:, idx_instance - 1] = remaining

            reward_total += reward_episode

        list_times.append(time.time() - t_start)

    header = 'Step'
    for idx in range(1, N_instances + 1):
        header += ',R_%d' % idx
    indices = np.arange(1, test_steps + 1).reshape(test_steps, 1)
    concated = np.concatenate([indices, cumulative_reward], axis=1)
    save_loc = '_'.join(dir_name.split('_')[:-1])
    os.makedirs('../results/%s' % save_loc, exist_ok=True)
    run_number = dir_name.split('_')[-1]
    np.savetxt('../results/%s/test_%s.csv' % (save_loc, run_number),
               concated,
               delimiter=',',
               fmt='%.3e',
               header=header)

    with open('../results/%s/test_time_%s.pkl' % (save_loc, run_number),
              'wb') as f:
        pickle.dump(list_times, f)

    print("Avg episode reward", reward_total / float(N_instances * N_episodes))
Example #4
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    autoencoder = config_main['autoencoder']
    if autoencoder == 'VAE':
        config_VAE = config['VAE']
    else:
        raise ValueError("Other autoencoders not supported")
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    phase = config_main['phase']
    assert (phase == 'train')

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'lander':
        domain_name = "config_lander.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    pi_e = probe.Probe(config_probe, n_state, n_action)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    if autoencoder == 'VAE':
        vae = vae_import.VAE(n_state,
                             n_action,
                             n_probe_steps,
                             seed=seed,
                             **config_VAE)
    else:
        raise ValueError('Other autoencoders not supported')

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])
        epsilon_start = config_DDQN['epsilon_start']
        epsilon_end = config_DDQN['epsilon_end']
        epsilon_decay = np.exp(
            np.log(epsilon_end / epsilon_start) / (N_instances * N_episodes))
        steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    if config_DDQN['activate']:
        sess.run(pi_c.list_initialize_target_ops)
        epsilon = epsilon_start

    if config_VAE['dual']:
        sess.run(vae.list_equate_dual_ops)

    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)

    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }

    buf = ExperienceReplay.ExperienceReplay(
        exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg,R_p\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    reward_period = 0
    reward_p_period = 0

    list_trajs = []  # circular buffer to store probe trajectories for VAE
    idx_traj = 0  # counter for list_trajs
    control_step = 0
    train_count_probe = 1
    train_count_vae = 1
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1

            # list of (state, action) pairs
            traj_probe = []
            state = hpmdp.reset()
            done = False
            reward_episode = 0

            # Generate probe trajectory
            probe_finished_early = False
            for step in range(1, n_probe_steps + 1):

                action = pi_e.run_actor(state, sess)
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_probe.append((state, action_1hot))
                state_next, reward, done = hpmdp.step(action)
                state = state_next
                reward_episode += reward

                if done and step < n_probe_steps:
                    probe_finished_early = True
                    print(
                        "train.py : done is True while generating probe trajectory"
                    )
                    break

            if probe_finished_early:
                # Skip over pi_e and VAE training if probe finished early
                continue

            if idx_traj >= len(list_trajs):
                list_trajs.append(traj_probe)
            else:
                list_trajs[idx_traj] = traj_probe
            idx_traj = (idx_traj + 1) % buffer_size_vae

            # Compute probe reward (VAE lower bound or total variation)
            if config_probe['reward'] == 'vae':
                reward_e = vae.compute_lower_bound(traj_probe, sess)
            elif config_probe['reward'] == 'total_variation':
                reward_e = pi_e.compute_reward(traj_probe)
            elif config_probe['reward'] == 'negvae':
                # this reward encourages maximizing entropy
                reward_e = -vae.compute_lower_bound(traj_probe, sess)
            else:
                raise ValueError("train.py : probe reward type not recognized")

            # Write a TensorBoard summary at the final episode of every instance
            if total_episodes % period == 0:
                summarize = True
            else:
                summarize = False

            # Train probe policy
            pi_e.train_step(sess, traj_probe, reward_e, train_count_probe,
                            summarize, writer)
            train_count_probe += 1

            # Train VAE
            if len(list_trajs) >= batch_size_vae:
                vae.train_step(sess, list_trajs, train_count_vae, summarize,
                               writer)
                train_count_vae += 1

            # Use VAE to estimate hidden parameter
            z = vae.encode(sess, traj_probe)
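            # z stays fixed as the control policy's context for the rest of
            # this episode.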

            if config_DDQN['activate']:
                # Start control policy
                summarized = False
                while not done:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon)
                    state_next, reward, done = hpmdp.step(action)
                    control_step += 1
                    reward_episode += reward

                    buf.add(
                        np.reshape(
                            np.array(
                                [state, action, reward, state_next, done, z]),
                            (1, 6)))
                    state = state_next

                    if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
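                        # One DDQN update every steps_per_train environment steps;
                        # IS_weights are the importance-sampling corrections from
                        # the prioritized replay buffer.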
                        batch, IS_weights, indices = buf.sample(control_step)
                        if not summarized:
                            # Write TF summary at first train step of the last episode of every instance
                            td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                      indices,
                                                      train_count_control,
                                                      summarize, writer)
                            summarized = True
                        else:
                            td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                      indices,
                                                      train_count_control,
                                                      False, writer)
                        train_count_control += 1

                        if config_PER['activate']:
                            buf.update_priorities(
                                np.hstack(
                                    (np.reshape(td_loss, (len(td_loss), -1)),
                                     np.reshape(indices, (len(indices), -1)))))

                reward_period += reward_episode
                reward_p_period += reward_e

                if epsilon > epsilon_end:
                    epsilon *= epsilon_decay

                # Logging
                if total_episodes % period == 0:
                    s = "%d,%.2f,%.2f\n" % (total_episodes,
                                            reward_period / float(period),
                                            reward_p_period / float(period))
                    print(s)
                    with open("../results/%s/log.csv" % dir_name, 'a') as f:
                        f.write(s)
                    if config_domain[
                            'save_threshold'] and reward_period / float(
                                period) > config_domain['save_threshold']:
                        saver.save(
                            sess, '../results/%s/%s.%d' %
                            (dir_name, model_name, total_episodes))
                    reward_period = 0
                    reward_p_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
Example #5
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_ablation = config['ablation']
    eq_rew = config_ablation['equalize_reward']

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Length of trajectory for input to VAE
    n_vae_steps = config_domain['traj_length']
    n_latent = config_VAE['n_latent']
    z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
    eta = 1.0  # in [0, 1]; eta = 1 means the policy should act to maximize the probe reward
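    # Later in the episode loop the training reward blends the two objectives:
    # reward_total = eta * reward_e + (1 - eta) * reward.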
    std_max = -np.inf * np.ones(config_VAE['n_latent'], dtype=np.float32)
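    # Running element-wise maximum of the VAE posterior std; saved to
    # std_max.pkl at the end so the same normalization can be applied at test time.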

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state,
                         n_action,
                         n_vae_steps,
                         seed=seed,
                         **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])
        epsilon_start = config_DDQN['epsilon_start']
        epsilon_end = config_DDQN['epsilon_end']
        epsilon_decay = np.exp(
            np.log(epsilon_end / epsilon_start) / (N_episodes * N_instances))
        steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    if config_DDQN['activate']:
        sess.run(pi_c.list_initialize_target_ops)
        epsilon = epsilon_start

    if config_VAE['dual']:
        sess.run(vae.list_equate_dual_ops)

    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)

    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }

    buf = ExperienceReplay.ExperienceReplay(
        exp_replay_param, buffer_size=config_PER['buffer_size'])

    # running mean and variance of MDP reward and VAE lowerbound
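    # (Welford-style: *_mean is the running mean, *_var accumulates the sum of
    # squared deviations, so the running std is sqrt(var / stat_counter).)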
    if eq_rew:
        stat_counter = 0
        r_mdp_mean = 0
        r_mdp_var = 0
        r_probe_mean = 0
        r_probe_var = 0

    # Logging
    header = "Episode,R_avg,R_e_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    reward_period = 0
    reward_e_period = 0

    list_trajs = []  # circular buffer to store probe trajectories for VAE
    idx_traj = 0  # counter for list_trajs
    control_step = 0
    train_count_vae = 1
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1

            eta = 1.0
            z = np.zeros(config_VAE['n_latent'], dtype=np.float32)
            if total_episodes % period == 0:
                list_eta = [eta]

            # rolling window of (state, action) pairs
            traj_for_vae = []
            state = hpmdp.reset()
            done = False
            reward_episode = 0
            reward_e_episode = 0
            step_episode = 0

            if total_episodes % period == 0:
                summarize = True
            else:
                summarize = False

            summarized = False
            while not done:

                action = pi_c.run_actor(state, z, sess, epsilon, eta)
                control_step += 1
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_for_vae.append((state, action_1hot))
                if len(traj_for_vae) == n_vae_steps + 1:
                    traj_for_vae = traj_for_vae[1:]

                state_next, reward, done = hpmdp.step(action)
                step_episode += 1

                if eq_rew:
                    stat_counter += 1
                    # update MDP reward mean and var
                    r_mdp_mean_prev = r_mdp_mean
                    r_mdp_mean = 1 / float(stat_counter) * reward + (
                        stat_counter - 1) / float(stat_counter) * r_mdp_mean
                    r_mdp_var = r_mdp_var + (reward - r_mdp_mean_prev) * (
                        reward - r_mdp_mean)

                if len(traj_for_vae) == n_vae_steps:
                    # Compute probe reward using VAE
                    reward_e = vae.compute_lower_bound(traj_for_vae, sess)[0]

                    if eq_rew:
                        # Update probe reward mean and var
                        r_probe_mean_prev = r_probe_mean
                        r_probe_mean = 1 / float(stat_counter) * reward_e + (
                            stat_counter -
                            1) / float(stat_counter) * r_probe_mean
                        r_probe_var = r_probe_var + (
                            reward_e - r_probe_mean_prev) * (reward_e -
                                                             r_probe_mean)
                        # Scale probe reward into MDP reward
                        reward_e = (
                            (reward_e - r_probe_mean) /
                            np.sqrt(r_probe_var / stat_counter) +
                            r_mdp_mean) * np.sqrt(r_mdp_var / stat_counter)

                    reward_total = eta * reward_e + (1 - eta) * reward
                else:
                    reward_e = 0.0
                    reward_total = reward

                # Get z_next and eta_next, because they are considered part of the augmented MDP state
                if len(traj_for_vae) == n_vae_steps:
                    std = vae.get_std(sess, traj_for_vae)
                    # Update max
                    for idx in range(n_latent):
                        if std[idx] >= std_max[idx]:
                            std_max[idx] = std[idx]
                    std = std / std_max  # element-wise normalization: each element now lies in [0, 1]
                    eta_next = np.sum(std) / n_latent  # scalar in [0, 1]
                    # Use VAE to update hidden parameter
                    z_next = vae.encode(sess, traj_for_vae)
                else:
                    z_next = z
                    eta_next = eta

                if total_episodes % period == 0:
                    list_eta.append(eta_next)

                # Use total reward to train policy
                buf.add(
                    np.reshape(
                        np.array([
                            state, z, eta, action, reward_total, state_next,
                            z_next, eta_next, done
                        ]), (1, 9)))
                state = state_next
                eta = eta_next
                z = z_next

                # Note that for evaluation purposes we record the MDP reward separately
                reward_episode += reward
                reward_e_episode += reward_e

                # Store non-overlapping trajectories for training VAE
                # if len(traj_for_vae) == n_vae_steps:
                if step_episode % n_vae_steps == 0:
                    if idx_traj >= len(list_trajs):
                        list_trajs.append(
                            list(traj_for_vae))  # must make a new list
                    else:
                        list_trajs[idx_traj] = list(traj_for_vae)
                    idx_traj = (idx_traj + 1) % buffer_size_vae

                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    batch, IS_weights, indices = buf.sample(control_step)
                    if not summarized:
                        # Write TF summary at first train step of the last episode of every instance
                        td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                  indices, train_count_control,
                                                  summarize, writer)
                        summarized = True
                    else:
                        td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                  indices, train_count_control,
                                                  False, writer)
                    train_count_control += 1

                    if config_PER['activate']:
                        buf.update_priorities(
                            np.hstack((np.reshape(td_loss, (len(td_loss), -1)),
                                       np.reshape(indices,
                                                  (len(indices), -1)))))

            reward_period += reward_episode
            reward_e_period += reward_e_episode

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Train VAE at the end of each episode
            if len(list_trajs) >= batch_size_vae:
                vae.train_step(sess, list_trajs, train_count_vae, summarize,
                               writer)
                train_count_vae += 1

            # Logging
            if total_episodes % period == 0:
                s = "%d,%.2f,%.2f\n" % (total_episodes,
                                        reward_period / float(period),
                                        reward_e_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                with open("../results/%s/eta.csv" % dir_name, 'a') as f:
                    eta_string = ','.join(['%.2f' % x for x in list_eta])
                    eta_string += '\n'
                    f.write(eta_string)
                if config_domain['save_threshold'] and reward_period / float(
                        period) > config_domain['save_threshold']:
                    saver.save(
                        sess, '../results/%s/%s.%d' %
                        (dir_name, model_name, total_episodes))
                reward_period = 0
                reward_e_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    with open('../results/%s/std_max.pkl' % dir_name, 'wb') as f:
        pickle.dump(std_max, f)

    if eq_rew:
        reward_scaling = np.array([
            r_mdp_mean,
            np.sqrt(r_mdp_var / stat_counter), r_probe_mean,
            np.sqrt(r_probe_var / stat_counter)
        ])
        with open('../results/%s/reward_scaling.pkl' % dir_name, 'wb') as f:
            pickle.dump(reward_scaling, f)

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
Example #6
# Assumed setup (not part of the original listing): a Gym CartPole-style
# environment with a 4-dimensional state and 2 discrete actions, matching the
# reward shaping and network sizes used below.
import gym
import numpy as np
import matplotlib.pyplot as plt

import ddqn

env = gym.make('CartPole-v0')  # assumed environment id
i_epi = 0  # step counter within the current episode

# Tail of an earlier reward function, truncated in the source:
#       s = env.reset()
#     return r, np.array(s_), fin

def reward2(s, a):
    global i_epi
    env.render()
    i_epi += 1
    s_, r, fin, info = env.step(a)
    r += (1 - abs(s[2]))
    r += 1 - abs(s[0])
    # r+=float(i_epi)/200
    r = (r-3.0)*5

    # print(np.array(s_))
    if fin == 1:
      if i_epi < 100:
          r += -1
      i_epi = 0
      plt.pause(0.01)
      s= env.reset()
    return r, np.array(s_), fin

Qnet = ddqn.Net(4,2)
td = ddqn.DDQN(4,np.array([0,1]),np.array(env.reset()),reward2,0.2, 200,Qnet)
td.learn()
print(td.Q)

env.close()


Example #7
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    assert (config['baseline']['epopt'] == True)
    assert (config_DDQN['activate'] == True)

    n_epopt = config['epopt']['n_epopt']
    epsilon_epopt = config['epopt']['epsilon']

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes_per_instance = config_main['N_episodes']
    # Give EPOpt the same number of total experiences as other methods
    N_episodes = N_instances * N_episodes_per_instance
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate control policy
    pi_c = ddqn.DDQN(config_DDQN, n_state, n_action, config_PER['activate'], 0)
    epsilon_start = config_DDQN['epsilon_start']
    epsilon_end = config_DDQN['epsilon_end']
    epsilon_decay = np.exp(np.log(epsilon_end / epsilon_start) / (N_episodes))
    steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    sess.run(pi_c.list_initialize_target_ops)
    epsilon = epsilon_start

    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)

    saver = tf.train.Saver()

    # number of episodes that will be stored into replay buffer,
    # accounting for the epsilon-percentile filtering
    effective_num_episodes = int(N_episodes / 2.0 +
                                 N_episodes / 2.0 * epsilon_epopt)
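    # During the first half of training all rollouts are stored; afterwards only
    # the worst epsilon fraction is kept, giving N/2 + (N/2)*epsilon episodes.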
    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': effective_num_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }

    buf = ExperienceReplay.ExperienceReplay(
        exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    reward_period = 0

    control_step = 0
    train_count_control = 1
    idx_episode = 0
    summarize = False
    t_start = time.time()
    # Each iteration is one EPOpt iteration
    while idx_episode < N_episodes:

        instance_rollouts = []
        instance_total_rewards = []
        # Number of training steps that would have been done by online DDQN
        # during all of the episodes experienced by EPOpt
        expected_train_steps = 0
        # Collect many episodes, HPs are reset every episode
        for idx_rollout in range(1, n_epopt + 1):

            # This increment of the counter for the outer while loop is intentional.
            # This ensures EPOpt experiences the same number of episodes as other methods
            idx_episode += 1

            hpmdp.switch_instance()
            state = hpmdp.reset()
            done = False
            reward_episode = 0
            traj = []
            count_train_steps = 0

            while not done:
                # Use DDQN with prioritized replay for this
                action = pi_c.run_actor(state, None, sess, epsilon)
                state_next, reward, done = hpmdp.step(action)
                control_step += 1
                reward_episode += reward

                traj.append(
                    np.reshape(
                        np.array([state, action, reward, state_next, done]),
                        (1, 5)))

                state = state_next

                if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                    count_train_steps += 1

            instance_rollouts.append(traj)
            instance_total_rewards.append(reward_episode)
            expected_train_steps += count_train_steps
            reward_period += reward_episode

            if epsilon > epsilon_end:
                epsilon *= epsilon_decay

            # Logging
            if idx_episode % period == 0:
                s = "%d,%.2f\n" % (idx_episode, reward_period / float(period))
                print(s)
                with open("../results/%s/log.csv" % dir_name, 'a') as f:
                    f.write(s)
                reward_period = 0
                summarize = True

        if idx_episode < int(N_episodes / 2.0):
            # transitions from all trajectories will be stored
            percentile = 1.0
        else:
            percentile = epsilon_epopt

        # Compute epsilon-percentile of cumulative reward
        # Only store transitions from trajectories in the lowest epsilon-percentile
        sorted_indices = np.argsort(instance_total_rewards)
        indices_selected = sorted_indices[0:int(n_epopt * percentile)]
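        # argsort is ascending, so the selected indices correspond to the
        # lowest-return (worst-case) rollouts that EPOpt trains on.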
        for idx_selected in indices_selected:
            for transition in instance_rollouts[idx_selected]:
                buf.add(transition)

        # Start control policy
        for idx_train in range(expected_train_steps):
            batch, IS_weights, indices = buf.sample(control_step)
            # Write TF summary at first train step after generating rollouts in which period was crossed
            td_loss = pi_c.train_step(sess, batch, IS_weights, indices,
                                      train_count_control, summarize, writer)
            summarize = False
            train_count_control += 1

            if config_PER['activate']:
                buf.update_priorities(
                    np.hstack((np.reshape(td_loss, (len(td_loss), -1)),
                               np.reshape(indices, (len(indices), -1)))))

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))