Code Example #1
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
    ):
        StochasticPolicy.__init__(self,
                                  scope,
                                  ob_space,
                                  ac_space,
                                  meta_rl=meta_rl)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu,
        )
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name="state")
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes depending on whether it is rollout
        # or optimization time, so we treat them separately.
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob['obs'][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob['obs'],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
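The constructor ends by wrapping the flat rollout parameters in a distribution and taking pd.sample(), pd.neglogp() and pd.entropy() from it. Below is a minimal numpy sketch of what those three operations compute for a discrete-action (categorical) head; the batch size of 4 and the 18 actions are illustration values, not read from the snippet.

# Minimal numpy sketch of a categorical action head, mirroring what
# pd.sample(), pd.neglogp() and pd.entropy() compute for discrete actions.
# Batch size and action count are arbitrary illustration values.
import numpy as np

rng = np.random.default_rng(0)
logits = rng.normal(size=(4, 18))                        # flat "pdparam": 4 rows, 18 actions

probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)                # row-wise softmax

actions = np.array([rng.choice(18, p=p) for p in probs])     # analogue of pd.sample()
neglogp = -np.log(probs[np.arange(4), actions])              # analogue of pd.neglogp(a_samp)
entropy = -(probs * np.log(probs)).sum(axis=1)               # analogue of pd.entropy()

print(actions, neglogp.round(3), entropy.round(3))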
Code Example #2
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='normal',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement  #256
        hidsize *= enlargement  #256
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)
            self.step_prediction(convfeat=convfeat,
                                 rep_size=rep_size,
                                 enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
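Every variant above also builds a RunningMeanStd over single-channel frames (shape ob_space.shape[:2] + [1]) and feeds its statistics back in through the ph_mean / ph_std placeholders. The following self-contained numpy sketch shows that kind of running statistic and how a batch of observations could be standardized with it; the Welford-style update, the 84x84 frame size, and the clip range of (-5, 5) are assumptions for illustration, not details read from this code.

# Sketch of running-mean/std observation normalization of the kind
# self.ob_rms / ph_mean / ph_std support.  Frame size and clip range
# are illustration-only assumptions.
import numpy as np

class SimpleRunningMeanStd:
    """Tracks per-pixel mean and variance with Welford-style batch updates."""
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, batch):                       # batch: (N,) + shape
        b_mean, b_var, b_count = batch.mean(0), batch.var(0), batch.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_count
        new_mean = self.mean + delta * b_count / tot
        m2 = self.var * self.count + b_var * b_count + delta ** 2 * self.count * b_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

obs = np.random.randint(0, 255, size=(32, 84, 84, 1)).astype(np.float32)
rms = SimpleRunningMeanStd(shape=(84, 84, 1))
rms.update(obs)
normed = np.clip((obs - rms.mean) / np.sqrt(rms.var), -5.0, 5.0)
print(normed.shape)                                # (32, 84, 84, 1)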
Code Example #3
    def __init__(self, scope, ob_space, ac_space,
                 policy_size='normal', maxpool=False, extrahid=True, hidsize=128, memsize=128, rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck', beta=0.001, rew_counter=None
                 ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)

        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {
            'small': 1,
            'normal': 2,
            'large': 4
        }[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obmean")  # (84, 84, 1)
        self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obstd")    # (84, 84, 1)
        memsize *= enlargement          # memsize = 256
        hidsize *= enlargement          # hidsize = 256
        convfeat = 16*enlargement       # convfeat = 32
        self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,shape=(None, memsize), name='state')  # (None,256)
        pdparamsize = self.pdtype.param_shape()[0]     # 18, equal to the action dimension
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes depending on whether it is rollout or optimization time, so we treat them separately.
        
        # pdparam_opt.shape=(None, None, 18), vpred_int_opt.shape=(None, None), vpred_ext_opt.shape=(None, None), snext_opt.shape=(None, 256)
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,                  # 256
                              memsize=memsize,                  # 256
                              extrahid=extrahid,                # True
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize)           # 18
                              
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize)

        self.exploration_type = exploration_type
        self.max_table = 0

        self.define_bottleneck_rew(convfeat=convfeat, rep_size=rep_size // 8, enlargement=enlargement, beta=beta, rew_counter=rew_counter)  # rep_size // 8 = 64

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)    # distribution given by the policy softmax

        self.a_samp = pd.sample()                 # sample an action
        self.nlp_samp = pd.neglogp(self.a_samp)   # negative log-probability of the sampled action
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
        self.a_samp_opt = self.pd_opt.sample()

        self.ph_istate = ph_istate

        self.scope = scope

        
        #############################################
        ###### The following is never actually used ######
        #############################################
        # for gradcam policy
        a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)   # (None,None) -> (None,None,18)
        # equivalent to selecting pdparam_opt at the position of the executed (one-hot) action
        loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))  # scalar
        
        self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
        self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]
        # for gradcam aux
        loss_cam_aux = self.kl
        if int(str(tf.__version__).split('.')[1]) < 10:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
        else:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
        self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])

        # self.cams is never actually used
        weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
        weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
        weights = tf.tile(weights, [1, 6, 6, 1])
        cams = tf.reduce_sum((weights * self.conv_out), axis=3)
        self.cams = tf.maximum(cams, tf.zeros_like(cams))

        # self.cams_aux is never actually used
        weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
        weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
        weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
        cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
        self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))
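The block labeled as unused computes Grad-CAM maps: the gradients of the chosen loss are averaged over both spatial axes to give one weight per channel, the convolutional activations are weighted by those values, summed over channels, and clamped at zero. The same arithmetic in a self-contained numpy sketch, with random stand-ins for the activation and gradient tensors (the 2x6x6x64 shape is an arbitrary illustration):

# Numpy sketch of the Grad-CAM computation done above with
# tf.reduce_mean / tf.tile / tf.reduce_sum / tf.maximum.
import numpy as np

rng = np.random.default_rng(0)
conv_out = rng.normal(size=(2, 6, 6, 64))      # conv activations (self.conv_out)
grads = rng.normal(size=(2, 6, 6, 64))         # d(loss_cam_pol)/d(conv_out)

# per-channel weights: mean of the gradient over both spatial axes
weights = grads.mean(axis=(1, 2))              # (2, 64)
weights = weights[:, None, None, :]            # broadcast over the 6x6 grid

# weighted sum over channels, then ReLU -> one heat map per batch element
cams = np.maximum((weights * conv_out).sum(axis=3), 0.0)
print(cams.shape)                              # (2, 6, 6)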
Code Example #4
def main(env_name, headless):

    if headless:
        display = Display(visible=0, size=(1280, 1024))
        display.start()
    ###
    #paths

    model_dir = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    ###
    if env_name == 'lake':
        from config_lake import *
    elif env_name == 'car':
        from config_car import *
    else:
        raise ValueError('Unknown environment: %s' % env_name)

    #### Get a decent policy.
    #### Called pi_old because this will be the policy we use to gather data
    policy_old = None
    old_policy_path = os.path.join(model_dir, old_policy_name)

    if env_name == 'lake':
        policy_old = LakeDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            model_type=model_type,
            position_of_holes=position_of_holes,
            position_of_goals=position_of_goals,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            copy_over_target_every_M_training_iterations=
            copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir,
                                     'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )
    elif env_name == 'car':
        policy_old = CarDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            action_space_dim=action_space_dim,
            model_type=model_type,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            copy_over_target_every_M_training_iterations=
            copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir,
                                     'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )

    else:
        raise ValueError('Unknown environment: %s' % env_name)

    if not os.path.isfile(old_policy_path):
        print 'Learning a policy using DQN'
        policy_old.learn()
        policy_old.Q.model.save(old_policy_path)
    else:
        print 'Loading a policy'
        policy_old.Q.model = load_model(old_policy_path)
        # if env_name == 'car':
        #     try:
        #         # using old style model. This can be deleted if not using provided .h5 file
        #         policy_old.Q.all_actions_func = K.function([self.model.get_layer('inp').input], [self.model.get_layer('dense_2').output])
        #     except:
        #         pass

    # import pdb; pdb.set_trace()
    if env_name == 'car':
        policy_old.Q.all_actions_func = K.function(
            [policy_old.Q.model.get_layer('inp').input],
            [policy_old.Q.model.get_layer('all_actions').output])

    if env_name == 'lake':
        policy_printer = PrintPolicy(size=[map_size, map_size], env=env)
        policy_printer.pprint(policy_old)

    #### Problem setup
    if env_name == 'lake':
        best_response_algorithm = LakeFittedQIteration(
            state_space_dim + action_space_dim, [map_size, map_size],
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)

        fitted_off_policy_evaluation_algorithm = LakeFittedQEvaluation(
            initial_states,
            state_space_dim + action_space_dim, [map_size, map_size],
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size)
    elif env_name == 'car':
        best_response_algorithm = CarFittedQIteration(
            state_space_dim,
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack,
            initialization=policy_old,
            freeze_cnn_layers=freeze_cnn_layers)  # for _ in range(2)]
        fitted_off_policy_evaluation_algorithm = CarFittedQEvaluation(
            state_space_dim,
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack
        )  # for _ in range(2*len(constraints_cared_about) + 2)]
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size,
            constraint_thresholds=constraint_thresholds,
            constraints_cared_about=constraints_cared_about)
    else:
        raise ValueError('Unknown environment: %s' % env_name)

    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints), eta, starting_lambda=starting_lambda)
    exploratory_policy_old = StochasticPolicy(
        policy_old,
        action_space_dim,
        exact_policy_algorithm,
        epsilon=deviation_from_old_policy_eps,
        prob=prob)
    problem = Program(
        constraints,
        action_space_dim,
        best_response_algorithm,
        online_convex_algorithm,
        fitted_off_policy_evaluation_algorithm,
        exact_policy_algorithm,
        lambda_bound,
        epsilon,
        env,
        max_number_of_main_algo_iterations,
        num_frame_stack,
        pic_size,
    )

    lambdas = []
    policies = []

    # print exact_policy_algorithm.run(policy_old.Q, to_monitor=True)

    #### Collect Data
    try:
        print 'Loading Prebuilt Data'
        tic = time.time()
        # problem.dataset.data = dd.io.load('%s_data.h5' % env_name)
        # print 'Loaded. Time elapsed: %s' % (time.time() - tic)
        # num of times braking + distance to center of track + zeros
        if env_name == 'car':
            tic = time.time()
            action_data = dd.io.load(
                './seed_2_data/car_data_actions_seed_2.h5')
            frame_data = dd.io.load('./seed_2_data/car_data_frames_seed_2.h5')
            done_data = dd.io.load('./seed_2_data/car_data_is_done_seed_2.h5')
            next_state_data = dd.io.load(
                './seed_2_data/car_data_next_states_seed_2.h5')
            current_state_data = dd.io.load(
                './seed_2_data/car_data_prev_states_seed_2.h5')
            cost_data = dd.io.load('./seed_2_data/car_data_rewards_seed_2.h5')

            frame_gray_scale = np.zeros(
                (len(frame_data), 96, 96)).astype('float32')
            for i in range(len(frame_data)):
                frame_gray_scale[i, :, :] = np.dot(
                    frame_data[i, :, :, :] / 255., [0.299, 0.587, 0.114])

            problem.dataset.data = {
                'frames': frame_gray_scale,
                'prev_states': current_state_data,
                'next_states': next_state_data,
                'a': action_data,
                'c': cost_data[:, 0],
                'g': cost_data[:, 1:],
                'done': done_data
            }

            problem.dataset.data['g'] = problem.dataset.data[
                'g'][:, constraints_cared_about]
            # problem.dataset.data['g'] = (problem.dataset.data['g'] >= constraint_thresholds[:-1]).astype(int)
            print 'Preprocessed g. Time elapsed: %s' % (time.time() - tic)
        else:
            raise ValueError('No prebuilt data for %s' % env_name)  # handled by the except below
    except:
        print 'Failed to load'
        print 'Recreating dataset'
        num_goal = 0
        num_hole = 0
        dataset_size = 0
        main_tic = time.time()
        # from layer_visualizer import LayerVisualizer; LV = LayerVisualizer(exploratory_policy_old.policy.Q.model)
        for i in range(max_epochs):
            tic = time.time()
            x = env.reset()
            problem.collect(x, start=True)
            dataset_size += 1
            if env_name in ['car']: env.render()
            done = False
            time_steps = 0
            episode_cost = 0
            while not done:
                time_steps += 1
                if env_name in ['car']:
                    #
                    # epsilon decay
                    exploratory_policy_old.epsilon = 1. - np.exp(
                        -3 * (i / float(max_epochs)))

                #LV.display_activation([problem.dataset.current_state()[np.newaxis,...], np.atleast_2d(np.eye(12)[0])], 2, 2, 0)
                action = exploratory_policy_old(
                    [problem.dataset.current_state()], x_preprocessed=False)[0]
                cost = []
                for _ in range(frame_skip):
                    if env_name in ['car']: env.render()
                    x_prime, costs, done, _ = env.step(
                        action_space_map[action])
                    cost.append(costs)
                    if done:
                        break
                cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0)
                early_done, punishment = env.is_early_episode_termination(
                    cost=cost[0],
                    time_steps=time_steps,
                    total_cost=episode_cost)
                # print cost, action_space_map[action] #env.car.fuel_spent/ENGINE_POWER, env.tile_visited_count, len(env.track), env.tile_visited_count/float(len(env.track))
                done = done or early_done

                # if done and reward: num_goal += 1
                # if done and not reward: num_hole += 1
                episode_cost += cost[0] + punishment
                c = (cost[0] + punishment).tolist()
                g = cost[1:].tolist()
                if len(g) < len(constraints): g = np.hstack([g, 0])
                problem.collect(
                    action,
                    x_prime,  #np.dot(x_prime/255. , [0.299, 0.587, 0.114]),
                    np.hstack([c, g]).reshape(-1).tolist(),
                    done)  #{(x,a,x',c(x,a), g(x,a)^T, done)}
                dataset_size += 1
                x = x_prime
            if (i % 1) == 0:
                print 'Epoch: %s. Exploration probability: %s' % (
                    i,
                    np.round(exploratory_policy_old.epsilon, 5),
                )
                print 'Dataset size: %s Time Elapsed: %s. Total time: %s' % (
                    dataset_size, time.time() - tic, time.time() - main_tic)
                if env_name in ['car']:
                    print 'Performance: %s/%s = %s' % (
                        env.tile_visited_count, len(env.track),
                        env.tile_visited_count / float(len(env.track)))
                print '*' * 20
        problem.finish_collection(env_name)

    if env_name in ['lake']:
        problem.dataset['x'] = problem.dataset['frames'][
            problem.dataset['prev_states']]
        problem.dataset['x_prime'] = problem.dataset['frames'][
            problem.dataset['next_states']]
        problem.dataset['g'] = problem.dataset['g'][:, 0:1]
        print 'x Distribution:'
        print np.histogram(problem.dataset['x'],
                           bins=np.arange(map_size**2 + 1) - .5)[0].reshape(
                               map_size, map_size)

        print 'x_prime Distribution:'
        print np.histogram(problem.dataset['x_prime'],
                           bins=np.arange(map_size**2 + 1) - .5)[0].reshape(
                               map_size, map_size)

        print 'Number episodes achieved goal: %s. Number episodes fell in hole: %s' % (
            -problem.dataset['c'].sum(axis=0),
            problem.dataset['g'].sum(axis=0)[0])

        number_of_total_state_action_pairs = (state_space_dim - np.sum(
            env.desc == 'H') - np.sum(env.desc == 'G')) * action_space_dim
        number_of_state_action_pairs_seen = len(
            np.unique(np.hstack([
                problem.dataset['x'].reshape(1, -1).T,
                problem.dataset['a'].reshape(1, -1).T
            ]),
                      axis=0))
        print 'Percentage of State/Action space seen: %s' % (
            number_of_state_action_pairs_seen /
            float(number_of_total_state_action_pairs))

    # print 'C(pi_old): %s. G(pi_old): %s' % (exact_policy_algorithm.run(exploratory_policy_old,policy_is_greedy=False, to_monitor=True) )
    ### Solve Batch Constrained Problem

    iteration = 0
    while not problem.is_over(policies,
                              lambdas,
                              infinite_loop=infinite_loop,
                              calculate_gap=calculate_gap,
                              results_name=results_name,
                              policy_improvement_name=policy_improvement_name):
        iteration += 1
        K.clear_session()
        for i in range(1):

            # policy_printer.pprint(policies)
            print '*' * 20
            print 'Iteration %s, %s' % (iteration, i)
            print
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print 'lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i)
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print 'lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(
                    iteration, iteration - 1, lambdas[-1], i)

            lambda_t = lambdas[-1]
            pi_t, values = problem.best_response(lambda_t,
                                                 desc='FQI pi_{0}_{1}'.format(
                                                     iteration, i),
                                                 exact=exact_policy_algorithm)

            # policies.append(pi_t)
            problem.update(pi_t, values,
                           iteration)  #Evaluate C(pi_t), G(pi_t) and save
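Two numeric details of the data-collection loop above, redone as a self-contained sketch: the RGB-to-grayscale conversion with the luminance weights 0.299/0.587/0.114 used in the loader (vectorized here instead of the per-frame loop), and the exploration schedule epsilon = 1 - exp(-3 * i / max_epochs) used for the car environment. The frame count and max_epochs below are illustration values.

# Self-contained sketch of two numeric details from the collection code above.
import numpy as np

# 1) RGB -> grayscale, vectorized over the whole frame stack instead of a loop
frames = np.random.randint(0, 255, size=(100, 96, 96, 3)).astype('float32')
frame_gray_scale = np.dot(frames / 255., [0.299, 0.587, 0.114]).astype('float32')
print(frame_gray_scale.shape)          # (100, 96, 96)

# 2) exploration-probability schedule used for the car env:
#    epsilon rises from 0 toward ~0.95 as the epoch index i approaches max_epochs
max_epochs = 50                        # illustration value
for i in (0, 10, 25, 49):
    epsilon = 1. - np.exp(-3 * (i / float(max_epochs)))
    print(i, round(epsilon, 3))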
Code Example #5
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 num_agents=1,
                 rnd_type='rnd',
                 div_type='oracle',
                 indep_rnd=False,
                 indep_policy=False,
                 sd_type='oracle',
                 rnd_mask_prob=1.):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512

        self.rnd_mask = tf.placeholder(dtype=tf.float32,
                                       shape=(None, None, num_agents),
                                       name="rnd_mask")
        self.new_rnd_mask = tf.placeholder(dtype=tf.float32,
                                           shape=(None, None),
                                           name="new_rnd_mask")
        self.div_train_mask = tf.placeholder(dtype=tf.float32,
                                             shape=(None, None),
                                             name="div_train_mask")
        self.sample_agent_prob = tf.placeholder(dtype=tf.float32,
                                                shape=(
                                                    None,
                                                    None,
                                                ),
                                                name="sample_agent_prob")
        self.stage_label = tf.placeholder(dtype=tf.int32,
                                          shape=(None, None),
                                          name="stage_label")

        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        self.ph_count = tf.placeholder(dtype=tf.float32,
                                       shape=(),
                                       name="obcount")

        self.sep_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=(
                                              None,
                                              None,
                                          ) + ob_space.shape[:2] + (1, ),
                                          name="sep_obmean")
        self.sep_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=(
                                             None,
                                             None,
                                         ) + ob_space.shape[:2] + (1, ),
                                         name="sep_obstd")
        self.sep_ph_count = tf.placeholder(dtype=tf.float32,
                                           shape=(),
                                           name="sep_obcount")

        self.game_score = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None),
                                         name="game_score")
        self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype,
                                          shape=(None, None) +
                                          tuple(ob_space.shape),
                                          name="last_rew_ob")

        self.div_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=list(ob_space.shape[:2]) + [1],
                                          name="div_obmean")
        self.div_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="div_obstd")

        self.idle_agent_label = tf.placeholder(dtype=tf.int32,
                                               shape=(
                                                   None,
                                                   None,
                                               ),
                                               name="idle_agent_label")
        self.rew_agent_label = tf.placeholder(dtype=tf.int32,
                                              shape=(
                                                  None,
                                                  None,
                                              ),
                                              name="rew_agent_label")

        #self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0))

        self.sd_ph_mean = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="sd_obmean")
        self.sd_ph_std = tf.placeholder(dtype=tf.float32,
                                        shape=list(ob_space.shape[:2]) + [1],
                                        name="sd_obstd")

        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement

        self.ob_rms_list = [RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi= not update_ob_stats_independently_per_gpu) \
                                for _ in range(num_agents)]
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        self.diversity_ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]

        self.memsize = memsize
        self.num_agents = num_agents
        self.indep_rnd = indep_rnd
        self.indep_policy = indep_policy

        if num_agents <= 0:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
        else:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_multi_head_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_multi_head_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )

        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            #self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
            self.aux_loss, self.int_rew, self.feat_var, self.max_feat = self.define_multi_head_self_prediction_rew(
                convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)

        self.stage_rnd = tf.constant(1.)
        self.stage_prob = tf.constant(1.)

        if div_type == 'cls':
            with tf.variable_scope("div", reuse=False):
                #self.define_rew_discriminator(convfeat=convfeat, rep_size=256)
                with tf.variable_scope("int", reuse=False):
                    self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = self.define_rew_discriminator_v2(
                        convfeat=convfeat, rep_size=512, use_rew=True)
        else:
            self.div_rew = tf.constant(0.)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
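Code Example #5 introduces an rnd_mask placeholder of shape (nenvs, nsteps, num_agents) and an rnd_mask_prob argument, and every variant carries proportion_of_exp_used_for_predictor_update. One plausible way to build such masks on the feed side, sketched here in numpy, is to keep each transition for the predictor update with a fixed Bernoulli probability; the shapes and probabilities below are illustration values, and the exact masking scheme used by the training code is an assumption.

# Sketch of Bernoulli keep-masks of the kind rnd_mask /
# proportion_of_exp_used_for_predictor_update suggest.
import numpy as np

rng = np.random.default_rng(0)
nenvs, nsteps, num_agents = 32, 128, 4

# keep ~50% of the transitions for each agent's RND predictor update
rnd_mask_prob = 0.5
rnd_mask = (rng.random((nenvs, nsteps, num_agents)) < rnd_mask_prob).astype(np.float32)

# single-agent analogue: keep a fixed proportion of experience for the predictor
proportion_of_exp_used_for_predictor_update = 0.25
keep = (rng.random((nenvs, nsteps)) < proportion_of_exp_used_for_predictor_update)
print(rnd_mask.mean().round(2), keep.mean().round(2))   # roughly 0.5 and 0.25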
Code Example #6
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 action_balance_coef=1.,
                 array_action=True):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        self.action_balance_coef = action_balance_coef
        self.array_action = array_action

        self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        self.rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= self.enlargement
        hidsize *= self.enlargement
        self.convfeat = 16 * self.enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # self.int_rew_ab = None
        # self.int_rew_ab_opt = None
        if self.action_balance_coef is not None:
            # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
            # with tf.device('/cpu:0'):
            self.action_one_hot_rollout = get_action_one_hot(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)

            if self.array_action:
                # with tf.device('/cpu:0'):
                self.action_encode_array_rollout = get_action_encode_array(
                    self.ac_space.n, self.sy_nenvs, self.sy_nsteps,
                    ob_space.shape[:2])
                # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
                #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])

            self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
                self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                               action_one_hot=self.action_one_hot_rollout,
                                               convfeat=self.convfeat,
                                               rep_size=self.rep_size, enlargement=self.enlargement,
                                               sy_nenvs=self.sy_nenvs,
                                               sy_nsteps=self.sy_nsteps,
                                               )
            # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
            #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
            #                                    action_one_hot=self.action_one_hot_list_opt,
            #                                    convfeat=self.convfeat,
            #                                    rep_size=self.rep_size, enlargement=self.enlargement,
            #                                    sy_nenvs=self.sy_nenvs,
            #                                    sy_nsteps=self.sy_nsteps - 1,
            #                                    )

            self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

        # Inputs to policy and value function will have different shapes depending on whether it is rollout
        # or optimization time, so we treat them separately.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=self.convfeat,
                                                rep_size=self.rep_size,
                                                enlargement=self.enlargement)
        else:
            self.define_self_prediction_rew(convfeat=self.convfeat,
                                            rep_size=self.rep_size,
                                            enlargement=self.enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
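All six variants keep two value heads: vpred_int_* for the intrinsic (RND) reward and vpred_ext_* for the extrinsic reward. In RND-style training these heads are fitted to separate returns and their advantages are combined with fixed coefficients before the PPO update; the sketch below shows that combination in numpy, with the coefficient values and array shapes being illustration-only assumptions rather than values read from this code.

# Sketch of combining the two advantage streams that separate
# vpred_int / vpred_ext heads make possible.
import numpy as np

rng = np.random.default_rng(0)
adv_ext = rng.normal(size=(32, 128))        # advantages from the extrinsic head
adv_int = rng.normal(size=(32, 128))        # advantages from the intrinsic (RND) head

ext_coeff, int_coeff = 2.0, 1.0             # example weighting of the two streams
adv = ext_coeff * adv_ext + int_coeff * adv_int

# normalize before the PPO surrogate objective, as is common practice
adv = (adv - adv.mean()) / (adv.std() + 1e-8)
print(adv.shape, round(float(adv.mean()), 6), round(float(adv.std()), 3))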