def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=(),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the MDP.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; if given, the
            other network parameters are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec
        if prob_network is None:
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="prob_network",
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LayersPowered.__init__(self, [prob_network.output_layer])
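A minimal usage sketch (not from the source): constructing this policy for an rllab-style TfEnv whose observations are images and whose action space is Discrete. The layer sizes below are illustrative assumptions.

    policy = CategoricalConvPolicy(
        name='policy',
        env_spec=env.spec,
        conv_filters=(16, 32),         # channels per conv layer (assumed)
        conv_filter_sizes=(8, 4),      # square filter sizes (assumed)
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256,),           # one fully connected layer after the convs
    )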
Example 2
def get_value_network(env):
    value_network = ConvNetwork(
        name='value_network',
        input_shape=env.observation_space.shape,
        output_dim=1,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_normalization=False)
    return value_network
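Since output_dim=1 and output_nonlinearity=None, this network emits a single unbounded scalar per observation, which makes it usable as the network of a value baseline. A sketch mirroring the DeterministicMLPBaseline usage in Example 12 (the optimizer settings here are assumptions):

    baseline = DeterministicMLPBaseline(
        env.spec,
        regressor_args=dict(
            network=get_value_network(env),
            optimizer=FirstOrderOptimizer(max_epochs=3, batch_size=512),
            normalize_inputs=False))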
Example 3
def get_policy_network(env):
    policy_network = ConvNetwork(
        name='prob_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.n,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False,
    )
    return policy_network
Example 4
    def _make_subnetwork(self,
                         input_layer,
                         dim_output,
                         hidden_sizes,
                         output_nonlinearity=tf.sigmoid,
                         hidden_nonlinearity=tf.nn.tanh,
                         name="pred_network",
                         conv_filters=None,
                         conv_filter_sizes=None,
                         conv_strides=None,
                         conv_pads=None,
                         input_shape=None):

        if conv_filters is not None:
            input_layer = L.reshape(input_layer, ([0], ) + input_shape,
                                    name="reshape_input")
            prob_network = ConvNetwork(input_shape=input_shape,
                                       output_dim=dim_output,
                                       hidden_sizes=hidden_sizes,
                                       hidden_nonlinearity=hidden_nonlinearity,
                                       output_nonlinearity=output_nonlinearity,
                                       name=name,
                                       input_layer=input_layer,
                                       conv_filters=conv_filters,
                                       conv_filter_sizes=conv_filter_sizes,
                                       conv_strides=conv_strides,
                                       conv_pads=conv_pads)
        else:
            prob_network = MLP(output_dim=dim_output,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=hidden_nonlinearity,
                               output_nonlinearity=output_nonlinearity,
                               name=name,
                               input_layer=input_layer)

        return prob_network.output_layer
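A hedged sketch of calling the conv branch, assuming a flat input layer (here named l_obs) that holds a flattened image; the shapes and sizes are illustrative, not from the source:

    pred_layer = self._make_subnetwork(
        input_layer=l_obs,             # flat (batch, prod(input_shape)) layer, assumed
        dim_output=1,
        hidden_sizes=(64,),
        conv_filters=(16, 32),
        conv_filter_sizes=(4, 4),
        conv_strides=(2, 2),
        conv_pads=('VALID', 'VALID'),
        input_shape=(84, 84, 3))       # reshaped back to an image before the convs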
Example 5
stub(globals())

# Params range
seeds = range(0, 5)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v14',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))

    env_spec = env.spec
    policy_cnn = ConvNetwork(
        name="policy_conv_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (3, 3), (3, 3), (3, 3)),
        conv_strides=(3, 3, 3, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )

    baseline_cnn = ConvNetwork(
        name="baseline_conv_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (3, 3), (3, 3), (3, 3)),
        conv_strides=(3, 3, 3, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        # remaining arguments assumed to mirror policy_cnn above (source snippet is truncated)
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )
Example 6
stub(globals())

# Params range
seeds = range(2, 4)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v6',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))

    env_spec = env.spec
    cnn = ConvNetwork(
        name="conv_feature_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )

    policy = GaussianConvFeaturePolicy(
        "conv_feature_policy",
        env_spec=env_spec,
        feature_network=cnn,
        hidden_sizes=(128, 64),
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = NNBaseline(
Example 7
    def forward_CNN_MLP(
            self,
            name,
            all_params,
            conv_filters,
            conv_filter_sizes,
            conv_strides,
            conv_pads,
            conv_output_dim,
            conv_hidden_sizes,  # new
            input_tensor=None,
            batch_normalization=False,
            reuse=True,
            is_training=False):
        # is_training and reuse only matter for batch norm; they are irrelevant if
        # batch_normalization is False. Set reuse to False the first time this function is called.
        with tf.variable_scope(name):
            if input_tensor is None:
                l_in = make_input(shape=self.input_total_shape,
                                  input_var=None,
                                  name='input')
            else:
                l_in = input_tensor

            # split the flat input into the image prefix and the state suffix
            l_img_in = tf.slice(l_in, [0, 0],
                                [-1, np.prod(self.input_img_shape)])
            l_state_in = tf.slice(l_in, [0, np.prod(self.input_img_shape)],
                                  [-1, -1])
            # scale pixel values to [0, 1]
            l_normalized_img_in = tf.cast(l_img_in, tf.float32) / 255
            self.cnn = ConvNetwork(
                name=name + "cnn",
                input_shape=self.input_img_shape,
                output_dim=conv_output_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=conv_hidden_sizes,
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=L.spatial_expected_softmax,
                input_var=l_normalized_img_in)
            l_hid = tf.concat(
                [l_state_in, L.get_output(self.cnn.output_layer)], -1,
                'post_conv_input')
            for idx in range(self.n_hidden):
                l_hid = forward_dense_layer(
                    l_hid,
                    all_params['W' + str(idx)],
                    all_params['b' + str(idx)],
                    batch_norm=batch_normalization,
                    nonlinearity=self.hidden_nonlinearity,
                    scope=str(idx),
                    reuse=reuse,
                    is_training=is_training)
            output = forward_dense_layer(
                l_hid,
                all_params['W' + str(self.n_hidden)],
                all_params['b' + str(self.n_hidden)],
                batch_norm=False,
                nonlinearity=self.output_nonlinearity,
            )
            return l_in, output
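The input tensor is a flat concatenation [flattened image | state vector], and the two tf.slice calls split it back apart before the image half is normalized and fed to the ConvNetwork. A NumPy analogue of that split, with illustrative shapes:

    import numpy as np
    img_shape = (32, 32, 3)                       # stands in for self.input_img_shape
    obs = np.zeros((8, np.prod(img_shape) + 10))  # batch of 8: pixels first, then a 10-dim state
    img = obs[:, :np.prod(img_shape)]             # -> ConvNetwork input (after casting and /255)
    state = obs[:, np.prod(img_shape):]           # -> concatenated with the conv features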
Example 8
def rllab_envpolicy_parser(env, args):
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts not equal"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    raise NotImplementedError(env.spec.action_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)
    elif args.algo[:2] == 'th':
        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.action_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.action_space)

            # LSTM policies are not implemented for the Theano backend, so 'lstm'
            # falls through to the NotImplementedError below.

            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(env_spec=env.spec,
                                                 hidden_sizes=tuple(
                                                     args.policy_hidden),
                                                 min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    # categorical policies have no std parameter, so min_std is not passed
                    policy = thCategoricalMLPPolicy(env_spec=env.spec,
                                                    hidden_sizes=tuple(
                                                        args.policy_hidden))
                else:
                    raise NotImplementedError(env.spec.action_space)

    if args.control == 'concurrent':
        # NOTE: assumes per-agent 'policies' are built for the concurrent case,
        # as in parse_env_args (Example 10)
        return env, policies
    else:
        return env, policy
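A sketch of the args this parser reads for the non-recurrent TF conv branch; the field names are inferred from the attribute accesses above, and the values are illustrative assumptions:

    args = dict(algo='tftrpo', control='decentralized', recurrent=None, feature_net=False,
                conv=True, conv_strides=[4, 2], conv_channels=[16, 32], conv_filters=[8, 4],
                policy_hidden=[256], min_std=1e-6)
    env, policy = rllab_envpolicy_parser(env, args)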
Example 9
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=args.hidden_sizes[0],
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=args.hidden_sizes[0],
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
Example 10
    def parse_env_args(self, env, args):

        if isinstance(args, dict):
            args = to_named_tuple(args)

        # Multi-agent wrapper
        env = RLLabEnv(env, ma_mode=args.control)
        env = MATfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts not equal"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden[0]),
                                              name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalGRUPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden[0]),
                                name='policy_{}'.format(agid),
                                state_include_action=False
                                if args.conv else True)
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='q_network',
                        state_include_action=False if args.conv else True)
                    target_q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='target_q_network',
                        state_include_action=False if args.conv else True)
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden),
                                               name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalLSTMPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden),
                                name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='q_network')
                    target_q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='target_q_network')
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts not equal"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=(args.conv_pads, ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax,
                batch_normalization=args.batch_normalization)
            if args.algo == 'dqn':
                q_network = CategoricalMLPPolicy(name='q_network',
                                                 env_spec=env.spec,
                                                 prob_network=feature_network)
                target_q_network = CategoricalMLPPolicy(
                    name='target_q_network',
                    env_spec=env.spec,
                    prob_network=feature_network)
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }

            else:
                policy = CategoricalMLPPolicy(name='policy',
                                              env_spec=env.spec,
                                              prob_network=feature_network)
        else:
            if env.spec is None:

                networks = [
                    DQNNetwork(i,
                               env,
                               target_network_update_freq=self.args.
                               target_network_update,
                               discount_factor=self.args.discount,
                               batch_size=self.args.batch_size,
                               learning_rate=self.args.qfunc_lr)
                    for i in range(env.n)
                ]

                policy = networks

            elif isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)

        return env, policy
Example 11
    def __init__(self,
                 name,
                 env_spec,
                 conv_filters,
                 conv_filter_sizes,
                 conv_strides,
                 conv_pads,
                 hidden_sizes=(32, 32),
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_hidden_sizes=(32, 32),
                 min_std=1e-6,
                 std_hidden_nonlinearity=tf.nn.tanh,
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 mean_network=None,
                 std_network=None,
                 std_parametrization='exp'):
        """
        :param env_spec: A spec for the MDP.
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: whether the std is trainable
        :param init_std: initial std
        :param adaptive_std: whether to predict the std with a separate network
        :param std_share_network: whether the std network shares layers with the mean network
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum value for the std, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std is stored, and an exponential transformation is applied
            - softplus: the std is computed as log(1 + exp(x))
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        with tf.variable_scope(name):

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            # create network
            if mean_network is None:
                mean_network = ConvNetwork(
                    input_shape=env_spec.observation_space.shape,
                    output_dim=env_spec.action_space.flat_dim,
                    conv_filters=conv_filters,
                    conv_filter_sizes=conv_filter_sizes,
                    conv_strides=conv_strides,
                    conv_pads=conv_pads,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    name="prob_network",
                )

            self._mean_network = mean_network

            l_mean = mean_network.output_layer
            obs_var = mean_network.input_layer.input_var

            if std_network is not None:
                l_std_param = std_network.output_layer
            else:
                if adaptive_std:
                    std_network = ConvNetwork(
                        input_shape=env_spec.observation_space.shape,
                        # the action space is a Box (asserted above), so use flat_dim, not .n
                        output_dim=env_spec.action_space.flat_dim,
                        conv_filters=conv_filters,
                        conv_filter_sizes=conv_filter_sizes,
                        conv_strides=conv_strides,
                        conv_pads=conv_pads,
                        hidden_sizes=std_hidden_sizes,
                        hidden_nonlinearity=std_hidden_nonlinearity,
                        output_nonlinearity=None,
                        name="std_network",
                    )
                    l_std_param = std_network.output_layer
                else:
                    if std_parametrization == 'exp':
                        init_std_param = np.log(init_std)
                    elif std_parametrization == 'softplus':
                        init_std_param = np.log(np.exp(init_std) - 1)
                    else:
                        raise NotImplementedError
                    l_std_param = L.ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=tf.constant_initializer(init_std_param),
                        name="output_std_param",
                        trainable=learn_std,
                    )

            self.std_parametrization = std_parametrization

            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param


            self._l_mean = l_mean
            self._l_std_param = l_std_param

            self._dist = DiagonalGaussian(action_dim)

            LayersPowered.__init__(self, [l_mean, l_std_param])
            super(GaussianConvPolicy, self).__init__(env_spec)

            dist_info_sym = self.dist_info_sym(
                mean_network.input_layer.input_var, dict())
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            self._f_dist = tensor_utils.compile_function(
                inputs=[obs_var],
                outputs=[mean_var, log_std_var],
            )
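The two parametrizations store different quantities: with 'exp' the parameter is log(std) and std = exp(param); with 'softplus' std = log(1 + exp(param)). A quick numeric check of the inverse mappings used above for init_std and min_std:

    import numpy as np
    init_std = 1.0
    assert np.isclose(np.exp(np.log(init_std)), init_std)       # 'exp' round trip
    param = np.log(np.exp(init_std) - 1)                        # inverse softplus
    assert np.isclose(np.log(1 + np.exp(param)), init_std)      # 'softplus' round trip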
Example 12
def main(_):

    env = TfEnv(
        AtariEnv(args.env,
                 force_reset=True,
                 record_video=False,
                 record_log=False,
                 resize_size=args.resize_size,
                 atari_noop=args.atari_noop,
                 atari_eplife=args.atari_eplife,
                 atari_firereset=args.atari_firereset))

    policy_network = ConvNetwork(
        name='prob_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.n,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False)
    policy = CategoricalMLPPolicy(name='policy',
                                  env_spec=env.spec,
                                  prob_network=policy_network)

    if args.value_function == 'zero':
        baseline = ZeroBaseline(env.spec)
    else:
        value_network = get_value_network(env)
        baseline_batch_size = args.batch_size * 10

        if args.value_function == 'conj':
            baseline_optimizer = ConjugateGradientOptimizer(
                subsample_factor=1.0, num_slices=args.num_slices)
        elif args.value_function == 'adam':
            baseline_optimizer = FirstOrderOptimizer(
                max_epochs=3,
                batch_size=512,
                num_slices=args.num_slices,
                verbose=True)
        else:
            logger.log("Inappropirate value function")
            exit(0)
        # Alternative: a GaussianMLPBaseline over value_network (step_size=0.01,
        # use_trust_region=False, batchsize=baseline_batch_size) could replace the
        # deterministic baseline below.
        baseline = DeterministicMLPBaseline(env.spec,
                                            num_slices=args.num_slices,
                                            regressor_args=dict(
                                                network=value_network,
                                                optimizer=baseline_optimizer,
                                                normalize_inputs=False))

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=4500,
                n_itr=args.n_itr,
                discount=args.discount_factor,
                step_size=args.step_size,
                clip_reward=(not args.reward_no_scale),
                optimizer_args={
                    "subsample_factor": 1.0,
                    "num_slices": args.num_slices
                }
                #       plot=True
                )

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.n_cpu,
                            inter_op_parallelism_threads=args.n_cpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
    algo.train(sess)
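The main(_) signature suggests the script is launched through tf.app.run, which forwards parsed flags; a typical entry point (an assumption, not shown in the source) would be:

    if __name__ == '__main__':
        tf.app.run(main)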