def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy; the other
        network params are ignored if this is given
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    self._env_spec = env_spec
    if prob_network is None:
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="prob_network",
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = tensor_utils.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def get_value_network(env):
    value_network = ConvNetwork(
        name='value_network',
        input_shape=env.observation_space.shape,
        output_dim=1,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_normalization=False)
    return value_network
def get_policy_network(env):
    policy_network = ConvNetwork(
        name='prob_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.n,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False,
    )
    return policy_network
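A minimal sketch (not part of the original snippets) of how these two builders might be wired into a policy and a baseline; it assumes the same CategoricalMLPPolicy, DeterministicMLPBaseline, and FirstOrderOptimizer classes that the TRPO script further below uses.

# Hypothetical glue code: wrap the policy network in a CategoricalMLPPolicy and the
# value network in a DeterministicMLPBaseline, mirroring the Atari TRPO script below.
def build_agent(env):
    policy = CategoricalMLPPolicy(
        name='policy',
        env_spec=env.spec,
        prob_network=get_policy_network(env))
    baseline = DeterministicMLPBaseline(
        env.spec,
        regressor_args=dict(
            network=get_value_network(env),
            optimizer=FirstOrderOptimizer(max_epochs=3, batch_size=512),
            normalize_inputs=False))
    return policy, baseline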
def _make_subnetwork(self,
                     input_layer,
                     dim_output,
                     hidden_sizes,
                     output_nonlinearity=tf.sigmoid,
                     hidden_nonlinearity=tf.nn.tanh,
                     name="pred_network",
                     conv_filters=None,
                     conv_filter_sizes=None,
                     conv_strides=None,
                     conv_pads=None,
                     input_shape=None):
    if conv_filters is not None:
        input_layer = L.reshape(
            input_layer, ([0], ) + input_shape, name="reshape_input")
        prob_network = ConvNetwork(
            input_shape=input_shape,
            output_dim=dim_output,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name=name,
            input_layer=input_layer,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads)
    else:
        prob_network = MLP(
            output_dim=dim_output,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name=name,
            input_layer=input_layer)
    return prob_network.output_layer
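A hedged example of how this helper might be invoked: the convolutional branch is selected purely by whether conv_filters is supplied, otherwise a plain MLP is built over the flat input layer. The layer name and image shape below are illustrative, not from the original code.

# Illustrative only: build a conv sub-network over assumed 84x84x3 observations.
pred_out = self._make_subnetwork(
    input_layer=l_obs,                  # hypothetical flat observation layer
    dim_output=1,
    hidden_sizes=(64, 64),
    conv_filters=(16, 32),
    conv_filter_sizes=(8, 4),
    conv_strides=(4, 2),
    conv_pads=('VALID', 'VALID'),
    input_shape=(84, 84, 3))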
stub(globals())

# Params range
seeds = range(0, 5)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v14',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))
    env_spec = env.spec

    policy_cnn = ConvNetwork(
        name="policy_conv_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (3, 3), (3, 3), (3, 3)),
        conv_strides=(3, 3, 3, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )

    baseline_cnn = ConvNetwork(
        name="baseline_conv_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (3, 3), (3, 3), (3, 3)),
        conv_strides=(3, 3, 3, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
stub(globals())

# Params range
seeds = range(2, 4)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v6',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))
    env_spec = env.spec

    cnn = ConvNetwork(
        name="conv_feature_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )

    policy = GaussianConvFeaturePolicy(
        "conv_feature_policy",
        env_spec=env_spec,
        feature_network=cnn,
        hidden_sizes=(128, 64),
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = NNBaseline(
def forward_CNN_MLP(self,
                    name,
                    all_params,
                    conv_filters,
                    conv_filter_sizes,
                    conv_strides,
                    conv_pads,
                    conv_output_dim,
                    conv_hidden_sizes,  # new
                    input_tensor=None,
                    batch_normalization=False,
                    reuse=True,
                    is_training=False):
    # is_training and reuse only matter for batch norm; they are irrelevant if
    # batch_normalization is False. Set reuse to False the first time this
    # function is called.
    with tf.variable_scope(name):
        if input_tensor is None:
            l_in = make_input(shape=self.input_total_shape, input_var=None, name='input')
        else:
            l_in = input_tensor

        # Split the flat input into the image part and the state part.
        l_img_in = tf.slice(l_in, [0, 0], [-1, np.prod(self.input_img_shape)])
        l_state_in = tf.slice(l_in, [0, np.prod(self.input_img_shape)], [-1, -1])
        l_normalized_img_in = tf.cast(l_img_in, tf.float32) / 255

        self.cnn = ConvNetwork(
            name=name + "cnn",
            input_shape=self.input_img_shape,
            output_dim=conv_output_dim,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=conv_hidden_sizes,
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=L.spatial_expected_softmax,
            input_var=l_normalized_img_in)

        # Concatenate the state input with the CNN features.
        l_hid = tf.concat(
            [l_state_in, L.get_output(self.cnn.output_layer)], -1,
            'post_conv_input')

        for idx in range(self.n_hidden):
            l_hid = forward_dense_layer(
                l_hid,
                all_params['W' + str(idx)],
                all_params['b' + str(idx)],
                batch_norm=batch_normalization,
                nonlinearity=self.hidden_nonlinearity,
                scope=str(idx),
                reuse=reuse,
                is_training=is_training)
        output = forward_dense_layer(
            l_hid,
            all_params['W' + str(self.n_hidden)],
            all_params['b' + str(self.n_hidden)],
            batch_norm=False,
            nonlinearity=self.output_nonlinearity,
        )
        return l_in, output
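forward_dense_layer is not shown in these snippets. A plausible minimal version, inferred from the call sites above and clearly an assumption rather than the original implementation, would apply externally supplied weights and bias, optional batch normalization, and a nonlinearity:

# Assumed helper (hypothetical): a dense layer applied with parameters passed in
# from all_params, with optional batch normalization.
def forward_dense_layer(inp, W, b, batch_norm=False, nonlinearity=tf.identity,
                        scope='', reuse=True, is_training=False):
    out = tf.matmul(inp, W) + b
    if batch_norm:
        out = tf.layers.batch_normalization(
            out, training=is_training, reuse=reuse, name='bn' + scope)
    return nonlinearity(out) if nonlinearity is not None else out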
def rllab_envpolicy_parser(env, args):
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)
                assert len(strides) == len(chans) == len(filts), \
                    "strides, chans and filts not equal"
                # Only discrete actions are supported; should be straightforward
                # to extend to continuous.
                assert isinstance(env.spec.action_space, Discrete), \
                    "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None

            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    raise NotImplementedError(env.spec.action_space)
            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)
            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)
            assert len(strides) == len(chans) == len(filts), \
                "strides, chans and filts not equal"
            # Only discrete actions are supported; should be straightforward
            # to extend to continuous.
            assert isinstance(env.spec.action_space, Discrete), \
                "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(
                name='policy', env_spec=env.spec, prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                    min_std=args.min_std,
                    name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                    name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)

    elif args.algo[:2] == 'th':
        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None

            if args.recurrent == 'gru':
                if isinstance(env.spec.observation_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.observation_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.observation_space)
            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(
                        env_spec=env.spec,
                        hidden_sizes=tuple(args.policy_hidden),
                        min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    policy = thCategoricalMLPPolicy(
                        env_spec=env.spec,
                        hidden_sizes=tuple(args.policy_hidden),
                        min_std=args.min_std)
                else:
                    raise NotImplementedError(env.spec.action_space)

    if args.control == 'concurrent':
        return env, policies
    else:
        return env, policy
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--conv', action='store_true', default=False)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information '
                             '(in a horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(
                StandardizedEnv(env,
                                scale_reward=args.reward_scale,
                                enable_obsnorm=False),
                mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=args.hidden_sizes[0],
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=args.hidden_sizes[0],
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
def parse_env_args(self, env, args):
    if isinstance(args, dict):
        args = to_named_tuple(args)

    # Multi-agent wrapper
    env = RLLabEnv(env, ma_mode=args.control)
    env = MATfEnv(env)

    # Policy
    if args.recurrent:
        if args.feature_net:
            feature_network = MLP(
                name='feature_net',
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim, ),
                output_dim=args.feature_output,
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)
            assert len(strides) == len(chans) == len(filts), \
                "strides, chans and filts not equal"
            # Only discrete actions are supported; should be straightforward
            # to extend to continuous.
            assert isinstance(env.spec.action_space, Discrete), \
                "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=args.feature_output,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=None)
        else:
            feature_network = None

        if args.recurrent == 'gru':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    policies = [
                        GaussianGRUPolicy(env_spec=env.spec,
                                          feature_network=feature_network,
                                          hidden_dim=int(args.policy_hidden[0]),
                                          name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianGRUPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(args.policy_hidden[0]),
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalGRUPolicy(
                            env_spec=env.spec,
                            feature_network=feature_network,
                            hidden_dim=int(args.policy_hidden[0]),
                            name='policy_{}'.format(agid),
                            state_include_action=False if args.conv else True)
                        for agid in range(len(env.agents))
                    ]
                q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='q_network',
                    state_include_action=False if args.conv else True)
                target_q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='target_q_network',
                    state_include_action=False if args.conv else True)
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                raise NotImplementedError(env.spec.action_space)
        elif args.recurrent == 'lstm':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    policies = [
                        GaussianLSTMPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(args.policy_hidden),
                                           name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianLSTMPolicy(env_spec=env.spec,
                                            feature_network=feature_network,
                                            hidden_dim=int(args.policy_hidden),
                                            name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalLSTMPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(args.policy_hidden),
                                              name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='q_network')
                target_q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='target_q_network')
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                raise NotImplementedError(env.spec.action_space)
        else:
            raise NotImplementedError(args.recurrent)
    elif args.conv:
        strides = tuple(args.conv_strides)
        chans = tuple(args.conv_channels)
        filts = tuple(args.conv_filters)
        assert len(strides) == len(chans) == len(filts), \
            "strides, chans and filts not equal"
        # Only discrete actions are supported; should be straightforward
        # to extend to continuous.
        assert isinstance(env.spec.action_space, Discrete), \
            "Only discrete action spaces support conv"
        feature_network = ConvNetwork(
            name='feature_net',
            input_shape=env.spec.observation_space.shape,
            output_dim=env.spec.action_space.n,
            conv_filters=chans,
            conv_filter_sizes=filts,
            conv_strides=strides,
            conv_pads=(args.conv_pads, ) * len(chans),
            hidden_sizes=tuple(args.policy_hidden),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.softmax,
            batch_normalization=args.batch_normalization)
        if args.algo == 'dqn':
            q_network = CategoricalMLPPolicy(name='q_network',
                                             env_spec=env.spec,
                                             prob_network=feature_network)
            target_q_network = CategoricalMLPPolicy(
                name='target_q_network',
                env_spec=env.spec,
                prob_network=feature_network)
            policy = {
                'q_network': q_network,
                'target_q_network': target_q_network
            }
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
    else:
        if env.spec is None:
            networks = [
                DQNNetwork(i,
                           env,
                           target_network_update_freq=self.args.target_network_update,
                           discount_factor=self.args.discount,
                           batch_size=self.args.batch_size,
                           learning_rate=self.args.qfunc_lr)
                for i in range(env.n)
            ]
            policy = networks
        elif isinstance(env.spec.action_space, Box):
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=tuple(args.policy_hidden),
                                       min_std=args.min_std,
                                       name='policy')
        elif isinstance(env.spec.action_space, Discrete):
            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=tuple(args.policy_hidden),
                                          name='policy')
        else:
            raise NotImplementedError(env.spec.action_space)

    return env, policy
def __init__(self,
             name,
             env_spec,
             conv_filters,
             conv_filter_sizes,
             conv_strides,
             conv_pads,
             hidden_sizes=(32, 32),
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             min_std=1e-6,
             std_hidden_nonlinearity=tf.nn.tanh,
             hidden_nonlinearity=tf.nn.tanh,
             output_nonlinearity=None,
             mean_network=None,
             std_network=None,
             std_parametrization='exp'):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: whether the std is trainable
    :param init_std: initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: lower bound on the std, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std is stored, and an exponential transformation is applied
        - softplus: the std is computed as log(1 + exp(x))
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    with tf.variable_scope(name):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.flat_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=self.linear,
                name="prob_network",
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_std_param = std_network.output_layer
        else:
            if adaptive_std:
                std_network = ConvNetwork(
                    input_shape=env_spec.observation_space.shape,
                    output_dim=action_dim,
                    conv_filters=conv_filters,
                    conv_filter_sizes=conv_filter_sizes,
                    conv_strides=conv_strides,
                    conv_pads=conv_pads,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=self.linear,
                    name="std_network",
                )
                l_std_param = std_network.output_layer
            else:
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                l_std_param = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )

        self.std_parametrization = std_parametrization

        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        self._l_mean = l_mean
        self._l_std_param = l_std_param

        self._dist = DiagonalGaussian(action_dim)

        LayersPowered.__init__(self, [l_mean, l_std_param])
        super(GaussianConvPolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(
            mean_network.input_layer.input_var, dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
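A hedged sketch of how the compiled _f_dist above is typically consumed to sample actions, modeled on rllab's standard get_action; the observation_space.flatten call is an assumption about the surrounding class, not part of this snippet.

# Illustrative only: draw one action from the diagonal Gaussian defined by the
# compiled mean / log_std outputs.
def get_action(self, observation):
    flat_obs = self.observation_space.flatten(observation)
    mean, log_std = [x[0] for x in self._f_dist([flat_obs])]
    rnd = np.random.normal(size=mean.shape)
    action = rnd * np.exp(log_std) + mean
    return action, dict(mean=mean, log_std=log_std)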
def main(_):
    env = TfEnv(
        AtariEnv(args.env,
                 force_reset=True,
                 record_video=False,
                 record_log=False,
                 resize_size=args.resize_size,
                 atari_noop=args.atari_noop,
                 atari_eplife=args.atari_eplife,
                 atari_firereset=args.atari_firereset))

    policy_network = ConvNetwork(
        name='prob_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.n,
        # number of channels/filters for each conv layer
        conv_filters=(16, 32),
        # filter size
        conv_filter_sizes=(8, 4),
        conv_strides=(4, 2),
        conv_pads=('VALID', 'VALID'),
        hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False)
    policy = CategoricalMLPPolicy(name='policy',
                                  env_spec=env.spec,
                                  prob_network=policy_network)

    if args.value_function == 'zero':
        baseline = ZeroBaseline(env.spec)
    else:
        value_network = get_value_network(env)
        baseline_batch_size = args.batch_size * 10

        if args.value_function == 'conj':
            baseline_optimizer = ConjugateGradientOptimizer(
                subsample_factor=1.0, num_slices=args.num_slices)
        elif args.value_function == 'adam':
            baseline_optimizer = FirstOrderOptimizer(
                max_epochs=3,
                batch_size=512,
                num_slices=args.num_slices,
                verbose=True)
        else:
            logger.log("Inappropriate value function")
            exit(0)

        '''
        baseline = GaussianMLPBaseline(
            env.spec,
            num_slices=args.num_slices,
            regressor_args=dict(
                step_size=0.01,
                mean_network=value_network,
                optimizer=baseline_optimizer,
                subsample_factor=1.0,
                batchsize=baseline_batch_size,
                use_trust_region=False
            )
        )
        '''
        baseline = DeterministicMLPBaseline(
            env.spec,
            num_slices=args.num_slices,
            regressor_args=dict(network=value_network,
                                optimizer=baseline_optimizer,
                                normalize_inputs=False))

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=4500,
                n_itr=args.n_itr,
                discount=args.discount_factor,
                step_size=args.step_size,
                clip_reward=(not args.reward_no_scale),
                optimizer_args={
                    "subsample_factor": 1.0,
                    "num_slices": args.num_slices
                }
                # plot=True
                )

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.n_cpu,
                            inter_op_parallelism_threads=args.n_cpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    algo.train(sess)