def validate_probtype(probtype, pdparam): N = 100000 # Check to see if mean negative log likelihood == differential entropy Mval = np.repeat(pdparam[None, :], N, axis=0) M = probtype.param_placeholder([N]) X = probtype.sample_placeholder([N]) pd = probtype.pdclass()(M) calcloglik = U.function([X, M], pd.logp(X)) calcent = U.function([M], pd.entropy()) Xval = U.eval(pd.sample(), feed_dict={M: Mval}) logliks = calcloglik(Xval, Mval) entval_ll = -logliks.mean() #pylint: disable=E1101 entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 entval = calcent(Mval).mean() #pylint: disable=E1101 assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] M2 = probtype.param_placeholder([N]) pd2 = probtype.pdclass()(M2) q = pdparam + np.random.randn(pdparam.size) * 0.1 Mval2 = np.repeat(q[None, :], N, axis=0) calckl = U.function([M, M2], pd.kl(pd2)) klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 logliks = calcloglik(Xval, Mval2) klval_ll = -entval - logliks.mean() #pylint: disable=E1101 klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
def __init__(self, epsilon=1e-2, shape=()):
    """Running mean/std tracker backed by TF variables.

    Keeps a running sum, sum of squares, and count (all float64) from which
    ``self.mean`` and ``self.std`` are derived as float32 tensors.

    :param epsilon: small constant used to initialize sumsq and count,
        avoiding division by zero / zero variance at startup
    :param shape: shape of the tracked data stream's elements
    """
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum",
        trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count",
        trainable=False)
    self.shape = shape
    self.mean = tf.to_float(self._sum / self._count)
    # Variance is clamped at 1e-2 so std never collapses to ~0.
    self.std = tf.sqrt(
        tf.maximum(
            tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))
    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    # Callable that accumulates a batch's (sum, sumsq, count) into the stats.
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [],
        updates=[tf.assign_add(self._sum, newsum),
                 tf.assign_add(self._sumsq, newsumsq),
                 tf.assign_add(self._count, newcount)])
def __init__(self, *args, **kwargs):
    """Build the policy graph and flat get/set machinery.

    Delegates graph construction to ``self._initialize`` (implemented by the
    subclass), then collects the variables created under the returned scope,
    logs their shapes, and builds ``set_all_vars`` for bulk assignment.
    """
    self.args, self.kwargs = args, kwargs
    self.scope = self._initialize(*args, **kwargs)
    # All variables (including non-trainable stats) under this policy's scope.
    self.all_variables = tf.get_collection(tf.GraphKeys.VARIABLES, self.scope.name)
    self.trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope.name)
    self.num_params = sum(int(np.prod(v.get_shape().as_list())) for v in self.trainable_variables)
    # Flat-vector accessors over the trainable parameters only.
    self._setfromflat = U.SetFromFlat(self.trainable_variables)
    self._getflat = U.GetFlat(self.trainable_variables)
    logger.info('Trainable variables ({} parameters)'.format(self.num_params))
    for v in self.trainable_variables:
        shp = v.get_shape().as_list()
        logger.info('- {} shape:{} size:{}'.format(v.name, shp, np.prod(shp)))
    logger.info('All variables')
    for v in self.all_variables:
        shp = v.get_shape().as_list()
        logger.info('- {} shape:{} size:{}'.format(v.name, shp, np.prod(shp)))
    # One placeholder per variable; set_all_vars(*arrays) assigns them all.
    placeholders = [tf.placeholder(v.value().dtype, v.get_shape().as_list()) for v in self.all_variables]
    self.set_all_vars = U.function(
        inputs=placeholders,
        outputs=[],
        updates=[tf.group(*[v.assign(p) for v, p in zip(self.all_variables, placeholders)])]
    )
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an MLP actor-critic graph (separate value and policy towers).

    :param ob_space: gym.spaces.Box observation space
    :param ac_space: action space; drives the distribution type
    :param hid_size: width of each hidden layer
    :param num_hid_layers: number of tanh hidden layers per tower
    :param gaussian_fixed_var: for Box actions, use a state-independent
        learned log-std instead of predicting it from the network
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    # Observation filtering is disabled here; the raw ob feeds both towers.
    #obz = ob
    #with tf.variable_scope("obfilter"):
    #    self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    # Value-function tower; [:, 0] squeezes the (batch, 1) output to (batch,).
    last_out = ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
    # Policy tower.
    last_out = ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        # First half of the flat params is the mean; second half a shared logstd.
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        # NOTE(review): tf.zeros_initializer is passed uncalled; newer TF 1.x
        # expects tf.zeros_initializer() — confirm against the pinned TF version.
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer)
        # mean * 0.0 + logstd broadcasts the shared logstd across the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    # stochastic=True samples the distribution; False takes the mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def make_update_exp(vals, target_vals):
    """Build a callable that Polyak-averages source vars into target vars.

    Variables are paired by sorted name, and each target is moved a small
    step (1%) toward its source: target <- polyak*target + (1-polyak)*source.

    :param vals: source variables
    :param target_vals: target variables to update in place
    :return: zero-argument callable performing one soft update
    """
    polyak = 1.0 - 1e-2
    pairs = zip(sorted(vals, key=lambda v: v.name),
                sorted(target_vals, key=lambda v: v.name))
    soft_updates = [
        tgt.assign(polyak * tgt + (1.0 - polyak) * src)
        for src, tgt in pairs
    ]
    return U.function([], [], updates=[tf.group(*soft_updates)])
def train(self, policy, S, A, epochs, batch_size):
    """Behavioral-cloning training loop: regress policy output onto actions.

    :param policy: (ob_placeholder, actor_output) pair
    :param S: observations, indexed along axis 0
    :param A: actions with shape (num_samples, A.shape[1], A.shape[2])
    :param epochs: passes over the shuffled dataset
    :param batch_size: minibatch size (trailing partial batch is dropped)
    :return: compiled callable mapping observations -> actor output
    """
    ac = tf.placeholder(name='expected_actions', dtype=tf.float32,
                        shape=(None, A.shape[1], A.shape[2]))
    ob, actor = policy
    actor = tf.reshape(actor, shape=np.array([-1, A.shape[1], A.shape[2]]))
    error = tf.reduce_mean(0.5 * tf.square(actor - ac))
    opt = tf.train.AdamOptimizer(learning_rate=3e-4).minimize(error)
    sess = tf.get_default_session()
    # NOTE(review): this re-initializes ALL global variables, which would wipe
    # any weights restored into this session before calling train — confirm
    # callers always train from scratch.
    sess.run(tf.global_variables_initializer())
    number_of_batches = S.shape[0] // batch_size
    sample_index = np.arange(S.shape[0])
    for i in range(epochs):
        np.random.shuffle(sample_index)
        pbar = tqdm(range(number_of_batches))
        for k in pbar:
            batch_index = sample_index[batch_size * k:batch_size * (k + 1)]
            s_batch = S[batch_index, :]
            a_batch = A[batch_index, :]
            _, mse_run = sess.run([opt, error], feed_dict={ob: s_batch, ac: a_batch})
            pbar.set_description("Loss %s" % str(mse_run))
    return tf_util.function([ob], actor)
def load_policy(filename):
    """Load a pickled openai/imitation GaussianPolicy and compile it to a TF callable.

    :param filename: path to the pickled policy snapshot
    :return: callable mapping a (batch, obs_dim) array to (batch, act_dim) actions
    :raises AssertionError: if the pickle does not match the expected schema
    """
    with open(filename, 'rb') as f:
        # NOTE(review): pickle.loads executes arbitrary code — only load trusted files.
        data = pickle.loads(f.read())

    # assert len(data.keys()) == 2
    nonlin_type = data['nonlin_type']
    # The single non-'nonlin_type' key names the policy class.
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]
    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Keep track of input and output dims (i.e. observation and action dims) for the user

    def build_policy(obs_bo):
        # Build the deterministic mean network from the pickled weights.
        def read_layer(l):
            # Each layer dict holds a single AffineLayer with W and b.
            assert list(l.keys()) == ['AffineLayer']
            assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
            return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32)

        def apply_nonlin(x):
            if nonlin_type == 'lrelu':
                return tf_util.lrelu(x, leak=.01)  # openai/imitation nn.py:233
            elif nonlin_type == 'tanh':
                return tf.tanh(x)
            else:
                raise NotImplementedError(nonlin_type)

        # Build the policy. First, observation normalization.
        assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']
        obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
        # std = sqrt(E[x^2] - E[x]^2), clamped at 0 for numerical safety.
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
        normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6)  # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
        curr_activations_bd = normedobs_bo

        # Hidden layers next
        assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            l = layer_params[layer_name]
            W, b = read_layer(l)
            print(layer_name, W.shape, b.shape, nonlin_type)
            curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)

        # Output layer (linear — no nonlinearity on the action head).
        W, b = read_layer(policy_params['out'])
        output_bo = tf.matmul(curr_activations_bd, W) + b
        print('out', W.shape, b.shape, 'None')
        return output_bo

    obs_bo = tf.placeholder(tf.float32, [None, None])
    a_ba = build_policy(obs_bo)
    policy_fn = tf_util.function([obs_bo], a_ba)
    return policy_fn
def _initialize(self, ob_space, ac_space, ac_bins, ac_noise_std, nonlin_type, hidden_dims, connection_type):
    """Build the policy graph: observation-normalization variables plus the net.

    :param ob_space: 1-D observation space
    :param ac_space: 1-D, bounded action space
    :param ac_bins: action discretization spec (used by _make_net)
    :param ac_noise_std: action-noise std (stored for later use)
    :param nonlin_type: one of 'tanh' | 'relu' | 'lrelu' | 'elu'
    :param hidden_dims: hidden-layer sizes (used by _make_net)
    :param connection_type: network wiring spec (used by _make_net)
    :return: the tf variable scope containing all created variables
    """
    self.ac_space = ac_space
    self.ac_bins = ac_bins
    self.ac_noise_std = ac_noise_std
    self.hidden_dims = hidden_dims
    self.connection_type = connection_type
    assert len(ob_space.shape) == len(self.ac_space.shape) == 1
    assert (np.all(np.isfinite(self.ac_space.low)) and np.all(
        np.isfinite(self.ac_space.high))), "Action bounds required"
    self.nonlin = {
        'tanh': tf.tanh,
        'relu': tf.nn.relu,
        'lrelu': U.lrelu,
        'elu': tf.nn.elu
    }[nonlin_type]
    with tf.variable_scope(type(self).__name__) as scope:
        # Observation normalization. Initialized to NaN so acting before
        # _set_ob_mean_std is called produces visibly-broken (NaN) outputs.
        ob_mean = tf.get_variable('ob_mean', ob_space.shape, tf.float32,
                                  tf.constant_initializer(np.nan), trainable=False)
        ob_std = tf.get_variable('ob_std', ob_space.shape, tf.float32,
                                 tf.constant_initializer(np.nan), trainable=False)
        in_mean = tf.placeholder(tf.float32, ob_space.shape)
        in_std = tf.placeholder(tf.float32, ob_space.shape)
        self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
            tf.assign(ob_mean, in_mean),
            tf.assign(ob_std, in_std),
        ])
        # Policy network: observations are normalized and clipped to [-5, 5].
        o = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
        a = self._make_net(
            tf.clip_by_value((o - ob_mean) / ob_std, -5.0, 5.0))
        self._act = U.function([o], a)
    return scope
def getPolicy(_weights, _biases):
    """Compile a 2-hidden-layer sigmoid MLP policy from weight/bias dicts.

    :param _weights: dict with keys 'h1', 'h2', 'out' (weight matrices)
    :param _biases: dict with keys 'b1', 'b2', 'out' (bias vectors)
    :return: callable mapping a (batch, obs_dim) array to actions
    """
    obs_bo = tf.placeholder(tf.float32, [None, None])
    hidden = obs_bo
    # Two sigmoid hidden layers, keys paired as (weight, bias).
    for w_key, b_key in (('h1', 'b1'), ('h2', 'b2')):
        hidden = tf.nn.sigmoid(
            tf.add(tf.matmul(hidden, _weights[w_key]), _biases[b_key]))
    # Linear output head.
    a_ba = tf.matmul(hidden, _weights['out']) + _biases['out']
    return tf_util.function([obs_bo], a_ba)
def load_policy(filename):
    """Load a pickled openai/imitation GaussianPolicy and compile it to a TF callable.

    :param filename: path to the pickled policy snapshot
    :return: callable mapping a (batch, obs_dim) array to (batch, act_dim) actions
    :raises AssertionError: if the pickle does not match the expected schema
    """
    with open(filename, 'rb') as f:
        # NOTE(review): pickle.loads executes arbitrary code — only load trusted files.
        data = pickle.loads(f.read())

    # assert len(data.keys()) == 2
    nonlin_type = data['nonlin_type']
    # The single non-'nonlin_type' key names the policy class.
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]
    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Keep track of input and output dims (i.e. observation and action dims) for the user

    def build_policy(obs_bo):
        # Build the deterministic mean network from the pickled weights.
        def read_layer(l):
            # Each layer dict holds a single AffineLayer with W and b.
            assert list(l.keys()) == ['AffineLayer']
            assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
            return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32)

        def apply_nonlin(x):
            if nonlin_type == 'lrelu':
                return tf_util.lrelu(x, leak=.01)  # openai/imitation nn.py:233
            elif nonlin_type == 'tanh':
                return tf.tanh(x)
            else:
                raise NotImplementedError(nonlin_type)

        # Build the policy. First, observation normalization.
        assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']
        obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
        # std = sqrt(E[x^2] - E[x]^2), clamped at 0 for numerical safety.
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        # print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
        normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6)  # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
        curr_activations_bd = normedobs_bo

        # Hidden layers next
        assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            l = layer_params[layer_name]
            W, b = read_layer(l)
            curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)

        # Output layer (linear — no nonlinearity on the action head).
        W, b = read_layer(policy_params['out'])
        output_bo = tf.matmul(curr_activations_bd, W) + b
        return output_bo

    obs_bo = tf.placeholder(tf.float32, [None, None])
    a_ba = build_policy(obs_bo)
    policy_fn = tf_util.function([obs_bo], a_ba)
    return policy_fn
def _initialize(self, policy_dir, ob_space, ac_space, ac_bins, ac_noise_std, nonlin_type, hidden_dims, connection_type):
    """Build the policy graph, optionally reusing a scope name from an .h5 snapshot.

    :param policy_dir: path to an .h5 snapshot whose 'name' attr supplies the
        variable-scope name, or falsy to generate a fresh timestamped scope
    :param ob_space: observation space argument (overridden below — see note)
    :param ac_space: action space
    :param nonlin_type: one of 'sigmoid' | 'tanh' | 'relu' | 'lrelu' | 'elu'
    :return: the tf variable scope containing all created variables
    """
    self.policy_dir = policy_dir
    self.ac_space = ac_space
    self.ac_bins = ac_bins
    self.ac_noise_std = ac_noise_std
    self.hidden_dims = hidden_dims
    self.connection_type = connection_type
    if policy_dir:
        assert self.policy_dir.endswith('.h5')
        # Reuse the scope name stored in the snapshot so variables line up.
        with h5py.File(self.policy_dir, 'r') as f:
            self.scope_name = f.attrs['name']
        #print('scope_name: {}'.format(self.scope_name))
    else:
        self.scope_name = type(self).__name__ + str(time.time())
    # NOTE(review): hard-coded override of the ob_space argument — a dummy
    # array of length 732 stands in for the real space; confirm intended.
    ob_space = np.ones(732)
    '''
    assert len(ob_space.shape) == len(self.ac_space.shape) == 1
    assert np.all(np.isfinite(self.ac_space.low)) and np.all(np.isfinite(self.ac_space.high)), \
        'Action bounds required'
    '''
    self.nonlin = {'sigmoid': tf.nn.sigmoid, 'tanh': tf.tanh, 'relu': tf.nn.relu, 'lrelu': U.lrelu, 'elu': tf.nn.elu}[nonlin_type]
    with tf.variable_scope(self.scope_name) as scope:
        # Observation normalization (NaN-initialized until explicitly set).
        ob_mean = tf.get_variable(
            'ob_mean', ob_space.shape, tf.float32, tf.constant_initializer(np.nan), trainable=False)
        ob_std = tf.get_variable(
            'ob_std', ob_space.shape, tf.float32, tf.constant_initializer(np.nan), trainable=False)
        in_mean = tf.placeholder(tf.float32, ob_space.shape)
        in_std = tf.placeholder(tf.float32, ob_space.shape)
        self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
            tf.assign(ob_mean, in_mean),
            tf.assign(ob_std, in_std),
        ])
        # Policy network — normalization/clipping is disabled here; raw obs in.
        o = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
        a = self._make_net(o)  #tf.clip_by_value((o - ob_mean) / ob_std, -5.0, 5.0))
        self._act = U.function([o], a)
    return scope
def __init__(self, num_features, num_actions, timestep, action_space, scope):
    """Build a Q-learning agent over an LSTM opponent model (tau) of action trajectories.

    :param num_features: feature count (stored implicitly via model helpers)
    :param num_actions: number of discrete actions (tau output / one-hot size)
    :param timestep: trajectory length fed to the LSTM
    :param action_space: per-step action dimensionality
    :param scope: name used to namespace all variables of this agent
    """
    self.scope = scope
    self._lr = 0.5
    self.discount = 1.
    self.replay_buffer = ReplayBuffer(1e4)
    with tf.variable_scope(self.scope):
        self.act_trajectory = tf.placeholder(tf.float32, shape=((None, timestep, action_space)))
        self.target = tf.placeholder(tf.float32, shape=((None, )))
        self.act = tf.placeholder(tf.int32, shape=((None,)))
        # tau: LSTM embedding of the opponent's action trajectory.
        self.tau = lstm_model(self.act_trajectory, num_actions, scope="tau_model_{}".format(scope))
        self.q_input = self.tau
        #train network
        self.q = mlp_model(self.q_input, 2, scope="q_model_{}".format(scope))
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_model_{}".format(scope)))
        #target network
        # NOTE(review): target net shares self.q_input with the train net;
        # only the MLP weights differ between the two.
        self.target_q = mlp_model(self.q_input, 2, scope="target_q_model_{}".format(scope))
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_model_{}".format(scope)))
        # take action: greedy w.r.t. the target network's softmax.
        self.softmax = tf.nn.softmax(self.target_q)
        self.pred = tf.argmax(self.softmax, axis=1)
        #calculate the loss
        # NOTE(review): reduce_mean (not reduce_sum) over the one-hot mask
        # scales the selected Q by 1/num_actions — confirm intended.
        self.q_t_selected = tf.reduce_mean(self.q * tf.one_hot(self.act, num_actions), 1)
        q_tp1_best = tf.reduce_max(self.q, 1)
        q_tp1_best_masked = q_tp1_best
        td_error = self.q_t_selected - tf.stop_gradient(self.target)
        self.errors = U.huber_loss(td_error)
        self.q_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.errors, var_list=q_func_vars)
        # tau is trained to predict the opponent's next action (classification).
        self.tau_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.tau, labels=self.act))
        self.tau_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.tau_loss)
        self.get_pred = U.function(inputs=[self.act_trajectory], outputs=[self.softmax])
        self.train_q = U.function(inputs=[self.act_trajectory] + [self.target] + [self.act],
                                  outputs=[self.errors, self.q], updates=[self.q_opt_op])
        self.train_tau = U.function(inputs=[self.act] + [self.act_trajectory],
                                    outputs=[self.tau_loss], updates=[self.tau_opt_op])
        # Soft (Polyak) update from train Q-net to target Q-net.
        self.update_model = make_update_exp(q_func_vars, target_q_func_vars)
def run(args):
    """Restore a trained model for args.env_name and roll it out in its gym env.

    Loads the expert dataset (for input/output sizes), rebuilds the model,
    restores its checkpoint, and runs 10 evaluation rollouts.
    """
    dataset_path = os.path.join('expert_data', args.env_name + ".pkl")
    data, input_size, output_size = load_data(dataset_path)
    model = Model(input_size, output_size, configs[args.env_name])
    saver = tf.train.Saver()
    with tf.Session() as sess:
        checkpoint = os.path.join('models', args.env_name)
        saver.restore(sess, checkpoint)
        policy_fn = tf_util.function([model.input], model.output)
        gym_util.run_gym(args.env_name, policy_fn, num_rollouts=10)
def main():
    """Restore a behavioral-cloning policy for a gym env and report rollout returns."""
    parser = argparse.ArgumentParser()
    parser.add_argument('env', type=str)
    parser.add_argument('--model_checkpoint', type=str)
    # NOTE(review): argparse type=bool treats any non-empty string as True;
    # --render False still renders. Consider store_true/store_false.
    parser.add_argument('--render', type=bool, default=True)
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--num_rollouts', type=int, default=10)
    args = parser.parse_args()
    with tf.Session() as sess:
        # Build the model under an env-named scope so checkpoint names match.
        with tf.variable_scope(args.env):
            input_dim, output_dim = helper.input_output_shape(args.env)
            model = helper.build_model(input_dim, output_dim)
            input_ph, output_pred = model['input_ph'], model['output_pred']
            policy_fn = tf_util.function([input_ph], output_pred)
        if args.model_checkpoint:
            checkpoint_path = args.model_checkpoint
        else:
            checkpoint_path = helper.checkpoint_path(args.env)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        env = gym.make(helper.envname(args.env))
        max_steps = args.max_timesteps or env.spec.timestep_limit
        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0
            steps = 0
            while not done:
                # obs[None, :] adds the batch dimension the policy expects.
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps >= max_steps:
                    break
            returns.append(totalr)
        helper.print_returns_stats(returns)
def __init__(self, epsilon=1e-2, shape=(), name=None):
    """
    calulates the running mean and std of a data stream
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    :param epsilon: (float) helps with arithmetic issues
    :param shape: (tuple) the shape of the data stream's output
    :param name: (str) variable-scope name for this tracker's variables
        NOTE(review): the default None is not a valid variable_scope name on
        its own — confirm callers always pass a name.
    """
    with tf.variable_scope(name):
        self._sum = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(0.0),
            name="runningsum",
            trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            # BUGFIX: was the typo `FCalse` (NameError at graph build time).
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape
        self.mean = tf.to_float(self._sum / self._count)
        # Variance is clamped at 1e-2 so std never collapses to ~0.
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))
        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        # Callable that accumulates a batch's (sum, sumsq, count) into the stats.
        self.incfiltparams = tf_util.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])
def policy_fn(obs):
    """Evaluate the behavioral-cloning policy on a batch of observations.

    The TF graph and the compiled callable are built once on first call and
    cached on the function object. The original rebuilt the whole graph and
    re-read both pickle files on EVERY call, leaking ops into the default
    graph and slowing each invocation.

    :param obs: (batch, obs_dim) observation array
    :return: policy output for obs
    """
    if getattr(policy_fn, '_act', None) is None:
        # NOTE(review): pickle.loads executes arbitrary code — trusted files only.
        with open('bc_policy/bc_weights.pkl', 'rb') as f:
            bc_weights = pickle.loads(f.read())
        with open('bc_policy/bc_biases.pkl', 'rb') as f:
            bc_biases = pickle.loads(f.read())
        obs_bo = tf.placeholder(tf.float32, [None, None])
        layer_1 = tf.nn.sigmoid(
            tf.add(tf.matmul(obs_bo, bc_weights['h1']), bc_biases['b1']))
        layer_2 = tf.nn.sigmoid(
            tf.add(tf.matmul(layer_1, bc_weights['h2']), bc_biases['b2']))
        a_ba = tf.matmul(layer_2, bc_weights['out']) + bc_biases['out']
        policy_fn._act = tf_util.function([obs_bo], a_ba)
    return policy_fn._act(obs)
def train(sess, data, model, curr_epoch, batch_size=32, debug=False, checkpoint_path=None):
    """Run one epoch of LSTM behavioral-cloning training over per-trajectory data.

    :param sess: active TF session
    :param data: iterable of dicts with 'observations' and 'actions' arrays
    :param model: dict of graph handles (placeholders, loss, optimizer, LSTM state)
    :param curr_epoch: epoch index, used only for logging
    :param batch_size: minibatch size (padding below assumes 32 — see note)
    :param checkpoint_path: if set, save a checkpoint after the epoch
    :return: (policy_fn, initial_state, mean) — compiled step function,
        initial LSTM state, and the dataset observation mean
    """
    # Concatenate observations across all trajectories to compute dataset stats.
    obs = None
    for _data in data:
        if obs is None:
            obs = _data['observations']
        else:
            obs = np.concatenate((obs, _data['observations']))
    mean, stdev = helper.mean_and_stdev(obs)
    # Zero action used to pad trajectories; assumes a 17-dim action space.
    empty_action = np.array([0] * 17)
    m = model
    input_ph, output_ph = m['input_ph'], m['output_ph']
    mean_v, stdev_v = m['mean_v'], m['stdev_v']
    output_pred, mse, opt = m['output_pred'], m['mse'], m['opt']
    S, initial_state, state = m['S'], m['initial_state'], m['state']
    # Load normalization stats into the graph's variables.
    mean_v.load(mean, session=sess)
    stdev_v.load(stdev, session=sess)
    if checkpoint_path:
        saver = tf.train.Saver()
    for _data in data:
        idx = 0
        lstm_state = initial_state
        # NOTE(review): padding uses the literal 32, not batch_size — with a
        # different batch_size the front-padding will not align to batches.
        if len(_data['observations']) % 32 > 0:
            rep = 32 - (len(_data['observations']) % 32)
            # Pad the FRONT of the trajectory with mean observations and
            # zero actions so its length becomes a multiple of 32.
            mean_stk = np.tile(mean[None, :], [rep, 1])
            output_stk = np.tile(empty_action[None, :], [rep, 1])
            _data['observations'] = np.concatenate((mean_stk, _data['observations']))
            _data['actions'] = np.concatenate((output_stk, _data['actions']))
        while idx < len(_data['observations']):
            input_batch = _data['observations'][idx:idx + batch_size]
            output_batch = _data['actions'][idx:idx + batch_size]
            # LSTM state is threaded through consecutive batches of a trajectory.
            _, mse_run, lstm_state = sess.run(
                [opt, mse, state],
                feed_dict={input_ph: input_batch, output_ph: output_batch, S: lstm_state})
            idx += batch_size
    # mse_run here is the loss of the last batch of the last trajectory.
    print('epoch: {0:03d} mse: {1:.4f}'.format(curr_epoch, mse_run))
    if checkpoint_path:
        saver.save(sess, checkpoint_path)
    policy_fn = tf_util.function([input_ph, S], [output_pred, state])
    return policy_fn, initial_state, mean
def load_policy(filename):
    """Load a pickled GaussianPolicy snapshot and compile it via build_policy.

    :param filename: path to the pickled policy snapshot
    :return: callable mapping a (batch, obs_dim) array to actions
    :raises AssertionError: if the pickle does not match the expected schema
    """
    with open(filename, 'rb') as f:
        # NOTE(review): pickle.loads executes arbitrary code — trusted files only.
        data = pickle.loads(f.read())
    # data should be a dict with 2 keys: 'GaussianPolicy' and 'nonlin_type'
    # print(data)
    nonlin_type = data['nonlin_type']
    # The single non-'nonlin_type' key names the policy class.
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]
    # print(policy_params.keys())
    # print(policy_params['obsnorm'])
    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}
    obs_bo = tf.placeholder(tf.float32, [None, None])
    # build_policy is defined elsewhere in this module.
    a_ba = build_policy(obs_bo, policy_params, nonlin_type)
    policy_fn = tf_util.function([obs_bo], a_ba)
    return policy_fn
def _init(self, ob_space, ac_space):
    """Build a CNN actor-critic graph for image observations (pixel values 0-255).

    :param ob_space: gym.spaces.Box image observation space
    :param ac_space: action space; drives the categorical distribution size
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    sy_ob = U.get_placeholder(name="sy_ob", dtype=tf.float32,
                              shape=[sequence_length] + list(ob_space.shape))
    # Scale raw pixel values into [0, 1].
    obscaled = sy_ob / 255.0
    # Policy tower: two convs + dense -> distribution logits.
    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    # Value tower: same architecture, separate weights.
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        # NOTE(review): vpred keeps shape (batch, 1) — not squeezed with [:, 0]
        # the way the MLP policy does; confirm downstream code expects this.
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred
    self.state_in = []
    self.state_out = []
    # NOTE(review): stochastic is accepted as an input but never used — the
    # action is always sampled (see the XXX below); mode() is never taken.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    sy_ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, sy_ob], [sy_ac, self.vpred])
def train(self, policy, S, A, epochs, batch_size):
    """Minibatch-train a pre-built policy graph on (S, A) pairs.

    :param policy: (ob_ph, ac_ph, opt_op, error_op, actor_op) graph handles
    :param S: observations, indexed along axis 0
    :param A: matching actions
    :param epochs: passes over the shuffled dataset
    :param batch_size: minibatch size (trailing partial batch is dropped)
    :return: compiled callable mapping observations -> actor output
    """
    ob, ac, opt, error, actor = policy
    sess = tf.get_default_session()
    n_batches = S.shape[0] // batch_size
    order = np.arange(S.shape[0])
    for _epoch in range(epochs):
        np.random.shuffle(order)
        progress = tqdm(range(n_batches))
        for b in progress:
            picked = order[b * batch_size:(b + 1) * batch_size]
            _, loss_val = sess.run(
                [opt, error],
                feed_dict={ob: S[picked, :], ac: A[picked, :]})
            progress.set_description("Loss %s" % str(loss_val))
    return tf_util.function([ob], actor)
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function for epsilon-greedy DQN action selection.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes an observation batch, num_actions, and a scope,
        and returns a tensor of shape (batch_size, num_actions) with values
        of every action
    num_actions: int
        number of actions
    scope: str or VariableScope
        optional scope for variable_scope
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse
        the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        # eps is held in a variable so it persists between calls.
        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        # Per-element coin flip: with probability eps take a uniform random action.
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                           maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)
        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        # A negative update_eps leaves eps unchanged.
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True
            },
            updates=[update_eps_expr])

        def act(ob, stochastic=True, update_eps=-1):
            # Thin wrapper giving keyword defaults over the compiled function.
            return _act(ob, stochastic, update_eps)

        return act
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    """PPO (clipped-surrogate) training loop with MPI-synchronized Adam.

    Builds new/old policy networks, the clipped surrogate + value + entropy
    loss, then alternates rollout collection and minibatch Adam epochs until
    exactly one of the four time constraints is hit.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed cliping parameter epislon
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
    var_list = pi.get_trainable_variables()
    # Returns the individual losses plus the flat gradient of total_loss.
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    # Copies pi's parameters into oldpi before each optimization phase.
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    U.initialize()
    adam.sync()
    # NOTE(review): hard-coded checkpoint path — this learn() always resumes
    # from (and saves to) save/Humanoid-v1 regardless of env.
    U.load_state("save/Humanoid-v1")
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"
    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError
        logger.log("********** Iteration %i ************" % iters_so_far)
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]
        #if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        # Average losses across MPI workers.
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        # NOTE(review): checkpoint saved every iteration to the same
        # hard-coded path as the initial load above.
        U.save_state("save/Humanoid-v1")
def test_net(model, img_dir, max_iter=1000000, check_every_n=500, loss_check_n=10, save_model_freq=1000, batch_size=128):
    """Run a latent-traversal test pass for a trained siamese VAE.

    Rebuilds the training/eval graph pieces (so the restored checkpoint's
    graph matches), restores the latest checkpoint from "chk1", then for
    latent features 25..31 sweeps the latent value over [-10, 10) in 0.1
    steps, decodes each perturbed latent, and saves the decoded images via
    Img_Saver so the effect of each latent dimension can be inspected.

    NOTE(review): despite the name, the training path is fully commented
    out, so `max_iter`, `check_every_n`, `loss_check_n` and
    `save_model_freq` are unused; `batch_size` only affects the Adam
    epsilon of an optimizer that is never run.
    """
    # Cached placeholders created elsewhere when the model graph was built.
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")
    # Testing placeholders: raw image in, latent vector in (for decoding).
    img_test = U.get_placeholder_cached(name="img_test")
    reconst_tp = U.get_placeholder_cached(name="reconst_tp")
    vae_loss = U.mean(model.vaeloss)
    latent_z1_tp = model.latent_z1
    latent_z2_tp = model.latent_z2
    # Individual loss components, each reduced to a scalar for summaries.
    losses = [
        U.mean(model.vaeloss),
        U.mean(model.siam_loss),
        U.mean(model.kl_loss1),
        U.mean(model.kl_loss2),
        U.mean(model.reconst_error1),
        U.mean(model.reconst_error2),
    ]
    tf.summary.scalar('Total Loss', losses[0])
    tf.summary.scalar('Siam Loss', losses[1])
    tf.summary.scalar('kl1_loss', losses[2])
    tf.summary.scalar('kl2_loss', losses[3])
    tf.summary.scalar('reconst_err1', losses[4])
    tf.summary.scalar('reconst_err2', losses[5])
    decoded_img = [model.reconst1, model.reconst2]  # NOTE(review): unused
    weight_loss = [1, 1, 1]  # NOTE(review): unused
    compute_losses = U.function([img1, img2], vae_loss)
    lr = 0.00005
    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                       epsilon=0.01 / batch_size)
    all_var_list = model.get_trainable_variables()
    # Originally filtered to proj1/unproj1 variables; currently trains all:
    # [v for v in all_var_list if v.name.split("/")[1].startswith("proj1")
    #  or v.name.split("/")[1].startswith("unproj1")]
    img1_var_list = all_var_list
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=img1_var_list)
    merged = tf.summary.merge_all()
    # Compiled TF callables. `train` is never invoked here but keeps the
    # graph identical to the one the checkpoint was written from.
    train = U.function([img1, img2], [
        losses[0], losses[1], losses[2], losses[3], losses[4], losses[5],
        latent_z1_tp, latent_z2_tp, merged
    ],
                       updates=[optimize_expr1])
    get_reconst_img = U.function(
        [img1, img2],
        [model.reconst1, model.reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])
    # [testing -> ] encode a single image / decode a latent vector
    test = U.function([img_test], model.latent_z_test)
    test_reconst = U.function([reconst_tp], [model.reconst_test])
    # [testing <- ]
    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, "chk1")
    log_save_dir = os.path.join(cur_dir, "log")
    validate_img_saver_dir = os.path.join(cur_dir, "validate_images")
    test_img_saver_dir = os.path.join(cur_dir, "test_images")
    testing_img_dir = os.path.join(cur_dir, "dataset/test_img")
    train_writer = U.summary_writer(dir=log_save_dir)
    U.initialize()
    # Restores latest checkpoint; chk_file_num is the step it was saved at.
    saver, chk_file_num = U.load_checkpoints(load_requested=True,
                                             checkpoint_dir=chk_save_dir)
    validate_img_saver = Img_Saver(validate_img_saver_dir)
    # [testing -> ]
    test_img_saver = Img_Saver(test_img_saver_dir)
    # [testing <- ]
    meta_saved = False
    iter_log = []   # NOTE(review): these four logs are never appended to
    loss1_log = []
    loss2_log = []
    loss3_log = []
    training_images_list = read_dataset(img_dir)
    n_total_train_data = len(training_images_list)
    testing_images_list = read_dataset(testing_img_dir)
    n_total_testing_data = len(testing_images_list)
    training = False
    testing = True
    # NOTE(review): the original training loop (batch sampling, train(),
    # periodic validation-image dumps and checkpoint saving) was left here
    # fully commented out; removed as dead code during review.
    # Testing
    print testing_images_list
    if testing == True:
        test_file_name = testing_images_list[6]  # hard-coded test image index
        print test_file_name
        test_img = load_single_img(dir_name=testing_img_dir,
                                   img_name=test_file_name)
        test_features = np.arange(25, 32)  # latent dimensions to traverse
        for test_feature in test_features:
            test_variation = np.arange(-10, 10, 0.1)
            z = test(test_img)
            print np.shape(z)
            print z
            for idx in range(len(test_variation)):
                # Perturb one latent dimension, decode, and save the image.
                z_test = np.copy(z)
                z_test[0, test_feature] = z_test[
                    0, test_feature] + test_variation[idx]
                reconst_test = test_reconst(z_test)
                test_save_img = np.squeeze(reconst_test[0])
                test_save_img = Image.fromarray(test_save_img)
                img_file_name = "test_feat_{}_var_({}).png".format(
                    test_feature, test_variation[idx])
                test_img_saver.save(test_save_img, img_file_name, sub_dir=None)
            # Also save the unperturbed reconstruction for reference.
            reconst_test = test_reconst(z)
            test_save_img = np.squeeze(reconst_test[0])
            test_save_img = Image.fromarray(test_save_img)
            img_file_name = "test_feat_{}_var_original.png".format(
                test_feature)
            test_img_saver.save(test_save_img, img_file_name, sub_dir=None)
obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation curr_activations_bd = normedobs_bo # Hidden layers next assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] layer_params = policy_params['hidden']['FeedforwardNet'] for layer_name in sorted(layer_params.keys()): l = layer_params[layer_name] W, b = read_layer(l) curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) <<<<<<< HEAD print("reading layer ", layer_name, " weights ", W.shape, " bias ", b.shape) ======= >>>>>>> e82c0ba0166126de9dfb8be3bc5a2670e178714d # Output layer W, b = read_layer(policy_params['out']) output_bo = tf.matmul(curr_activations_bd, W) + b <<<<<<< HEAD print("reading output layer weights ", W.shape, " bias ", b.shape) ======= >>>>>>> e82c0ba0166126de9dfb8be3bc5a2670e178714d return output_bo obs_bo = tf.placeholder(tf.float32, [None, None]) a_ba = build_policy(obs_bo) policy_fn = tf_util.function([obs_bo], a_ba) return policy_fn
def train_net(model, mode, img_dir, dataset, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_epoch = 300, check_every_n = 500, loss_check_n = 10, save_model_freq = 5, batch_size = 512, lr = 0.001):
    """Train (or latent-test) a single-GPU siamese VAE.

    mode 'train': optimize the VAE loss with Adam, periodically dumping
    validation reconstructions and TensorBoard summaries, and saving a
    checkpoint each epoch. mode 'test': traverse one latent feature and
    save the decoded images.

    Supported datasets: 'chairs', 'celeba' (image files on disk) and
    'dsprites' (single .npz loaded through DataManager).

    NOTE(review): in mode 'train', `num_batch = manager.get_len()` is
    reached for every dataset, but `manager` only exists for 'dsprites' —
    'chairs'/'celeba' training raises NameError here. In mode 'test',
    `test`, `test_reconst` and `test_img_saver` are used but never defined
    in this function (they exist only in test_net) — that branch cannot
    run as written.
    """
    # Cached placeholders created when the model graph was built.
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")
    vae_loss = U.mean(model.vaeloss)
    latent_z1_tp = model.latent_z1
    latent_z2_tp = model.latent_z2
    # Individual loss components, reduced to scalars for logging.
    losses = [U.mean(model.vaeloss),
        U.mean(model.siam_loss),
        U.mean(model.kl_loss1),
        U.mean(model.kl_loss2),
        U.mean(model.reconst_error1),
        U.mean(model.reconst_error2),
        ]
    # Siamese loss normalized by the number of entangled features.
    siam_normal = losses[1]/entangled_feat
    siam_max = U.mean(model.max_siam_loss)
    tf.summary.scalar('Total Loss', losses[0])
    tf.summary.scalar('Siam Loss', losses[1])
    tf.summary.scalar('kl1_loss', losses[2])
    tf.summary.scalar('kl2_loss', losses[3])
    tf.summary.scalar('reconst_err1', losses[4])
    tf.summary.scalar('reconst_err2', losses[5])
    tf.summary.scalar('Siam Normal', siam_normal)
    tf.summary.scalar('Siam Max', siam_max)
    compute_losses = U.function([img1, img2], vae_loss)
    optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)
    all_var_list = model.get_trainable_variables()
    img1_var_list = all_var_list  # train every trainable variable
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=img1_var_list)
    merged = tf.summary.merge_all()
    # train: one optimization step; returns all losses, latents, summaries.
    train = U.function([img1, img2],
        [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged],
        updates = [optimize_expr1])
    get_reconst_img = U.function([img1, img2], [model.reconst1, model.reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])
    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
    if dataset == 'chairs' or dataset == 'celeba':
        test_img_saver_dir = os.path.join(cur_dir, "test_images")
        testing_img_dir = os.path.join(cur_dir, "dataset/{}/test_img".format(dataset))
    train_writer = U.summary_writer(dir = log_save_dir)
    U.initialize()
    # Restore latest checkpoint; resume from the epoch after it.
    saver, chk_file_epoch_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        validate_img_saver = Img_Saver(Img_dir = validate_img_saver_dir)
    elif dataset == 'dsprites':
        validate_img_saver = BW_Img_Saver(Img_dir = validate_img_saver_dir) # Black and White, temporary usage
    else:
        warn("Unknown dataset Error")
        # break
    warn(img_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        training_images_list = read_dataset(img_dir)
        n_total_train_data = len(training_images_list)
        testing_images_list = read_dataset(testing_img_dir)
        n_total_testing_data = len(testing_images_list)
    elif dataset == 'dsprites':
        cur_dir = osp.join(cur_dir, 'dataset')
        cur_dir = osp.join(cur_dir, 'dsprites')
        img_dir = osp.join(cur_dir, 'dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')
        manager = DataManager(img_dir, batch_size)
    else:
        warn("Unknown dataset Error")
        # break
    meta_saved = False
    if mode == 'train':
        for epoch_idx in range(chk_file_epoch_num+1, max_epoch):
            t_epoch_start = time.time()
            # NOTE(review): NameError for 'chairs'/'celeba' — no manager.
            num_batch = manager.get_len()
            for batch_idx in range(num_batch):
                if dataset == 'chairs' or dataset == 'celeba':
                    # Sample 2*batch_size files: first half -> img1, second -> img2.
                    idx = random.sample(range(n_total_train_data), 2*batch_size)
                    batch_files = [training_images_list[i] for i in idx]
                    [images1, images2] = load_image(dir_name = img_dir, img_names = batch_files)
                elif dataset == 'dsprites':
                    [images1, images2] = manager.get_next()
                # NOTE(review): rebinds img1/img2 (the placeholder names) to
                # numpy batches — confusing but harmless shadowing.
                img1, img2 = images1, images2
                [l1, l2, _, _] = get_reconst_img(img1, img2)
                [loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(img1, img2)
                if batch_idx % 50 == 1:
                    header("******* epoch: {}/{} batch: {}/{} *******".format(epoch_idx, max_epoch, batch_idx, num_batch))
                    warn("Total Loss: {}".format(loss0))
                    warn("Siam loss: {}".format(loss1))
                    warn("kl1_loss: {}".format(loss2))
                    warn("kl2_loss: {}".format(loss3))
                    warn("reconst_err1: {}".format(loss4))
                    warn("reconst_err2: {}".format(loss5))
                if batch_idx % check_every_n == 1:
                    # Periodic validation: reconstruct a few images and save
                    # original/reconstruction pairs for visual inspection.
                    if dataset == 'chairs' or dataset == 'celeba':
                        idx = random.sample(range(len(training_images_list)), 2*5)
                        validate_batch_files = [training_images_list[i] for i in idx]
                        [images1, images2] = load_image(dir_name = img_dir, img_names = validate_batch_files)
                    elif dataset == 'dsprites':
                        [images1, images2] = manager.get_next()
                    [reconst1, reconst2, _, _] = get_reconst_img(images1, images2)
                    if dataset == 'chairs':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'celeba':
                        # Same as 'chairs' but 3-channel RGB images.
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'dsprites':
                        # Grayscale float arrays; BW_Img_Saver handles them.
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)
                            # save_img = images1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_ori.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            # save_img = reconst1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_rec.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                if batch_idx % loss_check_n == 1:
                    train_writer.add_summary(summary, batch_idx)
            t_epoch_end = time.time()
            t_epoch_run = t_epoch_end - t_epoch_start
            if dataset == 'dsprites':
                # Rough throughput in images per second for this epoch.
                t_check = manager.sample_size / t_epoch_run
                warn("==========================================")
                warn("Run {} th epoch in {} sec: {} images / sec".format(epoch_idx+1, t_epoch_run, t_check))
                warn("==========================================")
            # Checkpoint every epoch; write the meta graph only once.
            # if epoch_idx % save_model_freq == 0:
            if meta_saved == True:
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = False)
            else:
                print "Save meta graph"
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = True)
                meta_saved = True
    # Testing
    elif mode == 'test':
        # NOTE(review): broken as written — `test`, `test_reconst` and
        # `test_img_saver` are not defined anywhere in this function.
        test_file_name = testing_images_list[0]
        test_img = load_single_img(dir_name = testing_img_dir, img_name = test_file_name)
        test_feature = 31  # hard-coded latent dimension to traverse
        test_variation = np.arange(-5, 5, 0.1)
        z = test(test_img)
        for idx in range(len(test_variation)):
            # Perturb one latent dimension, decode, save the image.
            z_test = np.copy(z)
            z_test[0, test_feature] = z_test[0, test_feature] + test_variation[idx]
            reconst_test = test_reconst(z_test)
            test_save_img = np.squeeze(reconst_test[0])
            test_save_img = Image.fromarray(test_save_img)
            img_file_name = "test_feat_{}_var_({}).png".format(test_feature, test_variation[idx])
            test_img_saver.save(test_save_img, img_file_name, sub_dir = None)
        # Also save the unperturbed reconstruction for reference.
        reconst_test = test_reconst(z)
        test_save_img = np.squeeze(reconst_test[0])
        test_save_img = Image.fromarray(test_save_img)
        img_file_name = "test_feat_{}_var_original.png".format(test_feature)
        test_img_saver.save(test_save_img, img_file_name, sub_dir = None)
def mgpu_classifier_train_net(models, num_gpus, cls_batch_per_gpu, cls_L, mode, img_dir, dataset, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_epoch = 300, check_every_n = 500, loss_check_n = 10, save_model_freq = 5, batch_size = 512, lr = 0.001):
    """Train the feature classifier head of a multi-GPU siamese VAE.

    Builds one model tower per GPU, averages the per-tower classifier
    loss, restores the VAE checkpoint, then runs 10000 Adagrad steps that
    update only the classifier ('cls') variables on batches where one
    dsprites latent factor is held fixed per pair.

    NOTE(review): only the 'dsprites' dataset actually trains here; the
    'chairs'/'celeba' setup code runs but the training loop body is
    guarded by `if dataset == 'dsprites'`.
    """
    # Cached placeholders shared with the model graph.
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")
    feat_cls = U.get_placeholder_cached(name="feat_cls")

    # batch size must be multiples of ntowers (# of GPUs)
    ntowers = len(models)
    tf.assert_equal(tf.shape(img1)[0], tf.shape(img2)[0])
    tf.assert_equal(tf.floormod(tf.shape(img1)[0], ntowers), 0)
    # NOTE(review): these splits are never consumed — each tower's model is
    # presumably wired to its input slice elsewhere; verify against caller.
    img1splits = tf.split(img1, ntowers, 0)
    img2splits = tf.split(img2, ntowers, 0)

    # Per-tower tensors, gathered for cross-GPU averaging/concatenation.
    tower_vae_loss = []
    tower_latent_z1_tp = []
    tower_latent_z2_tp = []
    tower_losses = []
    tower_siam_max = []
    tower_reconst1 = []
    tower_reconst2 = []
    tower_cls_loss = []
    for gid, model in enumerate(models):
        with tf.name_scope('gpu%d' % gid) as scope:
            with tf.device('/gpu:%d' % gid):
                vae_loss = U.mean(model.vaeloss)
                latent_z1_tp = model.latent_z1
                latent_z2_tp = model.latent_z2
                losses = [U.mean(model.vaeloss),
                    U.mean(model.siam_loss),
                    U.mean(model.kl_loss1),
                    U.mean(model.kl_loss2),
                    U.mean(model.reconst_error1),
                    U.mean(model.reconst_error2),
                    ]
                siam_max = U.mean(model.max_siam_loss)
                cls_loss = U.mean(model.cls_loss)
                tower_vae_loss.append(vae_loss)
                tower_latent_z1_tp.append(latent_z1_tp)
                tower_latent_z2_tp.append(latent_z2_tp)
                tower_losses.append(losses)
                tower_siam_max.append(siam_max)
                tower_reconst1.append(model.reconst1)
                tower_reconst2.append(model.reconst2)
                tower_cls_loss.append(cls_loss)
                tf.summary.scalar('Cls Loss', cls_loss)

    # Average losses / concatenate outputs across towers.
    vae_loss = U.mean(tower_vae_loss)
    siam_max = U.mean(tower_siam_max)
    latent_z1_tp = tf.concat(tower_latent_z1_tp, 0)
    latent_z2_tp = tf.concat(tower_latent_z2_tp, 0)
    model_reconst1 = tf.concat(tower_reconst1, 0)
    model_reconst2 = tf.concat(tower_reconst2, 0)
    cls_loss = U.mean(tower_cls_loss)
    # Transpose tower_losses: per-component lists, then average each.
    losses = [[] for _ in range(len(losses))]
    for tl in tower_losses:
        for i, l in enumerate(tl):
            losses[i].append(l)
    losses = [U.mean(l) for l in losses]
    siam_normal = losses[1] / entangled_feat
    tf.summary.scalar('total/cls_loss', cls_loss)

    compute_losses = U.function([img1, img2], vae_loss)
    # NOTE(review): `model` here is the last tower's model (loop leftover);
    # variables are assumed shared across towers — confirm.
    all_var_list = model.get_trainable_variables()
    vae_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("vae")]
    cls_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("cls")]
    warn("{}".format(all_var_list))
    warn("=======================")
    warn("{}".format(vae_var_list))
    warn("=======================")
    warn("{}".format(cls_var_list))
    # with tf.device('/cpu:0'):
    #     optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)
    #     optimize_expr1 = optimizer.minimize(vae_loss, var_list=vae_var_list)
    # Only the classifier variables are optimized in this function.
    feat_cls_optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    optimize_expr2 = feat_cls_optimizer.minimize(cls_loss, var_list=cls_var_list)
    merged = tf.summary.merge_all()
    # train = U.function([img1, img2],
    #     [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged], updates = [optimize_expr1])
    classifier_train = U.function([img1, img2, feat_cls],
        [cls_loss, latent_z1_tp, latent_z2_tp, merged],
        updates = [optimize_expr2])
    get_reconst_img = U.function([img1, img2], [model_reconst1, model_reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])

    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    cls_logfile_name = 'cls_{}'.format(logfile_name)
    cls_log_save_dir = os.path.join(cur_dir, cls_logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
    if dataset == 'chairs' or dataset == 'celeba':
        test_img_saver_dir = os.path.join(cur_dir, "test_images")
        testing_img_dir = os.path.join(cur_dir, "dataset/{}/test_img".format(dataset))
    cls_train_writer = U.summary_writer(dir = cls_log_save_dir)
    U.initialize()
    # Restore the pretrained VAE checkpoint before classifier training.
    saver, chk_file_epoch_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        validate_img_saver = Img_Saver(Img_dir = validate_img_saver_dir)
    elif dataset == 'dsprites':
        validate_img_saver = BW_Img_Saver(Img_dir = validate_img_saver_dir) # Black and White, temporary usage
    else:
        warn("Unknown dataset Error")
        # break
    warn("dataset: {}".format(dataset))
    if dataset == 'chairs' or dataset == 'celeba':
        training_images_list = read_dataset(img_dir)
        n_total_train_data = len(training_images_list)
        testing_images_list = read_dataset(testing_img_dir)
        n_total_testing_data = len(testing_images_list)
    elif dataset == 'dsprites':
        cur_dir = osp.join(cur_dir, 'dataset')
        cur_dir = osp.join(cur_dir, 'dsprites')
        img_dir = osp.join(cur_dir, 'dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')
        manager = DataManager(img_dir, batch_size)
    else:
        warn("Unknown dataset Error")
        # break
    meta_saved = False
    cls_train_iter = 10000  # fixed number of classifier optimization steps
    for cls_train_i in range(cls_train_iter):
        # warn("Train:{}".format(cls_train_i))
        if dataset == 'dsprites':
            # At every epoch, train classifier and check result
            # (1) Load images: cls_L pairs per classifier sample, one fixed
            # latent factor per (gpu, batch) slot.
            num_img_pair = cls_L * num_gpus * cls_batch_per_gpu
            # warn("{} {} {}".format(len(manager.latents_sizes)-1, num_gpus, cls_batch_per_gpu))
            feat = np.random.randint(len(manager.latents_sizes)-1, size = num_gpus * cls_batch_per_gpu)
            [images1, images2] = manager.get_image_fixed_feat_batch(feat, num_img_pair)
            # warn("images shape:{}".format(np.shape(images1)))
            # (2) Input PH images
            [classification_loss, _, _, summary] = classifier_train(images1, images2, feat)
            if cls_train_i % 100 == 0:
                warn("cls loss {}: {}".format(cls_train_i, classification_loss))
                # NOTE(review): summary written only every 100 iters — confirm
                # this indentation matches the original intent.
                cls_train_writer.add_summary(summary, cls_train_i)
def mgpu_train_net(models, num_gpus, mode, img_dir, dataset, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_epoch = 300, check_every_n = 500, loss_check_n = 10, save_model_freq = 5, batch_size = 512, lr = 0.001):
    """Train a siamese VAE across multiple GPU towers.

    Builds one model tower per GPU, averages losses (concatenates latents
    and reconstructions) across towers, and in mode 'train' optimizes the
    averaged VAE loss with Adam, dumping validation reconstructions and
    summaries periodically and checkpointing each epoch.

    NOTE(review): as in train_net, `num_batch = manager.get_len()` is
    reached for every dataset but `manager` only exists for 'dsprites',
    so 'chairs'/'celeba' training raises NameError. A classifier
    optimizer (optimize_expr2) is built but never run here.
    """
    # Cached placeholders shared with the model graph.
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")
    feat_cls = U.get_placeholder_cached(name="feat_cls")

    # batch size must be multiples of ntowers (# of GPUs)
    ntowers = len(models)
    tf.assert_equal(tf.shape(img1)[0], tf.shape(img2)[0])
    tf.assert_equal(tf.floormod(tf.shape(img1)[0], ntowers), 0)
    # NOTE(review): splits are never consumed in this function — the towers
    # are presumably wired to their slices elsewhere; verify against caller.
    img1splits = tf.split(img1, ntowers, 0)
    img2splits = tf.split(img2, ntowers, 0)

    # Per-tower tensors, gathered for cross-GPU averaging/concatenation.
    tower_vae_loss = []
    tower_latent_z1_tp = []
    tower_latent_z2_tp = []
    tower_losses = []
    tower_siam_max = []
    tower_reconst1 = []
    tower_reconst2 = []
    tower_cls_loss = []
    for gid, model in enumerate(models):
        with tf.name_scope('gpu%d' % gid) as scope:
            with tf.device('/gpu:%d' % gid):
                vae_loss = U.mean(model.vaeloss)
                latent_z1_tp = model.latent_z1
                latent_z2_tp = model.latent_z2
                losses = [U.mean(model.vaeloss),
                    U.mean(model.siam_loss),
                    U.mean(model.kl_loss1),
                    U.mean(model.kl_loss2),
                    U.mean(model.reconst_error1),
                    U.mean(model.reconst_error2),
                    ]
                siam_max = U.mean(model.max_siam_loss)
                cls_loss = U.mean(model.cls_loss)
                tower_vae_loss.append(vae_loss)
                tower_latent_z1_tp.append(latent_z1_tp)
                tower_latent_z2_tp.append(latent_z2_tp)
                tower_losses.append(losses)
                tower_siam_max.append(siam_max)
                tower_reconst1.append(model.reconst1)
                tower_reconst2.append(model.reconst2)
                tower_cls_loss.append(cls_loss)
                # Per-tower summaries.
                tf.summary.scalar('Total Loss', losses[0])
                tf.summary.scalar('Siam Loss', losses[1])
                tf.summary.scalar('kl1_loss', losses[2])
                tf.summary.scalar('kl2_loss', losses[3])
                tf.summary.scalar('reconst_err1', losses[4])
                tf.summary.scalar('reconst_err2', losses[5])
                tf.summary.scalar('Siam Max', siam_max)

    # Average losses / concatenate outputs across towers.
    vae_loss = U.mean(tower_vae_loss)
    siam_max = U.mean(tower_siam_max)
    latent_z1_tp = tf.concat(tower_latent_z1_tp, 0)
    latent_z2_tp = tf.concat(tower_latent_z2_tp, 0)
    model_reconst1 = tf.concat(tower_reconst1, 0)
    model_reconst2 = tf.concat(tower_reconst2, 0)
    cls_loss = U.mean(tower_cls_loss)
    # Transpose tower_losses: per-component lists, then average each.
    losses = [[] for _ in range(len(losses))]
    for tl in tower_losses:
        for i, l in enumerate(tl):
            losses[i].append(l)
    losses = [U.mean(l) for l in losses]
    siam_normal = losses[1] / entangled_feat
    # Cross-tower ("total") summaries.
    tf.summary.scalar('total/Total Loss', losses[0])
    tf.summary.scalar('total/Siam Loss', losses[1])
    tf.summary.scalar('total/kl1_loss', losses[2])
    tf.summary.scalar('total/kl2_loss', losses[3])
    tf.summary.scalar('total/reconst_err1', losses[4])
    tf.summary.scalar('total/reconst_err2', losses[5])
    tf.summary.scalar('total/Siam Normal', siam_normal)
    tf.summary.scalar('total/Siam Max', siam_max)

    compute_losses = U.function([img1, img2], vae_loss)
    # NOTE(review): `model` here is the last tower's model (loop leftover);
    # variables are assumed shared across towers — confirm.
    all_var_list = model.get_trainable_variables()
    vae_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("vae")]
    cls_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("cls")]
    warn("{}".format(all_var_list))
    warn("==========================")
    warn("{}".format(vae_var_list))
    # warn("==========================")
    # warn("{}".format(cls_var_list))
    # with tf.device('/cpu:0'):
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=vae_var_list)
    # Classifier optimizer is constructed but not used by `train` below.
    feat_cls_optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    optimize_expr2 = feat_cls_optimizer.minimize(cls_loss, var_list=cls_var_list)
    merged = tf.summary.merge_all()
    # train: one VAE optimization step; returns losses, latents, summaries.
    train = U.function([img1, img2],
        [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged],
        updates = [optimize_expr1])
    get_reconst_img = U.function([img1, img2], [model_reconst1, model_reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])

    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
    if dataset == 'chairs' or dataset == 'celeba':
        test_img_saver_dir = os.path.join(cur_dir, "test_images")
        testing_img_dir = os.path.join(cur_dir, "dataset/{}/test_img".format(dataset))
    train_writer = U.summary_writer(dir = log_save_dir)
    U.initialize()
    # Restore latest checkpoint; resume from the epoch after it.
    saver, chk_file_epoch_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        validate_img_saver = Img_Saver(Img_dir = validate_img_saver_dir)
    elif dataset == 'dsprites':
        validate_img_saver = BW_Img_Saver(Img_dir = validate_img_saver_dir) # Black and White, temporary usage
    else:
        warn("Unknown dataset Error")
        # break
    warn("dataset: {}".format(dataset))
    if dataset == 'chairs' or dataset == 'celeba':
        training_images_list = read_dataset(img_dir)
        n_total_train_data = len(training_images_list)
        testing_images_list = read_dataset(testing_img_dir)
        n_total_testing_data = len(testing_images_list)
    elif dataset == 'dsprites':
        cur_dir = osp.join(cur_dir, 'dataset')
        cur_dir = osp.join(cur_dir, 'dsprites')
        img_dir = osp.join(cur_dir, 'dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')
        manager = DataManager(img_dir, batch_size)
    else:
        warn("Unknown dataset Error")
        # break
    meta_saved = False
    if mode == 'train':
        for epoch_idx in range(chk_file_epoch_num+1, max_epoch):
            t_epoch_start = time.time()
            # NOTE(review): NameError for 'chairs'/'celeba' — no manager.
            num_batch = manager.get_len()
            for batch_idx in range(num_batch):
                if dataset == 'chairs' or dataset == 'celeba':
                    # Sample 2*batch_size files: first half -> img1, second -> img2.
                    idx = random.sample(range(n_total_train_data), 2*batch_size)
                    batch_files = [training_images_list[i] for i in idx]
                    [images1, images2] = load_image(dir_name = img_dir, img_names = batch_files)
                elif dataset == 'dsprites':
                    [images1, images2] = manager.get_next()
                # NOTE(review): rebinds img1/img2 (the placeholder names) to
                # numpy batches — confusing but harmless shadowing.
                img1, img2 = images1, images2
                [l1, l2, _, _] = get_reconst_img(img1, img2)
                [loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(img1, img2)
                if batch_idx % 50 == 1:
                    header("******* epoch: {}/{} batch: {}/{} *******".format(epoch_idx, max_epoch, batch_idx, num_batch))
                    warn("Total Loss: {}".format(loss0))
                    warn("Siam loss: {}".format(loss1))
                    warn("kl1_loss: {}".format(loss2))
                    warn("kl2_loss: {}".format(loss3))
                    warn("reconst_err1: {}".format(loss4))
                    warn("reconst_err2: {}".format(loss5))
                if batch_idx % check_every_n == 1:
                    # Periodic validation: reconstruct a few images and save
                    # original/reconstruction pairs for visual inspection.
                    if dataset == 'chairs' or dataset == 'celeba':
                        idx = random.sample(range(len(training_images_list)), 2*5)
                        validate_batch_files = [training_images_list[i] for i in idx]
                        [images1, images2] = load_image(dir_name = img_dir, img_names = validate_batch_files)
                    elif dataset == 'dsprites':
                        [images1, images2] = manager.get_next()
                    [reconst1, reconst2, _, _] = get_reconst_img(images1, images2)
                    if dataset == 'chairs':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'celeba':
                        # Same as 'chairs' but 3-channel RGB images.
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'dsprites':
                        # Grayscale float arrays; BW_Img_Saver handles them.
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)
                            # save_img = images1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_ori.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                            # save_img = reconst1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_rec.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                if batch_idx % loss_check_n == 1:
                    train_writer.add_summary(summary, batch_idx)
            t_epoch_end = time.time()
            t_epoch_run = t_epoch_end - t_epoch_start
            if dataset == 'dsprites':
                # Rough throughput in images per second for this epoch.
                t_check = manager.sample_size / t_epoch_run
                warn("==========================================")
                warn("Run {} th epoch in {} sec: {} images / sec".format(epoch_idx+1, t_epoch_run, t_check))
                warn("==========================================")
            # Checkpoint every epoch; write the meta graph only once.
            if meta_saved == True:
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = False)
            else:
                print "Save meta graph"
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = True)
                meta_saved = True
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise
    exploration (https://arxiv.org/abs/1706.01905).

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        # Negative threshold / eps values mean "leave the stored value unchanged"
        # (see the tf.cond guards below).
        update_param_noise_threshold_ph = tf.placeholder(
            tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        # Persistent state: epsilon-greedy rate plus the adaptive noise scale/threshold.
        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False)

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            # Copy each variable from `original_scope` into `perturbed_scope`,
            # adding Gaussian noise to those the filter selects.
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(
                absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
        # KL(softmax(q), softmax(q_adaptive)) measures how much the perturbation
        # changed the action distribution.
        kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            # Re-perturb the adaptive copy first, then grow/shrink the scale
            # depending on whether the measured KL is under/over the threshold.
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold))

        # Put everything together: epsilon-greedy over the *perturbed* Q-values.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        updates = [
            update_eps_expr,
            # Re-draw the perturbation of the rollout network when reset is requested.
            tf.cond(
                reset_ph,
                lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"),
                lambda: tf.group(*[])),
            # Optionally adapt the noise scale; the dummy tf.Variable is the no-op branch.
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        # The givens make every control input a no-op unless explicitly overridden per call.
        _act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph, reset_ph,
            update_param_noise_threshold_ph, update_param_noise_scale_ph
        ],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True,
                reset_ph: False,
                update_param_noise_threshold_ph: False,
                update_param_noise_scale_ph: False
        },
            updates=updates)

        def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
            # Thin wrapper that reorders arguments into the _act input order.
            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
        return act
def load_policy(filename):
    """Load a pickled expert policy and build it into a TF policy function.

    Parameters
    ----------
    filename: str
        path to a pickle (openai/imitation format) containing 'nonlin_type'
        plus one policy entry; only 'GaussianPolicy' is supported.

    Returns
    -------
    policy_fn: callable
        tf_util.function mapping a batch of observations to the expert's
        output actions.
    """
    print('################ Env: ', filename, '###################')
    # NOTE(review): pickle.loads executes arbitrary code on malicious input —
    # only load trusted expert files.
    with open(filename, 'rb') as f:
        data = pickle.loads(f.read())  # dict: {'nonlin_type', '<PolicyType>'}

    nonlin_type = data['nonlin_type']  # e.g. 'tanh'
    # The remaining key names the policy class.
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]  # 'GaussianPolicy'
    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)

    policy_params = data[policy_type]
    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Keep track of input and output dims (i.e. observation and action dims) for the user

    def build_policy(obs_bo):
        # Builds the forward graph: obs normalization -> hidden layers -> affine output.
        def read_layer(layer_data):
            '''Extract weight W and bias b (as float32) from one AffineLayer dict.'''
            assert list(layer_data.keys()) == ['AffineLayer']
            assert sorted(layer_data['AffineLayer'].keys()) == ['W', 'b']
            # ndarray.astype returns a copy cast to the requested dtype.
            return layer_data['AffineLayer']['W'].astype(np.float32), layer_data['AffineLayer']['b'].astype(np.float32)

        def apply_nonlin(x):
            '''Apply the activation named by `nonlin_type` (leaky relu or tanh).'''
            if nonlin_type == 'lrelu':
                return tf_util.lrelu(x, leak=.01)  # openai/imitation nn.py:233
            elif nonlin_type == 'tanh':
                return tf.tanh(x)
            else:
                raise NotImplementedError(nonlin_type)

        # Build the policy. First, observation normalization (standard score
        # (X - mu) / sigma, with sigma rebuilt from running statistics).
        assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']  # ndarray, shape (1, D)
        obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']  # ndarray, shape (1, D)
        # stdev = sqrt(E[X^2] - (E[X])^2); clipped at 0 for numerical safety.
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        print('observation mean, standard deviation shape: ', obsnorm_mean.shape, obsnorm_stdev.shape)
        # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
        normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6)

        curr_activations_bd = normedobs_bo

        # Hidden layers next (e.g. layer_0, layer_2), applied in sorted name order.
        assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = policy_params['hidden']['FeedforwardNet']  # dict of layer dicts
        for layer_name in sorted(layer_params.keys()):
            layer_data = layer_params[layer_name]  # dict with 'AffineLayer' -> {'W', 'b'}
            W, b = read_layer(layer_data)  # e.g. layer_0: (11, 64) (1, 64), layer_2: (64, 64) (1, 64)
            print(W.shape, b.shape)
            curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)
        print('----end---')

        # Output layer: single affine layer, no nonlinearity; the (1, A) bias
        # broadcasts over the batch dimension.
        W, b = read_layer(policy_params['out'])  # e.g. (64, 3) (1, 3)
        output_bo = tf.matmul(curr_activations_bd, W) + b  # shape (?, A)
        return output_bo

    # We create pairs of <observation, action>.
    obs_bo = tf.placeholder(tf.float32, [None, None])  # batch of observations
    a_ba = build_policy(obs_bo)  # batch of expert actions
    policy_fn = tf_util.function([obs_bo], a_ba)  # callable: obs batch -> action batch
    return policy_fn
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders for one batch of transitions (s, a, r, s', done)
        # plus per-sample importance weights (e.g. from prioritized replay).
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            # Double Q: select argmax action with the online net, evaluate it
            # with the target net.
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # Zero out the bootstrap value for terminal transitions.
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped); stop_gradient so the target
        # is treated as a constant during optimization.
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network.
        # Sorting by name pairs each online variable with its target counterpart.
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
            outputs=td_error,
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
def train_net(model, manager, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_iter = 6000001, check_every_n = 1000, loss_check_n = 10, save_model_freq = 5000, batch_size = 32):
    """Train the siamese VAE `model` on image pairs drawn from `manager`.

    Parameters
    ----------
    model: object
        must expose vaeloss, siam_loss, kl_loss1/2, reconst_error1/2,
        max_siam_loss, latent_z1/2, reconst1_mean/reconst2_mean and
        get_trainable_variables().
    manager: object
        dataset handle exposing `imgs` and `get_images(indices=...)`
        returning a pair of image batches.
    chkfile_name, logfile_name, validatefile_name: str
        directory names (relative to the current dir) for checkpoints,
        TensorBoard logs and validation images.
    entangled_feat: number
        number of entangled features; only used to normalize the siamese
        loss for its summary scalar.
    max_iter, check_every_n, loss_check_n, save_model_freq, batch_size: int
        iteration bound and reporting/saving frequencies.

    Side effects: builds TF ops on the default graph, restores/saves
    checkpoints, writes summaries and validation images. Returns None.
    """
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")
    # Testing placeholders (disabled):
    # img_test = U.get_placeholder_cached(name="img_test")
    # reconst_tp = U.get_placeholder_cached(name="reconst_tp")

    vae_loss = U.mean(model.vaeloss)
    latent_z1_tp = model.latent_z1
    latent_z2_tp = model.latent_z2

    # Scalar means of each loss component, in fixed order:
    # [total, siamese, kl1, kl2, reconst1, reconst2]
    losses = [U.mean(model.vaeloss),
              U.mean(model.siam_loss),
              U.mean(model.kl_loss1),
              U.mean(model.kl_loss2),
              U.mean(model.reconst_error1),
              U.mean(model.reconst_error2),
              ]
    siam_normal = losses[1] / entangled_feat
    siam_max = U.mean(model.max_siam_loss)
    tf.summary.scalar('Total Loss', losses[0])
    tf.summary.scalar('Siam Loss', losses[1])
    tf.summary.scalar('kl1_loss', losses[2])
    tf.summary.scalar('kl2_loss', losses[3])
    tf.summary.scalar('reconst_err1', losses[4])
    tf.summary.scalar('reconst_err2', losses[5])
    tf.summary.scalar('Siam Normal', siam_normal)
    tf.summary.scalar('Siam Max', siam_max)

    compute_losses = U.function([img1, img2], vae_loss)

    lr = 0.005
    optimizer = tf.train.AdagradOptimizer(learning_rate=lr)
    all_var_list = model.get_trainable_variables()
    img1_var_list = all_var_list
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=img1_var_list)
    merged = tf.summary.merge_all()
    # One training step: returns the six losses, both latents and the merged summary.
    train = U.function([img1, img2],
                       [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5],
                        latent_z1_tp, latent_z2_tp, merged],
                       updates = [optimize_expr1])
    get_reconst_img = U.function([img1, img2],
                                 [model.reconst1_mean, model.reconst2_mean, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])
    # BUG FIX: the line below referenced `reconst_tp`, whose placeholder creation
    # is commented out above, so building it raised NameError at function entry.
    # Disabled along with the rest of the testing path; `test_reconst` was never used.
    # test_reconst = U.function([reconst_tp], [model.reconst_test])

    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)

    train_writer = U.summary_writer(dir = log_save_dir)

    U.initialize()
    # Resume from the latest checkpoint; chk_file_num is the last saved iteration.
    saver, chk_file_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    validate_img_saver = BW_Img_Saver(validate_img_saver_dir)

    meta_saved = False

    training_images_list = manager.imgs
    n_total_train_data = len(training_images_list)

    for num_iter in range(chk_file_num + 1, max_iter):
        header("******* {}th iter: *******".format(num_iter))

        # Draw 2*batch_size distinct indices; the manager splits them into two batches.
        idx = random.sample(range(n_total_train_data), 2 * batch_size)
        [batch1, batch2] = manager.get_images(indices = idx)
        # Forward pass whose result is discarded; kept from the original code.
        # NOTE(review): looks removable — confirm it has no needed side effects.
        get_reconst_img(batch1, batch2)
        [loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(batch1, batch2)
        warn("Total Loss: {}".format(loss0))
        warn("Siam loss: {}".format(loss1))
        warn("kl1_loss: {}".format(loss2))
        warn("kl2_loss: {}".format(loss3))
        warn("reconst_err1: {}".format(loss4))
        warn("reconst_err2: {}".format(loss5))

        if num_iter % check_every_n == 1:
            # Save a small validation batch of originals and reconstructions.
            header("******* {}th iter: *******".format(num_iter))
            idx = random.sample(range(len(training_images_list)), 2 * 5)
            [images1, images2] = manager.get_images(indices = idx)
            [reconst1, reconst2, _, _] = get_reconst_img(images1, images2)
            for img_idx in range(len(images1)):
                sub_dir = "iter_{}".format(num_iter)
                save_img = images1[img_idx].reshape(64, 64).astype(np.float32)
                img_file_name = "{}_ori.jpg".format(img_idx)
                validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                save_img = reconst1[img_idx].reshape(64, 64).astype(np.float32)
                img_file_name = "{}_rec.jpg".format(img_idx)
                validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

        if num_iter % loss_check_n == 1:
            train_writer.add_summary(summary, num_iter)

        if num_iter > 11 and num_iter % save_model_freq == 1:
            # Write the meta graph only once; later saves store variables only.
            if meta_saved:
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint',
                           global_step = num_iter, write_meta_graph = False)
            else:
                # BUG FIX: was a Python-2-only `print` statement; the call form
                # works on both Python 2 and 3.
                print("Save meta graph")
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint',
                           global_step = num_iter, write_meta_graph = True)
                meta_saved = True