def factorReshape(Q, e, grad, facIndx=0, ftype='act'): grad_shape = grad.get_shape() if ftype == 'act': assert e.get_shape()[0] == grad_shape[facIndx] expanded_shape = [1, ] * len(grad_shape) expanded_shape[facIndx] = -1 e = tf.reshape(e, expanded_shape) if ftype == 'grad': assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] expanded_shape = [1, ] * len(grad_shape) expanded_shape[len(grad_shape) - facIndx - 1] = -1 e = tf.reshape(e, expanded_shape) return Q, e
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): assert reduce_dim is not None # weird batch matmul if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: # reshape reduce_dim to the left most dim in b b_shape = b.get_shape() if reduce_dim != 0: b_dims = list(range(len(b_shape))) b_dims.remove(reduce_dim) b_dims.insert(0, reduce_dim) b = tf.transpose(b, b_dims) b_t_shape = b.get_shape() b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) result = tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, b_t_shape) if reduce_dim != 0: b_dims = list(range(len(b_shape))) b_dims.remove(0) b_dims.insert(reduce_dim, 0) result = tf.transpose(result, b_dims) return result elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: # reshape reduce_dim to the right most dim in a a_shape = a.get_shape() outter_dim = len(a_shape) - 1 reduce_dim = len(a_shape) - reduce_dim - 1 if reduce_dim != outter_dim: a_dims = list(range(len(a_shape))) a_dims.remove(reduce_dim) a_dims.insert(outter_dim, reduce_dim) a = tf.transpose(a, a_dims) a_t_shape = a.get_shape() a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) result = tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, a_t_shape) if reduce_dim != outter_dim: a_dims = list(range(len(a_shape))) a_dims.remove(outter_dim) a_dims.insert(reduce_dim, outter_dim) result = tf.transpose(result, a_dims) return result elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) assert False, 'something went wrong'
def reshape_for_broadcasting(source, target): """Reshapes a tensor (source) to have the correct shape and dtype of the target before broadcasting it with MPI. """ dim = len(target.get_shape()) shape = ([1] * (dim - 1)) + [-1] return tf.reshape(tf.cast(source, target.dtype), shape)
def compute_gradients(self, loss, var_list, **kwargs): grads_and_vars = tf.train.AdamOptimizer.compute_gradients( self, loss, var_list, **kwargs) grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] flat_grad = tf.concat( [tf.reshape(g, (-1, )) for g, v in grads_and_vars], axis=0) shapes = [v.shape.as_list() for g, v in grads_and_vars] sizes = [int(np.prod(s)) for s in shapes] num_tasks = self.comm.Get_size() buf = np.zeros(sum(sizes), np.float32) def _collect_grads(flat_grad): self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) np.divide(buf, float(num_tasks), out=buf) return buf avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) avg_flat_grad.set_shape(flat_grad.shape) avg_grads = tf.split(avg_flat_grad, sizes, axis=0) avg_grads_and_vars = [(tf.reshape(g, v.shape), v) for g, (_, v) in zip(avg_grads, grads_and_vars)] return avg_grads_and_vars
def nn(input, layers_sizes, reuse=None, flatten=False, name=""): """Creates a simple neural network """ for i, size in enumerate(layers_sizes): activation = tf.nn.relu if i < len(layers_sizes) - 1 else None input = tf.layers.dense( inputs=input, units=size, kernel_initializer=tf.contrib.layers.xavier_initializer(), reuse=reuse, name=name + '_' + str(i)) if activation: input = activation(input) if flatten: assert layers_sizes[-1] == 1 input = tf.reshape(input, [-1]) return input
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2) gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f loss_bc= -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]]*2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads'] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def getKfacPrecondUpdates(self, gradlist, varlist): updatelist = [] vg = 0. assert len(self.stats) > 0 assert len(self.stats_eigen) > 0 assert len(self.factors) > 0 counter = 0 grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} for grad, var in zip(gradlist, varlist): GRAD_RESHAPE = False GRAD_TRANSPOSE = False fpropFactoredFishers = self.stats[var]['fprop_concat_stats'] bpropFactoredFishers = self.stats[var]['bprop_concat_stats'] if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0: counter += 1 GRAD_SHAPE = grad.get_shape() if len(grad.get_shape()) > 2: # reshape conv kernel parameters KW = int(grad.get_shape()[0]) KH = int(grad.get_shape()[1]) C = int(grad.get_shape()[2]) D = int(grad.get_shape()[3]) if len(fpropFactoredFishers) > 1 and self._channel_fac: # reshape conv kernel parameters into tensor grad = tf.reshape(grad, [KW * KH, C, D]) else: # reshape conv kernel parameters into 2D grad grad = tf.reshape(grad, [-1, D]) GRAD_RESHAPE = True elif len(grad.get_shape()) == 1: # reshape bias or 1D parameters D = int(grad.get_shape()[0]) grad = tf.expand_dims(grad, 0) GRAD_RESHAPE = True else: # 2D parameters C = int(grad.get_shape()[0]) D = int(grad.get_shape()[1]) if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. # TO-DO: figure out how to factorize bias grad # stack bias grad var_assnBias = self.stats[var]['assnBias'] grad = tf.concat( [grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0) # project gradient to eigen space and reshape the eigenvalues # for broadcasting eigVals = [] for idx, stats in enumerate( self.stats[var]['fprop_concat_stats']): Q = self.stats_eigen[stats]['Q'] e = detectMinVal(self.stats_eigen[stats]['e'], var, name='act', debug=KFAC_DEBUG) Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act') eigVals.append(e) grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx) for idx, stats in enumerate( self.stats[var]['bprop_concat_stats']): Q = self.stats_eigen[stats]['Q'] e = detectMinVal(self.stats_eigen[stats]['e'], var, name='grad', debug=KFAC_DEBUG) Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad') eigVals.append(e) grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx) ## ##### # whiten using eigenvalues weightDecayCoeff = 0. if var in self._weight_decay_dict: weightDecayCoeff = self._weight_decay_dict[var] if KFAC_DEBUG: print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff))) if self._factored_damping: if KFAC_DEBUG: print(('use factored damping for %s' % (var.name))) coeffs = 1. num_factors = len(eigVals) # compute the ratio of two trace norm of the left and right # KFac matrices, and their generalization if len(eigVals) == 1: damping = self._epsilon + weightDecayCoeff else: damping = tf.pow(self._epsilon + weightDecayCoeff, 1. / num_factors) eigVals_tnorm_avg = [ tf.reduce_mean(tf.abs(e)) for e in eigVals ] for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg): eig_tnorm_negList = [ item for item in eigVals_tnorm_avg if item != e_tnorm ] if len(eigVals) == 1: adjustment = 1. elif len(eigVals) == 2: adjustment = tf.sqrt(e_tnorm / eig_tnorm_negList[0]) else: eig_tnorm_negList_prod = reduce( lambda x, y: x * y, eig_tnorm_negList) adjustment = tf.pow( tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors) coeffs *= (e + adjustment * damping) else: coeffs = 1. damping = (self._epsilon + weightDecayCoeff) for e in eigVals: coeffs *= e coeffs += damping #grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()]) grad /= coeffs #grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()]) ##### # project gradient back to euclidean space for idx, stats in enumerate( self.stats[var]['fprop_concat_stats']): Q = self.stats_eigen[stats]['Q'] grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx) for idx, stats in enumerate( self.stats[var]['bprop_concat_stats']): Q = self.stats_eigen[stats]['Q'] grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx) ## #grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()]) if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. # TO-DO: figure out how to factorize bias grad # un-stack bias grad var_assnBias = self.stats[var]['assnBias'] C_plus_one = int(grad.get_shape()[0]) grad_assnBias = tf.reshape( tf.slice(grad, begin=[C_plus_one - 1, 0], size=[1, -1]), var_assnBias.get_shape()) grad_assnWeights = tf.slice(grad, begin=[0, 0], size=[C_plus_one - 1, -1]) grad_dict[var_assnBias] = grad_assnBias grad = grad_assnWeights #grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()]) if GRAD_RESHAPE: grad = tf.reshape(grad, GRAD_SHAPE) grad_dict[var] = grad print(('projecting %d gradient matrices' % counter)) for g, var in zip(gradlist, varlist): grad = grad_dict[var] ### clipping ### if KFAC_DEBUG: print(('apply clipping to %s' % (var.name))) tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr)) vg += local_vg # recale everything if KFAC_DEBUG: print('apply vFv clipping') scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg)) if KFAC_DEBUG: scaling = tf.Print(scaling, [ tf.convert_to_tensor('clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg ]) with tf.control_dependencies([tf.assign(self.vFv, vg)]): updatelist = [grad_dict[var] for var in varlist] for i, item in enumerate(updatelist): updatelist[i] = scaling * item return updatelist
def compute_stats(self, loss_sampled, var_list=None): varlist = var_list if varlist is None: varlist = tf.trainable_variables() gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled') self.gs = gs factors = self.getFactors(gs, varlist) stats = self.getStats(factors, varlist) updateOps = [] statsUpdates = {} statsUpdates_cache = {} for var in varlist: opType = factors[var]['opName'] fops = factors[var]['op'] fpropFactor = factors[var]['fpropFactors_concat'] fpropStats_vars = stats[var]['fprop_concat_stats'] bpropFactor = factors[var]['bpropFactors_concat'] bpropStats_vars = stats[var]['bprop_concat_stats'] SVD_factors = {} for stats_var in fpropStats_vars: stats_var_dim = int(stats_var.get_shape()[0]) if stats_var not in statsUpdates_cache: old_fpropFactor = fpropFactor B = (tf.shape(fpropFactor)[0]) # batch size if opType == 'Conv2D': strides = fops.get_attr("strides") padding = fops.get_attr("padding") convkernel_size = var.get_shape()[0:3] KH = int(convkernel_size[0]) KW = int(convkernel_size[1]) C = int(convkernel_size[2]) flatten_size = int(KH * KW * C) Oh = int(bpropFactor.get_shape()[1]) Ow = int(bpropFactor.get_shape()[2]) if Oh == 1 and Ow == 1 and self._channel_fac: # factorization along the channels # assume independence among input channels # factor = B x 1 x 1 x (KH xKW x C) # patches = B x Oh x Ow x (KH xKW x C) if len(SVD_factors) == 0: if KFAC_DEBUG: print(( 'approx %s act factor with rank-1 SVD factors' % (var.name))) # find closest rank-1 approx to the feature map S, U, V = tf.batch_svd( tf.reshape(fpropFactor, [-1, KH * KW, C])) # get rank-1 approx slides sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW full_factor_shape = fpropFactor.get_shape() patches_k.set_shape( [full_factor_shape[0], KH * KW]) patches_c = V[:, :, 0] * sqrtS1 # B x C patches_c.set_shape([full_factor_shape[0], C]) SVD_factors[C] = patches_c SVD_factors[KH * KW] = patches_k fpropFactor = SVD_factors[stats_var_dim] else: # poor mem usage implementation patches = tf.extract_image_patches( fpropFactor, ksizes=[ 1, convkernel_size[0], convkernel_size[1], 1 ], strides=strides, rates=[1, 1, 1, 1], padding=padding) if self._approxT2: if KFAC_DEBUG: print(('approxT2 act fisher for %s' % (var.name))) # T^2 terms * 1/T^2, size: B x C fpropFactor = tf.reduce_mean(patches, [1, 2]) else: # size: (B x Oh x Ow) x C fpropFactor = tf.reshape( patches, [-1, flatten_size]) / Oh / Ow fpropFactor_size = int(fpropFactor.get_shape()[-1]) if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias: if opType == 'Conv2D' and not self._approxT2: # correct padding for numerical stability (we # divided out OhxOw from activations for T1 approx) fpropFactor = tf.concat([ fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1]) / Oh / Ow ], 1) else: # use homogeneous coordinates fpropFactor = tf.concat([ fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1]) ], 1) # average over the number of data points in a batch # divided by B cov = tf.matmul(fpropFactor, fpropFactor, transpose_a=True) / tf.cast(B, tf.float32) updateOps.append(cov) statsUpdates[stats_var] = cov if opType != 'Conv2D': # HACK: for convolution we recompute fprop stats for # every layer including forking layers statsUpdates_cache[stats_var] = cov for stats_var in bpropStats_vars: stats_var_dim = int(stats_var.get_shape()[0]) if stats_var not in statsUpdates_cache: old_bpropFactor = bpropFactor bpropFactor_shape = bpropFactor.get_shape() B = tf.shape(bpropFactor)[0] # batch size C = int(bpropFactor_shape[-1]) # num channels if opType == 'Conv2D' or len(bpropFactor_shape) == 4: if fpropFactor is not None: if self._approxT2: if KFAC_DEBUG: print(('approxT2 grad fisher for %s' % (var.name))) bpropFactor = tf.reduce_sum( bpropFactor, [1, 2]) # T^2 terms * 1/T^2 else: bpropFactor = tf.reshape( bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms else: # just doing block diag approx. spatial independent # structure does not apply here. summing over # spatial locations if KFAC_DEBUG: print(('block diag approx fisher for %s' % (var.name))) bpropFactor = tf.reduce_sum(bpropFactor, [1, 2]) # assume sampled loss is averaged. TO-DO:figure out better # way to handle this bpropFactor *= tf.to_float(B) ## cov_b = tf.matmul(bpropFactor, bpropFactor, transpose_a=True) / tf.to_float( tf.shape(bpropFactor)[0]) updateOps.append(cov_b) statsUpdates[stats_var] = cov_b statsUpdates_cache[stats_var] = cov_b if KFAC_DEBUG: aKey = list(statsUpdates.keys())[0] statsUpdates[aKey] = tf.Print(statsUpdates[aKey], [ tf.convert_to_tensor('step:'), self.global_step, tf.convert_to_tensor('computing stats'), ]) self.statsUpdates = statsUpdates return statsUpdates
def multi_modal_network(dim_input=27, dim_output=7, batch_size=25, network_config=None): """ An example a network in tf that has both state and image inputs. Args: dim_input: Dimensionality of input. dim_output: Dimensionality of the output. batch_size: Batch size. network_config: dictionary of network structure parameters Returns: A tfMap object that stores inputs, outputs, and scalar loss. """ n_layers = 2 layer_size = 20 dim_hidden = (n_layers - 1)*[layer_size] dim_hidden.append(dim_output) pool_size = 2 filter_size = 3 # List of indices for state (vector) data and image (tensor) data in observation. x_idx, img_idx, i = [], [], 0 for sensor in network_config['obs_include']: dim = network_config['sensor_dims'][sensor] if sensor in network_config['obs_image_data']: img_idx = img_idx + list(range(i, i+dim)) else: x_idx = x_idx + list(range(i, i+dim)) i += dim nn_input, action, precision = get_input_layer(dim_input, dim_output) state_input = nn_input[:, 0:x_idx[-1]+1] image_input = nn_input[:, x_idx[-1]+1:img_idx[-1]+1] # image goes through 2 convnet layers num_filters = network_config['num_filters'] im_height = network_config['image_height'] im_width = network_config['image_width'] num_channels = network_config['image_channels'] image_input = tf.reshape(image_input, [-1, im_width, im_height, num_channels]) # we pool twice, each time reducing the image size by a factor of 2. conv_out_size = int(im_width/(2.0*pool_size)*im_height/(2.0*pool_size)*num_filters[1]) first_dense_size = conv_out_size + len(x_idx) # Store layers weight & bias weights = { 'wc1': get_xavier_weights([filter_size, filter_size, num_channels, num_filters[0]], (pool_size, pool_size)), # 5x5 conv, 1 input, 32 outputs 'wc2': get_xavier_weights([filter_size, filter_size, num_filters[0], num_filters[1]], (pool_size, pool_size)), # 5x5 conv, 32 inputs, 64 outputs } biases = { 'bc1': init_bias([num_filters[0]]), 'bc2': init_bias([num_filters[1]]), } conv_layer_0 = conv2d(img=image_input, w=weights['wc1'], b=biases['bc1']) conv_layer_0 = max_pool(conv_layer_0, k=pool_size) conv_layer_1 = conv2d(img=conv_layer_0, w=weights['wc2'], b=biases['bc2']) conv_layer_1 = max_pool(conv_layer_1, k=pool_size) conv_out_flat = tf.reshape(conv_layer_1, [-1, conv_out_size]) fc_input = tf.concat(concat_dim=1, values=[conv_out_flat, state_input]) fc_output, _, _ = get_mlp_layers(fc_input, n_layers, dim_hidden) loss = euclidean_loss_layer(a=action, b=fc_output, precision=precision, batch_size=batch_size) return TfMap.init_from_lists([nn_input, action, precision], [fc_output], [loss])
def multi_modal_network_fp(dim_input=27, dim_output=7, batch_size=25, network_config=None): """ An example a network in tf that has both state and image inputs, with the feature point architecture (spatial softmax + expectation). Args: dim_input: Dimensionality of input. dim_output: Dimensionality of the output. batch_size: Batch size. network_config: dictionary of network structure parameters Returns: A tfMap object that stores inputs, outputs, and scalar loss. """ n_layers = 3 layer_size = 20 dim_hidden = (n_layers - 1)*[layer_size] dim_hidden.append(dim_output) pool_size = 2 filter_size = 5 # List of indices for state (vector) data and image (tensor) data in observation. x_idx, img_idx, i = [], [], 0 for sensor in network_config['obs_include']: dim = network_config['sensor_dims'][sensor] if sensor in network_config['obs_image_data']: img_idx = img_idx + list(range(i, i+dim)) else: x_idx = x_idx + list(range(i, i+dim)) i += dim nn_input, action, precision = get_input_layer(dim_input, dim_output) state_input = nn_input[:, 0:x_idx[-1]+1] image_input = nn_input[:, x_idx[-1]+1:img_idx[-1]+1] # image goes through 3 convnet layers num_filters = network_config['num_filters'] im_height = network_config['image_height'] im_width = network_config['image_width'] num_channels = network_config['image_channels'] image_input = tf.reshape(image_input, [-1, num_channels, im_width, im_height]) image_input = tf.transpose(image_input, perm=[0,3,2,1]) # we pool twice, each time reducing the image size by a factor of 2. conv_out_size = int(im_width/(2.0*pool_size)*im_height/(2.0*pool_size)*num_filters[1]) first_dense_size = conv_out_size + len(x_idx) # Store layers weight & bias with tf.variable_scope('conv_params'): weights = { 'wc1': init_weights([filter_size, filter_size, num_channels, num_filters[0]], name='wc1'), # 5x5 conv, 1 input, 32 outputs 'wc2': init_weights([filter_size, filter_size, num_filters[0], num_filters[1]], name='wc2'), # 5x5 conv, 32 inputs, 64 outputs 'wc3': init_weights([filter_size, filter_size, num_filters[1], num_filters[2]], name='wc3'), # 5x5 conv, 32 inputs, 64 outputs } biases = { 'bc1': init_bias([num_filters[0]], name='bc1'), 'bc2': init_bias([num_filters[1]], name='bc2'), 'bc3': init_bias([num_filters[2]], name='bc3'), } conv_layer_0 = conv2d(img=image_input, w=weights['wc1'], b=biases['bc1'], strides=[1,2,2,1]) conv_layer_1 = conv2d(img=conv_layer_0, w=weights['wc2'], b=biases['bc2']) conv_layer_2 = conv2d(img=conv_layer_1, w=weights['wc3'], b=biases['bc3']) _, num_rows, num_cols, num_fp = conv_layer_2.get_shape() num_rows, num_cols, num_fp = [int(x) for x in [num_rows, num_cols, num_fp]] x_map = np.empty([num_rows, num_cols], np.float32) y_map = np.empty([num_rows, num_cols], np.float32) for i in range(num_rows): for j in range(num_cols): x_map[i, j] = (i - num_rows / 2.0) / num_rows y_map[i, j] = (j - num_cols / 2.0) / num_cols x_map = tf.convert_to_tensor(x_map) y_map = tf.convert_to_tensor(y_map) x_map = tf.reshape(x_map, [num_rows * num_cols]) y_map = tf.reshape(y_map, [num_rows * num_cols]) # rearrange features to be [batch_size, num_fp, num_rows, num_cols] features = tf.reshape(tf.transpose(conv_layer_2, [0,3,1,2]), [-1, num_rows*num_cols]) softmax = tf.nn.softmax(features) fp_x = tf.reduce_sum(tf.mul(x_map, softmax), [1], keep_dims=True) fp_y = tf.reduce_sum(tf.mul(y_map, softmax), [1], keep_dims=True) fp = tf.reshape(tf.concat(1, [fp_x, fp_y]), [-1, num_fp*2]) fc_input = tf.concat(concat_dim=1, values=[fp, state_input]) fc_output, weights_FC, biases_FC = get_mlp_layers(fc_input, n_layers, dim_hidden) fc_vars = weights_FC + biases_FC loss = euclidean_loss_layer(a=action, b=fc_output, precision=precision, batch_size=batch_size) nnet = TfMap.init_from_lists([nn_input, action, precision], [fc_output], [loss], fp=fp) last_conv_vars = fc_input return nnet, fc_vars, last_conv_vars
def flatten_grads(var_list, grads): """Flattens a variables and their gradients. """ return tf.concat( [tf.reshape(grad, [U.numel(v)]) for (v, grad) in zip(var_list, grads)], 0)
def learn(env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) batch_size = len(ob) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via tensorflow_code-pytorch.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi