def pd(self, obs):
    """Build the policy's action distribution from an observation tensor."""
    mb_input = obs
    name = self.name
    activ = self.activ
    nh = self.nh
    with tf.variable_scope(name):
        pi_1 = activ(fc(mb_input, scope='pi_1', nh=nh, init_scale=np.sqrt(2)))
        pi_2 = activ(fc(pi_1, scope='pi_2', nh=nh, init_scale=np.sqrt(2)))
        pd, _ = self.pdtype.pdfromlatent(pi_2)
    return pd
def __init__(self, obs, pi, trajectories, pdtype, name='Model_Based', nh=64,
             activ=tf.nn.tanh):
    """Model-based head: conditions the policy and value on the observation,
    the model-free policy features and the imagined trajectories."""
    self.pdtype = pdtype
    mb_input = tf.concat([obs, pi, trajectories], axis=-1)
    with tf.variable_scope(name):
        pi_1 = activ(fc(mb_input, scope='pi_1', nh=nh, init_scale=np.sqrt(2)))
        pi_2 = activ(fc(pi_1, scope='pi_2', nh=nh, init_scale=np.sqrt(2)))
        vf_1 = activ(fc(mb_input, scope='vf_1', nh=nh, init_scale=np.sqrt(2)))
        vf_2 = activ(fc(vf_1, scope='vf_2', nh=1, init_scale=np.sqrt(2)))
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_2)
        self.act = self.pd.sample()
        self.vf = vf_2
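A minimal wiring sketch for the model-based head above. `ModelBasedPolicy` is a stand-in name for the class that owns this `__init__`, the placeholder shapes and the action count of 4 are illustrative only, and `CategoricalPdType` refers to a baselines-style pdtype (or this repo's modified equivalent); none of these names are fixed by the original code.

import numpy as np
import tensorflow as tf

obs_ph  = tf.placeholder(tf.float32, [None, 64])      # embedded observation
pi_ph   = tf.placeholder(tf.float32, [None, 64])      # model-free policy features
traj_ph = tf.placeholder(tf.float32, [None, 8 * 64])  # flattened imagined rollout

mb = ModelBasedPolicy(obs_ph, pi_ph, traj_ph, pdtype=CategoricalPdType(4))
action_op, value_op = mb.act, mb.vf                   # sampled action / value estimate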
def nature_cnn(unscaled_images, **conv_kwargs):
    """ CNN from Nature paper. """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4,
                   init_scale=np.sqrt(2), **conv_kwargs))
    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2,
                    init_scale=np.sqrt(2), **conv_kwargs))
    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1,
                    init_scale=np.sqrt(2), **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
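For reference, a sketch of how nature_cnn is typically applied; the 84x84x4 Atari frame shape is an assumption, and `conv`, `fc`, `conv_to_fc` are the baselines-style helpers this module already imports.

obs_ph = tf.placeholder(tf.uint8, [None, 84, 84, 4], name='obs')
with tf.variable_scope('cnn_embed'):
    features = nature_cnn(obs_ph)   # uint8 frames in, (batch, 512) float features out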
def __init__(self, obs, nactions, actions, nobs, rewards, policy,
             trajectory_length=8, name='Env_Model', LR=tf.constant(1e-4),
             nh=64, nout=64, vcoef=0.5, activ=tf.nn.tanh, max_grad=0.5):
    all_trajectories = []
    all_rewards = []
    # Rollout graph: for every initial action, imagine `trajectory_length` steps
    # by feeding the predicted next state back into the model.
    for action in range(nactions):
        # Broadcast the fixed initial action across the batch.
        action_list = [tf.fill([tf.shape(obs)[0]], action)]
        rollout_obs = [obs]
        rollout_rews = []
        for t in range(trajectory_length):
            x_in = tf.concat([rollout_obs[t],
                              tf.one_hot(action_list[t], nactions)], axis=-1)
            with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
                ns_1 = activ(fc(x_in, 'ns_1', nh, init_scale=np.sqrt(2)))
                ns_2 = tf.nn.sigmoid(fc(ns_1, 'ns_2', nout, init_scale=np.sqrt(2)))
                vf_1 = activ(fc(x_in, 'vf_1', nh, init_scale=np.sqrt(2)))
                vf_2 = activ(fc(vf_1, 'vf_2', 1, init_scale=np.sqrt(2)))
            rollout_obs.append(ns_2)
            rollout_rews.append(vf_2)
            # Later actions along the imagined rollout are drawn from the policy.
            action = policy.pd(rollout_obs[t + 1]).sample()
            action_list.append(action)
        all_trajectories.append(tf.stack(rollout_obs[1:]))
        all_rewards.append(tf.stack(rollout_rews))
    # Expose the imagined rollouts so a model-based head can consume them.
    self.all_trajectories = all_trajectories
    self.all_rewards = all_rewards

    # Training graph: predict next observation and reward for observed transitions,
    # sharing weights with the rollout graph above.
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        X_IN = tf.concat([obs, tf.one_hot(actions, nactions)], axis=-1)
        ns_1 = activ(fc(X_IN, 'ns_1', nh, init_scale=np.sqrt(2)))
        ns_2 = tf.nn.sigmoid(fc(ns_1, 'ns_2', nout, init_scale=np.sqrt(2)))
        vf_1 = activ(fc(X_IN, 'vf_1', nh, init_scale=np.sqrt(2)))
        vf_2 = activ(fc(vf_1, 'vf_2', 1, init_scale=np.sqrt(2)))
    prediction_loss = tf.reduce_mean(tf.reduce_sum(tf.square(ns_2 - nobs), axis=-1))
    value_loss = tf.reduce_mean(tf.reduce_sum(tf.square(vf_2 - rewards), axis=-1))
    env_loss = prediction_loss + vcoef * value_loss
    optimizer = tf.train.AdamOptimizer(LR)
    params = tf.trainable_variables()
    grads = tf.gradients(env_loss, params)
    if max_grad is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad)
    grads = list(zip(grads, params))
    self.trainer = optimizer.apply_gradients(grads)
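A sketch of how the environment model above might be trained on stored transitions. `EnvModel` is a stand-in name for the class owning this `__init__`, the placeholder shapes are illustrative, and `policy` and the `b_*` NumPy batches are assumed to be defined elsewhere.

obs_ph  = tf.placeholder(tf.float32, [None, 64], name='obs')
act_ph  = tf.placeholder(tf.int32,   [None],     name='actions')
nobs_ph = tf.placeholder(tf.float32, [None, 64], name='next_obs')
rew_ph  = tf.placeholder(tf.float32, [None, 1],  name='rewards')

env_model = EnvModel(obs_ph, nactions=4, actions=act_ph, nobs=nobs_ph,
                     rewards=rew_ph, policy=policy)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # b_obs, b_act, b_nobs, b_rew: NumPy batches sampled from stored rollouts
    sess.run(env_model.trainer, {obs_ph: b_obs, act_ph: b_act,
                                 nobs_ph: b_nobs, rew_ph: b_rew})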
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale,
              init_bias=init_bias, r=self.r)
    logstd = tf.get_variable(name='logstd', shape=[1, self.size],
                             initializer=tf.constant_initializer(-1.))
    pdparam = tf.concat([mean, tf.zeros_like(mean) + logstd], axis=-1)
    return self.pdfromflat(pdparam), mean
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale,
                 init_bias=init_bias, r=self.r)
    return self.pdfromflat(pdparam), pdparam
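Both `pdfromlatent` variants return a distribution object plus its parameters (the mean for the diagonal Gaussian, the flat logits for the categorical). A typical call site, assuming a pdtype constructed elsewhere (e.g. a categorical pdtype over `n` actions) and a latent feature tensor, looks roughly like this:

latent = tf.placeholder(tf.float32, [None, 64])      # features from the policy trunk
with tf.variable_scope('pi_head'):
    pd, logits = pdtype.pdfromlatent(latent, init_scale=0.01)
action  = pd.sample()          # one action per batch element
neglogp = pd.neglogp(action)   # -log pi(a|s), used in the policy-gradient loss
entropy = pd.entropy()         # entropy bonus term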
def __call__(self, tin):
    embed = tf.layers.batch_normalization(tin, reuse=tf.AUTO_REUSE,
                                          **self.batch_norm_kwargs)
    for i in range(self.layers):
        embed = self.activ(fc(embed, 'em_fc' + str(i), nh=self.nh,
                              init_scale=np.sqrt(2), r=self.r))
    tout = embed
    return tout
def __call__(self, tin):
    embed = nature_cnn(tin, **self.conv_kwargs)
    for i in range(self.layers):
        embed = self.activ(fc(embed, 'em_fc' + str(i), nh=self.nh,
                              init_scale=np.sqrt(2)))
    tout = embed
    return tout
def __call__(self, tin):
    embed = tin
    for i in range(self.layers):
        embed = self.activ(fc(embed, 'em_fc' + str(i), nh=self.nh,
                              init_scale=np.sqrt(2), r=self.r))
    tout = embed
    return tout
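The three `__call__` variants above (batch-norm MLP, CNN, and plain MLP embedders) share the same interface: a tensor in, an embedded tensor out. A call-site sketch for the plain-MLP variant, where `MlpEmbedder` and its constructor arguments are hypothetical names chosen only for illustration:

embedder = MlpEmbedder(layers=2, nh=64, activ=tf.nn.relu, r=False)  # hypothetical ctor
obs_ph = tf.placeholder(tf.float32, [None, 32])
with tf.variable_scope('embed'):
    emb = embedder(obs_ph)   # (batch, 64) embedding fed to the policy levels below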
def __init__(self, mgoal, state, pstate, pdtype=None, nhist=4, nin=32,
             ngoal=16, nembed=8, manager=False, nh=64, activ=tf.nn.relu,
             name=1, nbatch=1, neplength=100, cell=tf.contrib.rnn.LSTMCell,
             val=False, recurrent=1):
    '''
    INPUTS:
        mgoal  - goal tensor of the supervisor level
        state  - observation tensor, post-embedding
        pstate - initial recurrent (LSTM) state tensor
    '''
    self.mgoal = mgoal[:, :, :nin]
    self.state = state
    self.pstate = pstate
    #state = tf.concat([self.mgoal, self.state], axis=-1)
    nph = nh
    self.manager = manager
    self.name = name
    nout = ngoal if manager else nh
    self.pdtype = pdtype
    with tf.variable_scope("level" + str(self.name)):
        em_h2 = activ(fc(state, 'em_fc2', nh=nout, init_scale=np.sqrt(2), r=True))
        embed_goal = activ(fc(self.mgoal, 'embed', nh=nph, init_scale=np.sqrt(2), r=True))
        cell = cell(nh, state_is_tuple=False)
        a_h1, nstate = tf.nn.dynamic_rnn(cell, inputs=state,
                                         initial_state=pstate[:, 0, :])
        c_h1 = activ(a_h1)
        pi_h2 = activ(fc(c_h1, 'pi_fc2', nh=nph, init_scale=np.sqrt(2), r=True))
        vf_h2 = activ(fc(c_h1, 'vf_fc2', nh=nh, init_scale=np.sqrt(2), r=True))
        vout = tf.nn.tanh(fc(vf_h2, 'vf', 1, r=True))[:, :, 0]
        pout = embed_goal + pi_h2
        #pout = pi_h2
        self.pd, self.pi = self.pdtype.pdfromlatent(pout, init_scale=0.01)
        aout = self.pd.sample()
        neglogpout = self.pd.neglogp(aout)
        self.nstate = nstate
        self.aout = aout
        self.nlp = neglogpout

    def bcs(state, spad, gpad, nhist):
        # Intrinsic reward: cosine similarity between the realised change in the
        # embedded state over each of the last `nhist` steps and the goal that
        # was emitted at that earlier step.
        rew = tf.zeros(shape=(nbatch, neplength), dtype=tf.float32)
        for t in range(nhist):
            svec = state - spad[:, nhist - t - 1:-(t + 1), :]
            gvec = gpad[:, nhist - t - 1:-(t + 1), :]
            nsv = tf.nn.l2_normalize(svec, axis=-1)
            ngv = tf.nn.l2_normalize(gvec, axis=-1)
            rew += tf.reduce_sum(tf.multiply(nsv, ngv), axis=-1)
        return rew

    def fcs(fvec, gvec, nhist):
        # Cosine similarity between the future state change and the current goal.
        nfv = tf.nn.l2_normalize(fvec, axis=-1)
        ngv = tf.nn.l2_normalize(gvec, axis=-1)
        sim = tf.reduce_sum(tf.multiply(nfv, ngv), axis=-1)
        return sim

    self.vf = vout
    if self.manager:
        pad = tf.constant([[0, 0], [nhist, 0], [0, 0]])
        spad = tf.pad(em_h2, pad, "CONSTANT")
        gpad = tf.pad(aout, pad, "CONSTANT")
        self.inr = 1 / nhist * tf.stop_gradient(bcs(em_h2, spad, gpad, nhist))
        lstate = em_h2[:, -1, :]
        rep = tf.reshape(tf.tile(lstate, [nhist, 1]), (nbatch, nhist, nout))
        spadf = tf.concat([em_h2, rep], axis=1)
        self.fvec = spadf[:, nhist:, ] - em_h2
        self.traj_sim = fcs(self.fvec, aout, nhist)
def __init__(self, mgoal, state, pstate, pdtype=None, nhist=4, nin=32,
             ngoal=16, recurrent=0, nembed=8, manager=False, nh=64,
             activ=tf.nn.relu, name=1, nbatch=1000, val=True, feed_fvec=None):
    '''
    INPUTS:
        mgoal     - goal tensor of the supervisor level
        state     - observation tensor, post-embedding
        pstate    - recurrent state tensor, ignored in this (feed-forward) call
        feed_fvec - externally supplied future-state vectors, used for the
                    manager's neglogp loss (loss_nlp)
    '''
    self.mgoal = mgoal[:, :nin]
    self.state = state
    #state = tf.concat([self.mgoal, self.state], axis=-1)
    nph = nh
    self.manager = manager
    self.name = name
    self.initial_state = None
    nout = ngoal if manager else nh
    self.nout = nout
    self.pdtype = pdtype
    with tf.variable_scope("level" + str(self.name)):
        em_h2 = fc(state, 'em_fc2', nh=nout, init_scale=np.sqrt(2))
        embed_goal = fc(self.mgoal, 'embed', nh=nh, init_scale=np.sqrt(2))
        pi_h1 = activ(fc(em_h2, 'pi_fc1', nh=nh, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=nh, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(state, 'vf_fc1', nh=nh, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=nh, init_scale=np.sqrt(2)))
        pout = embed_goal * pi_h2
        vout = tf.nn.tanh(fc(vf_h2, 'vf', 1))[:, 0]
        #pout = pi_h2
        self.pd, self.pi = self.pdtype.pdfromlatent(pout, init_scale=0.01)
        aout = self.pd.sample()
        neglogpout = self.pd.neglogp(aout)
        self.aout = aout
        self.nlp = neglogpout
        self.nstate = None

    def bcs(state, spad, gpad, nhist):
        # Dense intrinsic reward: cosine similarity between the realised state
        # change over each of the last `nhist` steps and the goal set then.
        rew = tf.fill([nbatch], 0.0)
        for t in range(nhist):
            svec = state - spad[nhist - t - 1:-(t + 1), :]
            gvec = gpad[nhist - t - 1:-(t + 1), :]
            nsv = tf.nn.l2_normalize(svec, axis=-1)
            ngv = tf.nn.l2_normalize(gvec, axis=-1)
            rew += tf.reduce_sum(tf.multiply(nsv, ngv), axis=-1)
        return rew

    def sparse_bcs(state, spad, gpad, nhist, axis=0):
        # Sparse variant: reward 1 whenever the state change exactly matches a
        # non-zero goal, summed over the last `nhist` steps.
        rew = tf.fill([nbatch], 0.0)
        for t in range(nhist):
            if axis == 1:
                svec = state - spad[:, nhist - t - 1:-(t + 1), :]
                gvec = gpad[:, nhist - t - 1:-(t + 1), :]
            else:
                svec = state - spad[nhist - t - 1:-(t + 1), :]
                gvec = gpad[nhist - t - 1:-(t + 1), :]
            delta_gs = tf.to_float(
                tf.equal(
                    tf.reduce_mean(tf.to_float(tf.equal(svec, gvec)), axis=-1),
                    1.))
            # Mask out all-zero goals so padding never produces a reward.
            zero_mask = tf.to_float(
                tf.equal(
                    tf.reduce_mean(tf.to_float(tf.equal(tf.zeros_like(gvec), gvec)),
                                   axis=-1),
                    1.))
            delta_gs *= (1. - zero_mask)
            rew += delta_gs
        return rew

    def fcs(fvec, gvec, nhist):
        # Cosine similarity between the future state change and the current goal.
        nfv = tf.nn.l2_normalize(fvec, axis=-1)
        ngv = tf.nn.l2_normalize(gvec, axis=-1)
        sim = tf.reduce_sum(tf.multiply(nfv, ngv), axis=-1)
        return sim

    self.vf = vout
    if self.manager:
        pad = tf.constant([[nhist, 0], [0, 0]])
        spad = tf.pad(em_h2, pad, "CONSTANT")
        gpad = tf.pad(aout, pad, "CONSTANT")
        self.inr = 1 / nhist * tf.stop_gradient(bcs(em_h2, spad, gpad, nhist))
        lstate = em_h2[-1, :]
        rep = tf.reshape(tf.tile(lstate, tf.constant([nhist])), (nhist, nout))
        spadf = tf.concat([em_h2, rep], axis=0)
        self.fvec = spadf[nhist:, ] - em_h2
        self.train_nlp = self.pd.neglogp(
            tf.nn.l2_normalize(tf.stop_gradient(self.fvec), axis=-1))
        self.loss_nlp = self.pd.neglogp(
            tf.nn.l2_normalize(tf.stop_gradient(feed_fvec), axis=-1))
        self.traj_sim = fcs(self.fvec, aout, nhist)
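The `bcs` helper in both policy levels computes a FeUdal-style intrinsic reward: the cosine similarity between how the embedded state actually moved over each of the last `nhist` steps and the goal the manager emitted at that earlier step. A NumPy sketch of the same quantity for a single time index `t` (illustration only, not part of the graph code; the graph version zero-pads instead of skipping early steps):

import numpy as np

def intrinsic_reward(states, goals, t, nhist, eps=1e-8):
    # states: (T, d) embedded states; goals: (T, d) manager goals
    r = 0.0
    for i in range(1, nhist + 1):
        if t - i < 0:
            continue                       # zero-padding in the graph version
        delta = states[t] - states[t - i]  # realised state change over i steps
        g = goals[t - i]                   # goal emitted i steps ago
        r += delta @ g / (np.linalg.norm(delta) * np.linalg.norm(g) + eps)
    return r / nhist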