def quantized_bprop(self, cost):
    """
    bprop for the convolution layer equals:
    (
        self.x.dimshuffle(1, 0, 2, 3)
        (*)
        T.grad(cost, wrt=#convoutput).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    ).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    where '(*)' stands for convolution.
    Here we quantize the representation of the previous layer (self.x)
    and leave the rest as it is.
    """
    # the lower power-of-two exponent: 2**index_low <= |x| < 2**(index_low + 1)
    index_low = T.switch(self.x > 0.,
                         T.floor(T.log2(self.x)), T.floor(T.log2(-self.x)))
    index_low = T.clip(index_low, -4, 3)
    sign = T.switch(self.x > 0., 1., -1.)
    # index_up = index_low + 1  # the upper exponent, not used explicitly.
    p_up = sign * self.x / 2**(index_low) - 1  # probability of rounding up.
    srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
    index_random = index_low + srng.binomial(
        n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
    quantized_rep = sign * 2**index_random

    error = T.grad(cost=cost, wrt=self.conv_z)

    self.dEdW = T.nnet.conv.conv2d(
        input=quantized_rep.dimshuffle(1, 0, 2, 3),
        filters=error.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    ).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
    self.dEdb = T.grad(cost=cost, wrt=self.b)
    if self.BN == True:
        self.dEda = T.grad(cost=cost, wrt=self.a)
def get_entropy_reg(self):
    epsilon = 1e-7
    p = self.activation_h(self.input)
    # clamp p away from 0 and 1 so that log2 stays finite
    p = T.switch(T.eq(p, 0), epsilon, p)
    p = T.switch(T.eq(p, 1), 1 - epsilon, p)
    entropy = -p * T.log2(p) - (1 - p) * T.log2(1 - p)
    return T.mean(entropy)
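# Hedged aside (my own illustration, not from the snippet above): the binary
# entropy term peaks at p = 0.5 and vanishes as p approaches 0 or 1, so using
# its mean as a regularizer pushes activations toward saturated values.
import numpy as np

p = np.array([0.01, 0.25, 0.5, 0.75, 0.99])
print(-p * np.log2(p) - (1 - p) * np.log2(1 - p))
# -> roughly [0.08, 0.81, 1.00, 0.81, 0.08] bits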
def quantized_bprop(self, cost):
    """
    bprop equals:
    (active_prime) *elem_multiply* error_signal_in * (rep of previous layer)
    (rep of previous layer) is recorded as self.x during the fprop() process.
    Here we quantize (rep of previous layer) and leave the rest as it is.
    """
    # the lower power-of-two exponent
    index_low = T.switch(self.x > 0.,
                         T.floor(T.log2(self.x)), T.floor(T.log2(-self.x)))
    sign = T.switch(self.x > 0., 1., -1.)
    # index_up = index_low + 1  # the upper exponent, not used explicitly.
    p_up = sign * self.x / 2**(index_low) - 1  # probability of rounding up.
    srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
    index_random = index_low + srng.binomial(
        n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
    quantized_rep = sign * 2**index_random

    # Note: this hand-written backprop does not account for the gradients
    # introduced by batch normalization; with BN enabled it is only approximate.
    # error = self.activation_prime(self.z) * error_signal_in
    error = T.grad(cost=cost, wrt=self.z)

    self.dEdW = T.dot(quantized_rep.T, error)
    self.dEdb = T.grad(cost=cost, wrt=self.b)
    if self.BN == True:
        self.dEda = T.grad(cost=cost, wrt=self.a)
def get_output(self, input_, label, mask):
    """
    This function overrides the parent's one.
    Computes the loss from the model prediction and the real label.

    Parameters
    ----------
    input_: TensorVariable
        an array of (batch size, prediction). For the accuracy task,
        "input_" is a 2D matrix.
    label: TensorVariable
        an array of (batch size, answer), or (batch size,) if label is a
        list of class labels. For the word-perplexity case only the second
        form is currently supported; label should be an integer.
    mask: TensorVariable
        an array of (batch size,) containing only 0 and 1. Losses are
        summed or averaged only over the positions marked with 1.

    Returns
    -------
    TensorVariable
        a symbolic scalar tensor variable.
    """
    if mask is None:
        return T.pow(
            2, -T.mean(T.log2(input_[T.arange(label.shape[0]), label])))
    else:
        return T.pow(
            2, -T.sum(T.log2(input_[T.arange(label.shape[0]), label]) * mask)
            / T.sum(mask))
def quantized_bprop(self, cost):
    """
    bprop equals:
    (active_prime) *elem_multiply* error_signal_in * (rep of previous layer)
    (rep of previous layer) is recorded as self.x during the fprop() process.
    Here we quantize (rep of previous layer) and leave the rest as it is.
    """
    # the lower power-of-two exponent
    index_low = T.switch(self.x > 0.,
                         T.floor(T.log2(self.x)), T.floor(T.log2(-self.x)))
    index_low = T.clip(index_low, -4, 3)
    sign = T.switch(self.x > 0., 1., -1.)
    # index_up = index_low + 1  # the upper exponent, not used explicitly.
    p_up = sign * self.x / 2**(index_low) - 1  # probability of rounding up.
    srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
    index_random = index_low + srng.binomial(
        n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
    quantized_rep = sign * 2**index_random

    # Note: this hand-written backprop does not account for the gradients
    # introduced by batch normalization; with BN enabled it is only approximate.
    # error = self.activation_prime(self.z) * error_signal_in
    error = T.grad(cost=cost, wrt=self.z)

    self.dEdW = T.dot(quantized_rep.T, error)
    # self.dEdW = T.dot(self.x.T, error)
    self.dEdb = T.grad(cost=cost, wrt=self.b)
    if self.BN == True:
        self.dEda = T.grad(cost=cost, wrt=self.a)
def quantize_weights(W, srng=None, bitlimit=None, deterministic=False):
    """
    Exponential quantization.

    :param W: weights
    :param srng: random number generator
    :param bitlimit: limit values to a power-of-two range, e.g. for values
        in 2^-22 to 2^9 set it to [-22, 9]
    :param deterministic: deterministic rounding
    :return: quantized weights
    """
    bitlimit = [-22, 9]  # hardcoded for experiments
    if srng is None:
        rng = np.random.RandomState(666)
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(rng.randint(999999))

    if bitlimit:
        index_low = T.clip(
            T.switch(W > 0., T.floor(T.log2(W)), T.floor(T.log2(-W))),
            bitlimit[0], bitlimit[1])
    else:
        index_low = T.switch(W > 0., T.floor(T.log2(W)), T.floor(T.log2(-W)))
    sign = T.switch(W > 0., 1., -1.)
    p_up = sign * W / 2**(index_low) - 1  # probability of rounding up.
    if deterministic:
        index_deterministic = index_low + T.switch(p_up > 0.5, 1, 0)
        quantized_W = sign * 2**index_deterministic
    else:
        index_random = index_low + srng.binomial(
            n=1, p=p_up, size=T.shape(W), dtype=theano.config.floatX)
        quantized_W = sign * 2**index_random
    return quantized_W
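# Hedged illustration (not part of the original sources; the function name
# quantize_weights_np is mine): the same stochastic power-of-two rounding as
# quantize_weights above, written with plain NumPy so the arithmetic is easy
# to check by hand.
import numpy as np

def quantize_weights_np(w, rng=None, deterministic=False):
    """Round each weight to a signed power of two, stochastically or not."""
    rng = np.random.RandomState(666) if rng is None else rng
    sign = np.where(w > 0., 1., -1.)
    index_low = np.floor(np.log2(np.abs(w)))   # lower exponent
    p_up = np.abs(w) / 2.0 ** index_low - 1.0  # probability of rounding up, in [0, 1)
    if deterministic:
        index = index_low + (p_up > 0.5)
    else:
        index = index_low + (rng.uniform(size=w.shape) < p_up)
    return sign * 2.0 ** index

# E.g. w = 0.3 lies between 2**-2 = 0.25 and 2**-1 = 0.5, so it is rounded up
# to 0.5 with probability 0.3/0.25 - 1 = 0.2 and down to 0.25 otherwise.
print(quantize_weights_np(np.array([0.3, -1.7, 5.0])))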
def _compileTheanoFunctions(self):
    """This method compiles all Theano functions."""
    print("Start compiling Theano training function...")
    D = T.tensor4('data')
    updates = self._updateWeightsOnMinibatch(D, self.cd_k)
    self.theano_trainingFct = theano.function(
        [D], None, updates=updates, name='train_CRBM')

    # compute mean free energy
    mfe_ = self._meanFreeEnergy(D)
    # compute number of motif hits
    [_, H] = self._computeHgivenV(D)
    # H = self.bottomUpProbability(self.bottomUpActivity(D))
    nmh_ = T.mean(H)  # mean over samples (K x 1 x N_h)

    # compute norm of the motif parameters
    twn_ = T.sqrt(T.mean(self.motifs**2))
    # compute information content
    pwm = self._softmax(self.motifs)
    entropy = -pwm * T.log2(pwm)
    entropy = T.sum(entropy, axis=2)  # sum over letters
    ic_ = T.log2(self.motifs.shape[2]) - \
        T.mean(entropy)  # log is possible information due to length of sequence
    medic_ = T.log2(self.motifs.shape[2]) - \
        T.mean(T.sort(entropy, axis=2)[:, :, entropy.shape[2] // 2])
    self.theano_evaluateData = theano.function(
        [D], [mfe_, nmh_], name='evaluationData')

    W = T.tensor4("W")
    self.theano_evaluateParams = theano.function(
        [], [twn_, ic_, medic_], givens={W: self.motifs},
        name='evaluationParams')

    fed = self._freeEnergyForData(D)
    self.theano_freeEnergy = theano.function([D], fed, name='fe_per_datapoint')

    fed = self._freeEnergyPerMotif(D)
    self.theano_fePerMotif = theano.function([D], fed, name='fe_per_motif')

    if self.doublestranded:
        self.theano_getHitProbs = theano.function(
            [D], self._bottomUpProbability(self._bottomUpActivity(D)))
    else:
        self.theano_getHitProbs = theano.function(
            [D],
            # self.bottomUpProbability(T.maximum(self.bottomUpActivity(D),
            self._bottomUpProbability(
                self._bottomUpActivity(D) +
                self._bottomUpActivity(D, True)))
    print("Compilation of Theano training function finished")
def kl_divergence(rho, rho_cap):
    """KL-style sparsity penalty between the target activation rho and the
    observed mean activation rho_cap.

    :rho: target activation
    :rho_cap: observed mean activation
    :returns: summed penalty
    """
    kl = T.sum(rho * T.log2(rho / rho_cap) +
               (1.5 - rho) * T.log2((1.5 - rho) / (1.5 - rho_cap)))
    return kl
def OneStep(alpha, b):
    # minimize over alpha
    alpha_new = (T.abs_(b * D * W).sum() / T.abs_(b * D).sum()).astype('float32')
    # minimize over b
    tmp_new = T.clip(W / alpha_new, -1., 1.)
    b_new = T.switch(
        T.ge(tmp_new, pow(2, -n)),
        T.pow(2, round3(T.log2(tmp_new) - 0.0849625)),
        T.switch(
            T.le(tmp_new, -pow(2, -n)),
            -T.pow(2, round3(T.log2(-tmp_new) - 0.0849625)),
            0.))
    b_new = T.switch(
        T.ge(b_new, pow(2, -(n - 1))), b_new,
        T.switch(
            T.le(b_new, -pow(2, -(n - 1))), b_new,
            T.sgn(b_new) * pow(2, -(n - 1))))
    delta = T.abs_(alpha_new - alpha)
    condition = T.lt(delta, 1e-6)
    return [alpha_new, b_new], theano.scan_module.until(condition)
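# Hedged aside (my reading of the magic constant above, not stated in the
# source; round3 is presumably the author's rounding helper, ordinary round()
# is used here just for illustration): 0.0849625 ~= log2(3) - 1.5, so rounding
# log2(x) - 0.0849625 to the nearest integer switches exponents at
# x = 1.5 * 2**k, the arithmetic midpoint between neighbouring powers of two,
# rather than at the geometric midpoint sqrt(2) * 2**k.
import math

print(math.log2(3) - 1.5)        # 0.0849625007...
for x in (1.49, 1.51, 2.9, 3.1):
    k = round(math.log2(x) - 0.0849625)
    print(x, '->', 2.0 ** k)
# 1.49 -> 1.0, 1.51 -> 2.0, 2.9 -> 2.0, 3.1 -> 4.0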
def compile_entropy_fun(self):
    p = self.v_samples
    h = -p * T.log2(p) - (1 - p) * T.log2(1 - p)
    # 0 * log2(0) produces nan; define it as 0
    h = T.switch(T.isnan(h), 0., h)
    if self.eval_mask is not None:
        eval_units = np.sum(self.eval_mask)
    else:
        eval_units = np.prod(self.clamp_mask.shape) - np.sum(self.clamp_mask)
    entropy = T.sum(h) / (self.n_samples * eval_units)
    self.entropy_fun = theano.function([], entropy)
def hingesig(y_true, y_pred):
    """Computes the hinge loss for a sigmoidal output by applying the logit
    to y_pred.

    Note: this function is intended for THEANO.

    Arguments:
        y_true -- a theano tensor holding the true labels
        y_pred -- a theano tensor holding the raw predictions,
                  i.e. the sigmoid output

    Returns:
        theano tensor with the hinge loss
    """
    transform_y_true = T.switch(T.eq(y_true, 0), -1, y_true)
    compl_y_pred = T.clip(T.sub(1., y_pred), 1e-20, 1)
    y_pred = T.clip(y_pred, 1e-20, 1)
    logit = (T.log2(y_pred) - T.log2(compl_y_pred))
    return T.mean(T.maximum(1. - transform_y_true * logit, 0.), axis=-1)
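# Hedged sketch (my own illustration): the log2 difference above is the logit
# of y_pred up to the constant factor 1/ln(2), so the hinge is effectively
# taken on the pre-sigmoid score rather than on the probability itself.
import numpy as np

y_pred = np.array([0.1, 0.5, 0.9])
logit2 = np.log2(y_pred) - np.log2(1 - y_pred)
print(logit2)                                      # [-3.17, 0.0, 3.17]
print(np.log(y_pred / (1 - y_pred)) / np.log(2))   # identical values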
def __init__(self, sen_vec):
    # sen_vec is a shared variable of shape (vec_dim, sentence_length)
    self.sen_vec = assert_op(sen_vec, sen_vec.shape[0] == 50)
    n_layers = T.log2(sen_vec.shape[1])
    self.n_in = sen_vec.shape[1]
    self.num_layers = assert_op(
        n_layers, n_layers.get_value() - int(n_layers.get_value()) == 0)
    self.params = []
def accumCost(pred, xW, m, c_sum, ppl_sum):
    pred = tensor.nnet.softmax(pred)
    c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) * m)
    ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
    return c_sum, ppl_sum
def __init__(self, sen_vec, n_in):
    self.sen_vec = assert_shape_op(sen_vec, (vec_dims, n_in))
    n_layers = T.log2(sen_vec.shape[1])
    self.num_layers = T.cast(
        assert_op(n_layers, T.eq(n_layers, T.cast(n_layers, 'int32'))),
        'int32')
    self.n_in = T.constant(n_in, name='n_in', dtype='int32')
    self.params = []
    self.output = self.tree_output()
def sym_entropy(self, S, mapping):
    """Defines the symbolic calculation of the soft entropy."""
    if self.distance == 'euclidean':
        distances = euclidean_distance(S, self.C)
    else:
        distances = cosine_distance(S, self.C)
    # Calculates the fuzzy membership vector for each histogram S
    Q = T.nnet.softmax(-distances / self.m)
    # Q, scan_u = theano.map(fn=self.sym_get_similarity, sequences=[S])
    Nk = T.sum(Q, axis=0)
    H = T.dot(mapping.T, Q)
    P = H / Nk
    entropy_per_cluster = P * T.log2(P)
    # define 0 * log2(0) = 0 instead of nan
    entropy_per_cluster = T.switch(
        T.isnan(entropy_per_cluster), 0, entropy_per_cluster)
    entropy_per_cluster = entropy_per_cluster.sum(axis=0)
    Rk = Nk / Nk.sum()
    E = -(entropy_per_cluster * Rk).sum()
    return T.squeeze(E)
def forward_batch_step(x_t, H_mask, H_tm1):
    H = TT.dot(W_rec, H_tm1) + W_in[:, x_t]
    H_t = TT.nnet.sigmoid(H)
    Y_t = TT.nnet.softmax(TT.transpose(TT.dot(W_out, H_t)))
    Y_t = -TT.log2(Y_t)
    Y_t = TT.dot(TT.transpose(Y_t), TT.diag(H_mask))
    return [H_t, Y_t]
def get_insilico_knockout_tensor_op(lisa_prediction, precompute, coef,
                                    original_median=None):
    """Use Theano tensor operations to speed up the computation and return a
    compiled theano.function.

    lisa_prediction: numpy array
    precompute:      numpy array
    coef:            pandas DataFrame
    """
    x = T.imatrix('E')  # each motif tensor
    precomp = theano.shared(precompute.astype(theano.config.floatX),
                            name='precompute')
    r = theano.shared(lisa_prediction.astype(theano.config.floatX),
                      name='Lisa RP')
    c = theano.shared(coef.iloc[:, 0].values.astype(theano.config.floatX),
                      name='coefficients')
    m = theano.shared(original_median.astype(theano.config.floatX),
                      name='original_rp_median')

    # sample x (gene1_bin1, gene1_bin2, ..., gene2_bin1, gene2_bin2, ...)
    y = T.extra_ops.repeat(x, precompute.shape[0], axis=0)
    tensor_del = y * precomp  # sample x (gene, bin)
    tensor_del = T.reshape(tensor_del, (c.shape[0], r.shape[0], 200))  # sample x gene x bin
    tensor_del = T.transpose(T.sum(tensor_del, axis=2), (1, 0)) + T.constant(1)  # one motif
    ## tensor_del_med = T.mean(tensor_del, axis=0)  # one motif
    ## log_tensor_del = T.log2(tensor_del) - T.log2(tensor_del_med)
    log_tensor_del = T.log2(tensor_del) - m  # original median already takes log2
    tensor_delta = r - T.dot(log_tensor_del, c)

    mode = theano.Mode(linker='cvm', optimizer='fast_run')
    theano.config.exception_verbosity = 'high'
    # theano.config.openmp = True
    theano_delta_rp = theano.function([x], tensor_delta, mode=mode)
    return theano_delta_rp
def __call__(self, loss):
    attention = self.layer.get_attention() + 0.000001
    entropy = -T.sum(T.log2(attention) * attention, axis=1)
    entropy = T.mean(entropy)
    loss += self.w * entropy
    return loss
def quantized_bprop(self, cost):
    index_low = T.switch(self.varin > 0.,
                         T.floor(T.log2(self.varin)),
                         T.floor(T.log2(-self.varin)))
    index_low = T.clip(index_low, -4, 3)
    sign = T.switch(self.varin > 0., 1., -1.)
    # the upper 2**(integer power), though not used explicitly:
    # index_up = index_low + 1
    # probability of rounding up to the upper index.
    p_up = sign * self.varin / 2**(index_low) - 1
    index_random = index_low + self.srng.binomial(
        n=1, p=p_up, size=T.shape(self.varin), dtype=theano.config.floatX)
    quantized_rep = sign * 2**index_random

    error = T.grad(cost=cost, wrt=self.varfanin)
    self.dEdW = T.dot(quantized_rep.T, error)
def cost(self, Y, Y_hat):
    zeros = tensor.eq(Y, 0)
    ones = tensor.eq(Y, 1)
    probs = zeros * Y_hat + ones * (1 - Y_hat)
    result, _ = theano.scan(
        fn=lambda vec: -tensor.sum(tensor.log2(vec.nonzero_values())),
        outputs_info=None,
        sequences=probs)
    return result.mean()
def step(input_, label):
    if self.use_bias:
        result = T.dot(input_, self.W) + self.b
    else:
        result = T.dot(input_, self.W)
    result = T.nnet.softmax(result)
    cross_entropy = T.nnet.categorical_crossentropy(
        T.clip(result, 1e-7, 1.0 - 1e-7), label)  # (batch_size,)
    perplexity = -T.log2(result[T.arange(self.batch_size), label])  # (batch_size,)
    return cross_entropy, perplexity
def gpu_searchsorted_scan(P, X):
    # floor(log2(len(P))) + 1 halvings shrink the interval [0, len(P) - 1]
    # down to a single index
    N = T.cast(T.floor(T.log2(P.shape[0])) + 1, 'int64')
    (_, B), _ = theano.scan(
        gpu_searchsorted_step,
        outputs_info=[T.zeros_like(X, dtype='int64'),
                      T.ones_like(X, dtype='int64') * (P.shape[0] - 1)],
        non_sequences=[X, P],
        n_steps=N,
        allow_gc=True)
    return B[-1]
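# Hedged aside (my own check, not from the snippet above): the step count
# floor(log2(L)) + 1 is exactly the number of bisection steps needed to halve
# an interval of length L down to a single position.
import math

for length in (1, 2, 7, 8, 1024):
    print(length, math.floor(math.log2(length)) + 1)
# 1 -> 1, 2 -> 2, 7 -> 3, 8 -> 4, 1024 -> 11 steps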
def R2_RNN_block(tparams, inputs, prefix=None, name='r2_rnn', std=True):
    prefix = GetPrefix(prefix, name)
    n_steps = inputs.shape[0]
    n_samples = inputs.shape[1]
    x_size = inputs.shape[2]

    r_steps = T.ceil(T.log2(n_steps)).astype('uint32')
    r_steps = T.arange(r_steps)
    # r_steps = r_steps.reshape([r_steps.shape[0], 1])

    def _step_inner(index, num, inps):
        index = index * 2
        index_ = T.minimum(index + 2, num)
        h = RNN_layer(tparams, inps[index:index_, :, :],
                      prefix=prefix, name=None, std=False)
        return h[-1, :, :]

    def _step(r_step, num, inps, std=True):
        n = num
        steps = T.arange((n + 1) / 2)
        # steps = steps.reshape([steps.shape[0], 1])
        out, updates = theano.scan(
            lambda index, num, inps: _step_inner(index, num, inps),
            sequences=[steps],
            outputs_info=None,
            non_sequences=[num, inps],
            name=_p(prefix, 'inner_scan'),
            n_steps=steps.shape[0],
            profile=False)
        # if std: out = standardize(out)
        num = out.shape[0]
        h = T.zeros_like(inps)
        h = T.set_subtensor(h[:num], out)
        return num, h
        # return out

    if std:
        inputs = standardize(inputs)
    out, updates = theano.reduce(
        lambda r_step, num, inps: _step(r_step, num, inps),
        sequences=r_steps,
        outputs_info=[inputs.shape[0], inputs],
        # non_sequences=inputs,
        name=_p(prefix, 'scan'))
    return out[1][:out[0]]
def deep_learn(X, y, layer, _iter_num, _alpha, _decay):
    """Fine-tune a stack of pretrained autoencoders with a softmax output.

    :X: training data
    :y: training labels (one-hot)
    :layer: hidden layer sizes
    :_iter_num: number of gradient-descent iterations
    :_alpha: learning rate
    :_decay: weight-decay coefficient
    :returns: trained parameters
    """
    init_params = stack_aes(X, y, layer)
    t_X, t_y, t_z = T.dmatrix(), T.dmatrix(), T.dmatrix()
    t_m, t_weight_decay, t_b = T.dscalar(), T.dscalar(), T.dscalar()
    t_params = reduce(add, [[T.dmatrix(), T.dvector()]
                            for i in range(len(init_params) / 2)])
    t_z = t_X
    for i in range(0, len(t_params) - 2, 2):
        t_z = T.nnet.sigmoid(T.dot(t_z, t_params[i]) + t_params[i + 1])
    t_z = T.dot(t_z, t_params[-2]) + t_params[-1]
    J = (-1.0 / t_m) \
        * T.sum(T.log2(T.exp(T.sum(t_z * t_y, 1)) / T.sum(T.exp(t_z), 1))) \
        + (t_weight_decay / (2.0 * t_m)) * \
        T.sum(reduce(add, [T.sum(param ** 2.0) for param in t_params]))
    formula = theano.function([t_X, t_y] + t_params + [t_weight_decay, t_m],
                              [J] + [T.grad(J, param) for param in t_params])

    def cost_func_sm(params):
        # build and exec the call "formula(X, y, params[0], ..., _decay, m)"
        call = ('tmp = formula(X, y, ' +
                ''.join(['params[%d], ' % index
                         for index in range(len(params))]) +
                '_decay, X.shape[0])')
        scope = {'formula': formula, 'X': X, 'y': y, 'params': params,
                 '_decay': _decay, 'tmp': None}
        exec compile(call, '', 'exec') in scope
        J, grads = scope['tmp'][0], scope['tmp'][1:]
        return J, grads

    start_time = time()
    finale = gradient_descent(cost_func_sm, init_params, _iter_num, _alpha)
    print 'Training time of sparse linear decoder: %f minutes.' \
        % ((time() - start_time) / 60.0)
    print 'The accuracy of dl: %f %% (threshold used)' \
        % (assess(y, be_onefold(predict_dl(X, finale), 1), 1))
    print 'The accuracy of dl: %f %% (abs used)' \
        % (assess(y, be_onefold(predict_dl(X, finale), 1), 2))
    return finale
def softmax_classify(X, y, _iter_num, _alpha, _decay, _beta, _rho):
    """Train a softmax classifier with gradient descent.

    :X: training data
    :y: training labels (one-hot)
    :_iter_num: number of gradient-descent iterations
    :_alpha: learning rate
    :_decay: weight-decay coefficient
    :returns: trained weights and bias
    """
    input_n, output_n = X.shape[1], y.shape[1]
    in_out_degree = [input_n, output_n]
    init_params = initial_params(in_out_degree)
    t_X, t_y = T.dmatrix(), T.dmatrix()
    t_weight_decay, t_m = T.dscalar(), T.dscalar()
    t_theta, t_b = T.dmatrix(), T.dvector()
    z = T.dot(t_X, t_theta) + t_b
    J = (-1.0 / t_m) \
        * T.sum(T.log2(T.exp(T.sum(z * t_y, 1)) / T.sum(T.exp(z), 1))) \
        + (t_weight_decay / (2.0 * t_m)) * T.sum(t_theta ** 2.0)
    formula = theano.function([t_X, t_y, t_theta, t_b, t_weight_decay, t_m],
                              [J, T.grad(J, t_theta), T.grad(J, t_b)])

    def cost_func_sm(params):
        result = formula(X, y, params[0], params[1], _decay, X.shape[0])
        J, grads = result[0], result[1:]
        return J, grads

    start_time = time()
    finale = gradient_descent(cost_func_sm, init_params, _iter_num, _alpha)
    print 'Training time of sparse linear decoder: %f minutes.' \
        % ((time() - start_time) / 60.0)
    print 'The accuracy of sm: %f %% (threshold used)' \
        % (assess(y, be_onefold(predict_sm(X, finale), 1), 1))
    print 'The accuracy of sm: %f %% (abs used)' \
        % (assess(y, be_onefold(predict_sm(X, finale), 1), 2))
    # sio.savemat('./param/sm_weight_bias', {
    #     'weight': params[0], 'bias': params[1]
    # })
    return finale[:2]
def cost(X, P):
    # X: batch_size x time
    eps = 1e-3
    X = X.T  # time x batch_size
    char_prob_dist = lang_model(X[:-1])  # time x batch_size x output_size
    char_prob_dist = (1 - 2 * eps) * char_prob_dist + eps
    label_prob = char_prob_dist[
        T.arange(X.shape[0] - 1).dimshuffle(0, 'x'),
        T.arange(X.shape[1]).dimshuffle('x', 0),
        X[1:]
    ]  # time x batch_size
    cross_entropy = -T.sum(T.log(label_prob), axis=0)
    # per-character perplexity, reported for monitoring only
    display_cost = 2**(-T.mean(T.log2(label_prob), axis=0))
    l2 = sum(T.sum(p**2) for p in P.values())
    cost = cross_entropy
    if l2_coefficient > 0:
        cost += l2_coefficient * l2
    return cost, display_cost
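# Hedged numeric check (my own example, not from the snippet above): the
# display_cost value is 2 raised to the mean negative log2 probability that
# the model assigns to the correct symbols, i.e. the per-character perplexity.
import numpy as np

label_prob = np.array([0.5, 0.25, 0.125])       # p(correct char) at 3 steps
print(2 ** (-np.mean(np.log2(label_prob))))     # 2**((1 + 2 + 3) / 3) = 4.0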
def _sym_entropy(self, S):
    """Defines the symbolic calculation of the soft entropy."""
    distances = symbolic_distance_matrix(S, self.C)
    # Calculates the fuzzy membership vector for each histogram S
    Q = T.nnet.softmax(-distances / self.m)
    Nk = T.sum(Q, axis=0)
    H = T.dot(self.mapping.T, Q)
    P = H / Nk
    entropy_per_cluster = P * T.log2(P)
    # define 0 * log2(0) = 0 instead of nan
    entropy_per_cluster = T.switch(
        T.isnan(entropy_per_cluster), 0, entropy_per_cluster)
    entropy_per_cluster = entropy_per_cluster.sum(axis=0)
    Rk = Nk / Nk.sum()
    E = -(entropy_per_cluster * Rk).sum()
    return T.squeeze(E)
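# Hedged note (my own NumPy check, assuming the usual entropy convention):
# the T.switch(T.isnan(...)) trick above implements 0 * log2(0) = 0, which
# NumPy/Theano otherwise evaluate to nan.
import numpy as np

P = np.array([0.0, 0.5, 0.5])
with np.errstate(divide='ignore', invalid='ignore'):
    term = P * np.log2(P)
term = np.where(np.isnan(term), 0.0, term)
print(-term.sum())   # 1.0 bit of entropy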
def test_lstm(): # load wiki data X_train_np, X_valid_np, X_test_np = gen_data_wiki() batchsize = 100 blocklength = 25000 #450000 bsize_test = batchsize numframe = 100 numframe_test = 1250 #2500#5000 X_valid = onehot(X_valid_np).reshape(bsize_test, X_valid_np.shape[0] / bsize_test, 205) X_test = onehot(X_test_np).reshape(bsize_test, X_test_np.shape[0] / bsize_test, 205) nb_classes = 205 X_train_shared = theano.shared(np.zeros( (batchsize, blocklength, nb_classes)).astype('float32'), name='train_set', borrow=True) X_valid_shared = theano.shared(np.zeros( (bsize_test, numframe_test, nb_classes)).astype('float32'), name='valid_set', borrow=True) X_test_shared = theano.shared(np.zeros( (bsize_test, numframe_test, nb_classes)).astype('float32'), name='test_set', borrow=True) # build the model from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave from keras.layers.core import Dense, Activation, TimeDistributedDense from keras.initializations import normal, identity x = T.tensor3() y = T.matrix() name_init = 'uniform' n_h = 2450 L1 = LSTMgrave(output_dim=n_h, init='uniform', batchsize=batchsize, inner_init='uniform', input_shape=(None, nb_classes), return_sequences=True) name_model = 'lstm_shallowgrave_' + str( n_h) + name_init + '0.01' + '_batchsize' + str( batchsize) + '_numframe' + str(numframe) # RNN name_act = 'tanh' name_init = 'uniform' #n_h=2048;L1 = RNN_shallow(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_tanh" + str(n_h) + "_"+name_act+ name_init + '0.1' #n_h = 2048;L1 = SimpleRNN(output_dim = n_h, init = 'uniform', inner_init = 'uniform', activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_shallow"+str(n_h)+name_act+ name_init + '0.05' #n_h = 4096;L1 = RNN_utanh(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_utanh_2_0_0" + str(n_h) + "_"+name_act+ name_init +'0.01' n_h = 2048 in_act = 'tanh' L1 = LSTM_uugrave(output_dim=n_h, batchsize=batchsize, init='uniform', inner_init='uniform', input_shape=(None, nb_classes), return_sequences=True) name_model = 'lstm_u_grave' + in_act + '_1.0_1.0_1.0_0' + str( n_h) + name_init + '0.01' + '_batchsize' + str( batchsize) + '_numframe' + str(numframe) #n_h = 1200; in_act = 'tanh';L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform', input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_stack2'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01' #n_h = 700; L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L3 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L4 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L5 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 
'uniform',input_shape = (None, n_h), return_sequences=True); name_model= '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' D1 = TimeDistributedDense(nb_classes) D1._input_shape = [None, None, n_h] O = Activation('softmax') #layers = [L1, L2, L3, L4, L5, D1, O] layers = [L1, D1, O] #layers = [L1, L2, D1, O] load_model = True if load_model: #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb') f_model = open( '/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl', 'rb') layers = pickle.load(f_model) f_model.close() name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6' #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy' L1 = layers[0] out = x params = [] for l in layers: if not load_model: l.build() l.input = out params += l.params if l == L1: out = l.get_output()[0] h0 = l.get_output()[0] c0 = l.get_output()[1] else: out = l.get_output() # compute the loss loss = -T.mean(T.log(out)[:, :numframe - 1, :] * x[:, 1:, :]) logperp_valid = T.mean( -T.log2(T.sum(out[:, :numframe_test - 1, :] * x[:, 1:, :], axis=2))) logperp_train = T.mean( -T.log2(T.sum(out[:, :numframe - 1, :] * x[:, 1:, :], axis=2))) # set optimizer from keras.constraints import identity as ident from keras.optimizers import RMSprop, SGD, Adam lr_ = 2 * 1e-6 clipnorm_ = 10000 rmsprop = RMSprop(lr=lr_, clipnrom=clipnorm_) sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_) adam = Adam(lr=lr_) #opt = sgd; name_opt = 'sgd'+str(lr_); clip_flag = False #opt = rmsprop; name_opt = 'rmsprop'+str(lr_) opt = adam name_opt = 'adam' + str(lr_) clip_flag = False if clip_flag: name_opt = name_opt + '_clip' + str(clipnorm_) #param update for regular parameters constraints = [ident() for p in params] updates = opt.get_updates(params, constraints, loss) index = T.iscalar() f_train = theano.function( [index], [loss, h0, c0], updates=updates, givens={ x: X_train_shared[:, index * numframe:(index + 1) * numframe, :] }) # perplexity function f_perp_valid = theano.function([], [logperp_valid, h0, c0], givens={x: X_valid_shared}) f_perp_test = theano.function([], [logperp_valid, h0, c0], givens={x: X_test_shared}) #f_perp_valid = theano.function([index], [logperp_valid], givens={x:X_valid_shared[index*bsize_test : (index+1)*bsize_test]}) #f_perp_test = theano.function([index], [logperp_valid], givens={x:X_test_shared[index*bsize_test : (index+1)*bsize_test]}) def perp_valid(): logperp_acc = 0 n = 0 L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for k in xrange(X_valid.shape[1] / numframe_test): X_valid_shared.set_value(X_valid[:, k * numframe_test:(k + 1) * numframe_test, :]) 
perp, h0, c0 = f_perp_valid() logperp_acc += perp L1.H0.set_value(h0[:, -1, :]) L1.C0.set_value(c0[:, -1, :]) n += 1 return (logperp_acc / n) def perp_test(): logperp_acc = 0 n = 0 L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for k in xrange(X_test.shape[1] / numframe_test): X_test_shared.set_value(X_test[:, k * numframe_test:(k + 1) * numframe_test, :]) perp, h0, c0 = f_perp_test() logperp_acc += perp L1.H0.set_value(h0[:, -1, :]) L1.C0.set_value(c0[:, -1, :]) n += 1 return (logperp_acc / n) #def perp_valid(): # logperp_acc = 0 # n = 0 # for k in xrange(X_valid_np.shape[0]/(bsize_test*numframe_test)): # X_valid_shared.set_value(onehot(X_valid_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205))) # for i in xrange(X_valid_shared.get_value().shape[0]/bsize_test): # logperp_acc += f_perp_valid(i) # n += 1 # return (logperp_acc/n) #def perp_test(): # logperp_acc = 0 # n = 0 # for k in xrange(X_test_np.shape[0]/(bsize_test*numframe_test)): # X_test_shared.set_value(onehot(X_test_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205))) # for i in xrange(X_test_shared.get_value().shape[0]/bsize_test): # logperp_acc += f_perp_test(i) # n += 1 # return (logperp_acc/n) ######## testmodel ######## #test_score = perp_valid() #pdb.set_trace() epoch_ = 9000 perpmat = np.zeros((epoch_, 3)) t_start = time.time() name = 'wiki100' + name_model + '_' + name_opt if load_model: name = name_model_load #perpmat = np.load(name_perpmat_load) #only_block = False #if only_block: # name = name + 'random_only_block' #else: # name = name + 'random_per_row_in_block' name = name + 'inorder' blocksize = batchsize * blocklength bestscore = 100000000 for epoch in xrange(epoch_): for k in xrange(X_train_np.shape[0] / blocksize): t_s = time.time() print "reloading " + str(k) + " th train patch..." #if only_block: # pos = np.random.randint(0, X_train_np.shape[0]-blocksize) # X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205)) #else: # pos = np.random.randint(0, X_train_np.shape[0]-blocklength, batchsize) # tmp = np.zeros((batchsize, blocklength, 205)).astype('float32') # for j in xrange(batchsize): # tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength]) # X_train_shared.set_value(tmp) X_train_shared.set_value( onehot(X_train_np[k * blocksize:(k + 1) * blocksize]).reshape( batchsize, blocklength, 205)) print "reloading finished, time cost: " + str(time.time() - t_s) L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for i in xrange(blocklength / numframe): loss, h0, c0 = f_train(i) L1.H0.set_value(h0[:, -1, :]) L1.C0.set_value(c0[:, -1, :]) if i % 10 == 0: t_end = time.time() print "Time consumed: " + str(t_end - t_start) + " secs." t_start = time.time() print "Epoch " + str( epoch ) + " " + name + ": The training loss in batch " + str( k * (blocklength / numframe) + i) + " is: " + str(loss) + "." 
if k % 6 == 0: #save results m = epoch * X_train_np.shape[0] / (blocksize * 6) + k / 6 perpmat[m][0], perpmat[m][1] = 0, perp_valid() perpmat[m][2] = perp_test() np.save( '/data/lisatmp4/zhangsa/rnn_trans/results/' + name + '_withtest.npy', perpmat) #save model if perpmat[m][1] < bestscore: bestscore = perpmat[m][1] f_model = open( '/data/lisatmp4/zhangsa/rnn_trans/models/' + name + '_withtest.pkl', 'wb+') pickle.dump(layers, f_model) f_model.close() print "Epoch "+ str(epoch)+ " " + name + ": The training perp is: " + str(perpmat[epoch][0]) \ + ", test perp is: " + str(perpmat[epoch][1]) + "."
Y = tt.matrix("Y")
sigma = tt.vector("sigma")

# squared pairwise distances between the rows of A
pdist2 = lambda A: ((A[:, np.newaxis, :] - A[np.newaxis, :, :])**2).sum(2)
pdist_X = pdist2(X)
pdist_Y = pdist2(Y)

P_c = tt.exp(-pdist_X / (2 * sigma**2)) / \
    tt.exp(-pdist_X / (2 * sigma**2)).sum(1)[:, np.newaxis]
P = (P_c + P_c.T) / (2 * N)
Q = tt.exp(-pdist_Y) / tt.exp(-pdist_Y).sum()
KL = tt.where(abs(P) > 1e-8, P * tt.log(P / Q), 0).sum(1)
C = KL.sum()
LogPerplexity = -tt.where(abs(P_c) > 1e-8, P_c * tt.log2(P_c), 0).sum(1)
PerplexityCost = 0.5 * ((LogPerplexity - np.log2(perplexity_target))**2).sum()

#### Sigma
s0 = np.ones(N, np.float32)
I = tt.iscalar("I")
prp = theano.function([sigma, I], LogPerplexity[I], allow_input_downcast=True)

print("Init PC:", PerplexityCost.eval({sigma: s0}))
# solve for each sigma_i with brentq so that the conditional entropy matches
# log2(perplexity_target)
for i in range(N):
    f = lambda s: (prp(s * np.ones(N), i) - np.log2(perplexity_target))
    s0[i] = brentq(f, 1e-6, 10, rtol=1e-8)
print("Final PC:", PerplexityCost.eval({sigma: s0}))

#### Y
f_g = theano.function([Y, sigma], [C, tt.grad(C, Y)])
def __init__(self, nh, nw):
    """
    nh :: dimension of the hidden layer
    nw :: vocabulary size
    """
    # parameters of the model
    self.index = theano.shared(name='index',
                               value=numpy.eye(nw, dtype=theano.config.floatX))
    self.wxg = theano.shared(name='wxg', value=0.02 * numpy.random.randn(nw, nh).astype(theano.config.floatX))
    self.whg = theano.shared(name='whg', value=0.02 * numpy.random.randn(nh, nh).astype(theano.config.floatX))
    self.wxi = theano.shared(name='wxi', value=0.02 * numpy.random.randn(nw, nh).astype(theano.config.floatX))
    self.whi = theano.shared(name='whi', value=0.02 * numpy.random.randn(nh, nh).astype(theano.config.floatX))
    self.wxf = theano.shared(name='wxf', value=0.02 * numpy.random.randn(nw, nh).astype(theano.config.floatX))
    self.whf = theano.shared(name='whf', value=0.02 * numpy.random.randn(nh, nh).astype(theano.config.floatX))
    self.wxo = theano.shared(name='wxo', value=0.02 * numpy.random.randn(nw, nh).astype(theano.config.floatX))
    self.who = theano.shared(name='who', value=0.02 * numpy.random.randn(nh, nh).astype(theano.config.floatX))
    self.w = theano.shared(name='w', value=0.02 * numpy.random.randn(nh, nw).astype(theano.config.floatX))
    self.bg = theano.shared(name='bg', value=numpy.zeros(nh, dtype=theano.config.floatX))
    self.bi = theano.shared(name='bi', value=numpy.zeros(nh, dtype=theano.config.floatX))
    self.bf = theano.shared(name='bf', value=numpy.zeros(nh, dtype=theano.config.floatX))
    self.bo = theano.shared(name='bo', value=numpy.zeros(nh, dtype=theano.config.floatX))
    self.b = theano.shared(name='b', value=numpy.zeros(nw, dtype=theano.config.floatX))
    self.h0 = theano.shared(name='h0', value=numpy.zeros(nh, dtype=theano.config.floatX))
    self.c0 = theano.shared(name='c0', value=numpy.zeros(nh, dtype=theano.config.floatX))

    # bundle
    self.params = [self.wxg, self.whg, self.wxi, self.whi, self.wxf, self.whf,
                   self.wxo, self.who, self.w, self.bg, self.bi, self.bf,
                   self.bo, self.b, self.h0, self.c0]

    idxs = T.ivector()
    x = self.index[idxs]
    y_sentence = T.ivector('y_sentence')  # labels

    def recurrence(x_t, c_tm1, h_tm1):
        i_t = T.nnet.sigmoid(T.dot(x_t, self.wxi) + T.dot(h_tm1, self.whi) + self.bi)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.wxf) + T.dot(h_tm1, self.whf) + self.bf)
        o_t = T.nnet.sigmoid(T.dot(x_t, self.wxo) + T.dot(h_tm1, self.who) + self.bo)
        g_t = T.tanh(T.dot(x_t, self.wxg) + T.dot(h_tm1, self.whg) + self.bg)
        c_t = f_t * c_tm1 + i_t * g_t
        h_t = o_t * T.tanh(c_t)
        s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
        return [c_t, h_t, s_t]

    [c, h, s], _ = theano.scan(fn=recurrence,
                               sequences=x,
                               outputs_info=[self.c0, self.h0, None],
                               n_steps=x.shape[0],
                               truncate_gradient=-1)

    p_y_given_x_sentence = s[:, 0, :]
    y_pred = T.argmax(p_y_given_x_sentence, axis=1)

    # cost and gradients and learning rate
    lr = T.scalar('lr')
    sentence_nll = -T.mean(
        T.log2(p_y_given_x_sentence)[T.arange(x.shape[0]), y_sentence])
    sentence_gradients = [T.grad(sentence_nll, param) for param in self.params]
    sentence_updates = [(param, param - lr * g)
                        for param, g in zip(self.params, sentence_gradients)]

    # perplexity of a sentence
    sentence_ppl = T.pow(2, sentence_nll)

    # theano functions to compile
    self.classify = theano.function(inputs=[idxs], outputs=y_pred,
                                    allow_input_downcast=True)
    self.prob_dist = theano.function(inputs=[idxs],
                                     outputs=p_y_given_x_sentence,
                                     allow_input_downcast=True)
    self.ppl = theano.function(inputs=[idxs, y_sentence],
                               outputs=sentence_ppl,
                               allow_input_downcast=True)
    self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                          outputs=sentence_nll,
                                          updates=sentence_updates,
                                          allow_input_downcast=True)
def word_cost(probs, Y):
    lbl_probs = probs[T.arange(Y.shape[0]), Y]
    return -T.sum(T.log(lbl_probs)), -T.mean(T.log2(lbl_probs))
def get(self):
    y = T.clip(self.result_tensor, EPSILON, 1.0 - EPSILON)
    y = y.reshape((-1, y.shape[-1]))
    k = self.index_tensor.reshape((-1,))
    return -T.mean(T.log2(y[T.arange(k.shape[0]), k]))
def negative_log_likelihood(self, y):
    loss = -T.mean(T.log2(self.p_y_given_x)[T.arange(y.shape[0]), y])
    return loss
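# Hedged illustration (my own example, with made-up numbers): the fancy
# indexing above picks, for every row, the probability assigned to that row's
# true class before averaging the negative log2 values.
import numpy as np

p_y_given_x = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.8, 0.1]])
y = np.array([0, 1])
picked = np.log2(p_y_given_x)[np.arange(y.shape[0]), y]
print(-picked.mean())   # mean of -log2(0.7) and -log2(0.8), roughly 0.418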
import theano.tensor as T
from theano.tensor import shared_randomstreams
import numpy as np
import numpy.random
from scipy.special import gammaincinv
from numpy.linalg import norm

# tensor stand-in for np.random.RandomState
rngT = shared_randomstreams.RandomStreams()
rng = numpy.random.RandomState()

# {{{ Fastfood Params }}}
n, d = T.dscalars('n', 'd')
# transform dimensions to be a power of 2
d0, n0 = d, n
l = T.ceil(T.log2(d))  # TODO cast to int
d = 2**l
k = T.ceil(n / d)  # TODO cast to int
n = d * k

# generate parameter 'matrices'
B = rng.choice([-1, 1], size=(k, d))
G = rng.normal(size=(k, d)).astype(np.float64)
PI = np.array([rng.permutation(d) for _ in xrange(k)]).T
S = np.empty((k * d, 1), dtype=np.float64)

# generate scaling matrix, S
for i in xrange(k):
    for j in xrange(d):
        p1 = rng.uniform(size=d)
        p2 = d / 2
        Tmp = gammaincinv(p2, p1)
        Tmp = T.sqrt(2 * Tmp)
#train_features = theano.shared(train_features_numpy, name='train_set', borrow=True)
valid_features = theano.shared(valid_features_numpy, name='valid_set',
                               borrow=True)

model = srnnnobias_scan.SRNN(name="aoeu", numvis=framelen * alphabetsize,
                             numhid=512, numframes=50, cheating_level=1.0,
                             output_type="softmax", numpy_rng=numpy_rng,
                             theano_rng=theano_rng)

ppw = 2 ** T.mean(  # first mean NLL over each time step prediction, then mean over the whole batch
    -T.log2(  # apply log_2
        T.sum(  # summing over the 3rd dimension, which has 27 elements
            (model._prediction_for_training * model._input_frames[1:]),
            axis=2)))

#train_perplexity = theano.function([], ppw, givens={model.inputs:train_features})
valid_perplexity = theano.function([], ppw,
                                   givens={model.inputs: valid_features})

#model.monitor = model.normalizefilters
# TRAIN MODEL
trainer = graddescent_rewrite.SGD_Trainer(model, train_features_numpy,
                                          batchsize=32, learningrate=0.1,
                                          loadsize=10000,
                                          gradient_clip_threshold=1.0)

print "BEFORE_TRAINING: valid perplexity: %f" % (valid_perplexity())
print 'training...'
for epoch in xrange(100):
def negative_log_likelihood(self, y):
    """
    compute negative log-likelihood of target words y
    explicitly normalize predicted scores
    """
    return -T.mean(T.log2(self.p_w_given_h)[y, T.arange(y.shape[0])])
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = np.asarray(x_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.int32)
x_test = np.asarray(x_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.int32)

x_train = x_train.reshape((x_train.shape[0], x_train.shape[1] ** 2))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1] ** 2))
x_train /= 255
x_test /= 255

X = T.matrix()
Y = T.matrix()
w = theano.shared(np.zeros((28 ** 2, 10), dtype=theano.config.floatX))

log_reg = T.nnet.sigmoid(T.dot(X, w))
# per-class binary cross-entropy, measured in bits (hence log2)
cost = T.mean(-Y * T.log2(log_reg) - (1 - Y) * T.log2(1 - log_reg))
gradient = T.grad(cost=cost, wrt=w)
updates = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                        allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=log_reg)

calc_accuracy()
raw_input('Press enter')

errors = []
steps = 200
for i in xrange(0, steps):
    e = train(x_train, to_output(y_train))
    print('{} Error: {}'.format(i, e))
    errors.append(e)

print('Final mult: {}'.format(w.get_value()))
plt.plot(np.asarray(range(0, steps)), np.asarray(errors))
plt.show()
def log2(x):
    return T.log2(x)
def __init__(self, nh, nw): """ nh :: dimension of the hidden layer nw :: vocabulary size """ # parameters of the model self.index = theano.shared(name='index', value=numpy.eye(nw, dtype=theano.config.floatX)) # parameters of the first LSTM self.wxg_1 = theano.shared(name='wxg_1', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whg_1 = theano.shared(name='whg_1', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxi_1 = theano.shared(name='wxi_1', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whi_1 = theano.shared(name='whi_1', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxf_1 = theano.shared(name='wxf_1', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whf_1 = theano.shared(name='whf_1', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxo_1 = theano.shared(name='wxo_1', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.who_1 = theano.shared(name='who_1', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.bg_1 = theano.shared(name='bg_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bi_1 = theano.shared(name='bi_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bf_1 = theano.shared(name='bf_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bo_1 = theano.shared(name='bo_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.h0_1 = theano.shared(name='h0_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.c0_1 = theano.shared(name='c0_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.wxg_acc_1 = theano.shared(name='wxg_acc_1', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whg_acc_1 = theano.shared(name='whg_acc_1', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxi_acc_1 = theano.shared(name='wxi_acc_1', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whi_acc_1 = theano.shared(name='whi_acc_1', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxf_acc_1 = theano.shared(name='wxf_acc_1', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whf_acc_1 = theano.shared(name='whf_acc_1', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxo_acc_1 = theano.shared(name='wxo_acc_1', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.who_acc_1 = theano.shared(name='who_acc_1', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.bg_acc_1 = theano.shared(name='bg_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bi_acc_1 = theano.shared(name='bi_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bf_acc_1 = theano.shared(name='bf_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bo_acc_1 = theano.shared(name='bo_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.h0_acc_1 = theano.shared(name='h0_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.c0_acc_1 = theano.shared(name='c0_acc_1', value=numpy.zeros(nh, dtype=theano.config.floatX)) # parameters of the second LSTM self.wxg_2 = theano.shared(name='wxg_2', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whg_2 = theano.shared(name='whg_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wh_1g_2 = theano.shared(name='wh_1g_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxi_2 = theano.shared(name='wxi_2', value=0.02 * 
numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whi_2 = theano.shared(name='whi_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wh_1i_2 = theano.shared(name='wh_1i_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxf_2 = theano.shared(name='wxf_2', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.whf_2 = theano.shared(name='whf_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wh_1f_2 = theano.shared(name='wh_1f_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wxo_2 = theano.shared(name='wxo_2', value=0.02 * numpy.random.randn(nw, nh) .astype(theano.config.floatX)) self.who_2 = theano.shared(name='who_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.wh_1o_2 = theano.shared(name='wh_1o_2', value=0.02 * numpy.random.randn(nh, nh) .astype(theano.config.floatX)) self.w_2 = theano.shared(name='w_2', value=0.02 * numpy.random.randn(nh, nw) .astype(theano.config.floatX)) self.bg_2 = theano.shared(name='bg_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bi_2 = theano.shared(name='bi_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bf_2 = theano.shared(name='bf_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bo_2 = theano.shared(name='bo_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.b_2 = theano.shared(name='b_2', value=numpy.zeros(nw, dtype=theano.config.floatX)) self.wxg_acc_2 = theano.shared(name='wxg_acc_2', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whg_acc_2 = theano.shared(name='whg_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wh_1g_acc_2 = theano.shared(name='wh_1g_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxi_acc_2 = theano.shared(name='wxi_acc_2', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whi_acc_2 = theano.shared(name='whi_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wh_1i_acc_2 = theano.shared(name='wh_1i_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxf_acc_2 = theano.shared(name='wxf_acc_2', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.whf_acc_2 = theano.shared(name='whf_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wh_1f_acc_2 = theano.shared(name='wh_1f_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wxo_acc_2 = theano.shared(name='wxo_acc_2', value=numpy.zeros((nw, nh), dtype=theano.config.floatX)) self.who_acc_2 = theano.shared(name='who_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.wh_1o_acc_2 = theano.shared(name='wh_1o_acc_2', value=numpy.zeros((nh, nh), dtype=theano.config.floatX)) self.w_acc_2 = theano.shared(name='w_acc_2', value=numpy.zeros((nh, nw), dtype=theano.config.floatX)) self.bg_acc_2 = theano.shared(name='bg_acc_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bi_acc_2 = theano.shared(name='bi_acc_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bf_acc_2 = theano.shared(name='bf_acc_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.bo_acc_2 = theano.shared(name='bo_acc_2', value=numpy.zeros(nh, dtype=theano.config.floatX)) self.b_acc_2 = theano.shared(name='b_acc_2', value=numpy.zeros(nw, dtype=theano.config.floatX)) #bundle self.params = [self.wxg_1, self.whg_1, self.wxi_1, self.whi_1, self.wxf_1, self.whf_1, self.wxo_1, self.who_1, self.bg_1, self.bi_1, self.bf_1, 
self.bo_1, self.h0_1, self.c0_1, self.wxg_2, self.whg_2, self.wh_1g_2, self.wxi_2, self.whi_2, self.wh_1i_2, self.wxf_2, self.whf_2, self.wh_1f_2, self.wxo_2, self.who_2, self.wh_1o_2, self.w_2, self.bg_2, self.bi_2, self.bf_2, self.bo_2, self.b_2] self.params_acc = [self.wxg_acc_1, self.whg_acc_1, self.wxi_acc_1, self.whi_acc_1, self.wxf_acc_1, self.whf_acc_1, self.wxo_acc_1, self.who_acc_1, self.bg_acc_1, self.bi_acc_1, self.bf_acc_1, self.bo_acc_1, self.h0_acc_1, self.c0_acc_1, self.wxg_acc_2, self.whg_acc_2, self.wh_1g_acc_2, self.wxi_acc_2, self.whi_acc_2, self.wh_1i_acc_2, self.wxf_acc_2, self.whf_acc_2, self.wh_1f_acc_2, self.wxo_acc_2, self.who_acc_2, self.wh_1o_acc_2, self.w_acc_2, self.bg_acc_2, self.bi_acc_2, self.bf_acc_2, self.bo_acc_2, self.b_acc_2] idxs = T.ivector() x = self.index[idxs] idxs_r = T.ivector() x_r = self.index[idxs_r] idxs_s = T.ivector() x_s = self.index[idxs_s] y_sentence = T.ivector('y_sentence') # labels def recurrence_1(x_t, c_tm1, h_tm1): i_t = T.nnet.sigmoid(T.dot(x_t, self.wxi_1) + T.dot(h_tm1, self.whi_1) + self.bi_1) f_t = T.nnet.sigmoid(T.dot(x_t, self.wxf_1) + T.dot(h_tm1, self.whf_1) + self.bf_1) o_t = T.nnet.sigmoid(T.dot(x_t, self.wxo_1) + T.dot(h_tm1, self.who_1) + self.bo_1) g_t = T.tanh(T.dot(x_t, self.wxg_1) + T.dot(h_tm1, self.whg_1) + self.bg_1) c_t = f_t * c_tm1 + i_t * g_t h_t = o_t * T.tanh(c_t) return [c_t, h_t] def recurrence_2(x_t, c_tm1, h_tm1, h_1): i_t = T.nnet.sigmoid(T.dot(x_t, self.wxi_2) + T.dot(h_tm1, self.whi_2) + T.dot(h_1, self.wh_1i_2) + self.bi_2) f_t = T.nnet.sigmoid(T.dot(x_t, self.wxf_2) + T.dot(h_tm1, self.whf_2) + T.dot(h_1, self.wh_1f_2) + self.bf_2) o_t = T.nnet.sigmoid(T.dot(x_t, self.wxo_2) + T.dot(h_tm1, self.who_2) + T.dot(h_1, self.wh_1o_2) + self.bo_2) g_t = T.tanh(T.dot(x_t, self.wxg_2) + T.dot(h_tm1, self.whg_2) + T.dot(h_1, self.wh_1g_2) + self.bg_2) c_t = f_t * c_tm1 + i_t * g_t h_t = o_t * T.tanh(c_t) s_t = T.nnet.softmax(T.dot(h_t, self.w_2) + self.b_2) return [c_t, h_t, s_t] [c_1, h_1], _ = theano.scan(fn=recurrence_1, sequences=x_r, outputs_info=[self.c0_1, self.h0_1], n_steps=x_r.shape[0], truncate_gradient=-1) c_1_last = c_1[-1] h_1_last = h_1[-1] [c_2, h_2, s_2], _ = theano.scan(fn=recurrence_2, sequences=x, non_sequences=[h_1_last], outputs_info=[T.zeros_like(c_1_last), T.zeros_like(h_1_last), None], n_steps=x.shape[0], truncate_gradient=-1) [c_3, h_3, s_3], _ = theano.scan(fn=recurrence_2, sequences=x_s, non_sequences=[h_1_last], outputs_info=[T.zeros_like(c_1_last), T.zeros_like(h_1_last), None], n_steps=x_s.shape[0], truncate_gradient=-1) p_y_given_x_sentence = s_2[:, 0, :] p_y_given_x_sentence2 = s_3[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) # cost and gradients and learning rate lr = T.scalar('lr') sentence_nll = -T.mean(T.log2(p_y_given_x_sentence) [T.arange(x.shape[0]), y_sentence]) sentence_gradients = [T.grad(sentence_nll, param) for param in self.params] #Adagrad sentence_updates = [] for param_i, grad_i, acc_i in zip(self.params, sentence_gradients, self.params_acc): acc = acc_i + T.sqr(grad_i) sentence_updates.append((param_i, param_i - lr*grad_i/(T.sqrt(acc)+1e-5))) sentence_updates.append((acc_i, acc)) # SGD #sentence_updates = [(param, param - lr*g) for param,g in zip(self.params, sentence_gradients)] # theano functions to compile #self.classify = theano.function(inputs=[idxs, idxs_r], outputs=y_pred, allow_input_downcast=True) #self.prob_dist = theano.function(inputs=[idxs, idxs_r], outputs=p_y_given_x_sentence, allow_input_downcast=True) self.prob_dist2 = 
theano.function(inputs=[idxs_r, idxs_s], outputs=p_y_given_x_sentence2, allow_input_downcast=True) self.nll = theano.function(inputs=[idxs, idxs_r, y_sentence], outputs=sentence_nll, allow_input_downcast=True) self.sentence_train = theano.function(inputs=[idxs, idxs_r, y_sentence, lr], outputs=sentence_nll, updates=sentence_updates, allow_input_downcast=True) self.sent_vec = theano.function(inputs=[idxs_r], outputs=h_1[-1], allow_input_downcast=True)
def unnormalized_neg_log_likelihood(self, y, c=1.0):
    """
    compute unnormalized log-likelihood of target words y
    """
    return -T.mean(T.log2(self.s * T.exp(c))[y, T.arange(y.shape[0])])
def build_model(self, tparams, options):
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)
    emb_rev = tensor.set_subtensor(
        embW_rev[mask[::-1, :].argmax(axis=0) - 1,
                 tensor.arange(n_samples), :], embImg[0, :, :])

    # This is the implementation of input dropout !!
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng,
                            options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng,
                                    options['drop_prob_aux'], shp=xAuxEmb.shape)

    # This implements the core lstm
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps, :, :],
                                         xAuxEmb, use_noise, options,
                                         prefix='lstm', sched_prob_mask=[])

    # This implements the core reverse lstm
    rev_rval, rev_updatesLSTM = basic_lstm_layer(
        tparams, emb_rev[:n_timesteps, :, :], xAuxEmb, use_noise, options,
        prefix='rev_lstm', sched_prob_mask=[])

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    if options['use_dropout']:
        # XXX: Size given to dropout is missing one dimension. This keeps the
        # dropped units consistent across time!? Is this a good bug?
        p = dropout_layer(
            sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size']),
            use_noise, trng, options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
        rev_p = dropout_layer(
            sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size']),
            use_noise, trng, options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
    else:
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])
        rev_p = sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

    n_out_samps = (n_timesteps - 2) * n_samples
    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :],
                         tparams['Wd']) + tparams['bd']).reshape(
                             [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:-1, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                   axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = pWSft[tensor.arange(n_out_samps),
                        ixtoclsinfo_t[xW[1:, :].flatten(), 3]] * \
            pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]

    # XXX: THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
    probs_valid = tensor.log(totProb + 1e-10) * mask[1:-1, :].flatten()
    tot_cost = -(probs_valid.sum())
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:-1, :].flatten()).sum()
    cost = [tot_cost / options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)
    if options.get('sched_sampling_mode', None) != None:
        inp_list.append(curr_epoch)

    per_sent_prob = probs_valid.reshape([n_timesteps - 2, n_samples]).sum(axis=0)
    f_per_sentLogP = theano.function(inp_list, per_sent_prob,
                                     name='f_pred_logprob',
                                     updates=updatesLSTM)
    f_pred_prob = ['', f_per_sentLogP, '']

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
valid_features = theano.shared(valid_features_numpy, name='valid_set', borrow=True) model = srnnnobias_scan.SRNN(name="aoeu", numvis=framelen * alphabetsize, numhid=512, numframes=50, cheating_level=1.0, output_type="softmax", numpy_rng=numpy_rng, theano_rng=theano_rng) ppw = 2**T.mean( # first mean NLL over each time step prediction, then mean over the whole batch -T.log2( # apply log_2 T.sum( # summing over the 3rd dimension, which has 27 elements (model._prediction_for_training * model._input_frames[1:]), axis=2))) #train_perplexity = theano.function([], ppw, givens={model.inputs:train_features}) valid_perplexity = theano.function([], ppw, givens={model.inputs: valid_features}) #model.monitor = model.normalizefilters # TRAIN MODEL trainer = graddescent_rewrite.SGD_Trainer(model, train_features_numpy, batchsize=32, learningrate=0.1, loadsize=10000, gradient_clip_threshold=1.0)
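A minimal standalone sketch of the one-hot trick in the expression above: multiplying the predicted distribution by the one-hot target frames and summing over the class axis picks out the probability assigned to the true symbol, so ppw is just 2 raised to the mean number of bits per prediction. Variable names here are illustrative, not from the model.

import numpy as np
import theano
import theano.tensor as T

pred = T.tensor3('pred')      # (time, batch, classes), last axis sums to 1
target = T.tensor3('target')  # one-hot encoded ground truth, same shape
p_true = T.sum(pred * target, axis=2)   # probability of the correct symbol at each step
ppw = 2 ** T.mean(-T.log2(p_true))      # perplexity per symbol
f = theano.function([pred, target], ppw)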
def test_lstm(): # load wiki data X_train_np, X_valid_np, X_test_np = gen_data_wiki() batchsize = 100 blocklength = 25000 #450000 bsize_test = batchsize numframe = 100 numframe_test = 1250#2500#5000 X_valid = onehot(X_valid_np).reshape(bsize_test, X_valid_np.shape[0]/bsize_test, 205) X_test = onehot(X_test_np).reshape(bsize_test, X_test_np.shape[0]/bsize_test, 205) nb_classes= 205 X_train_shared = theano.shared(np.zeros((batchsize,blocklength, nb_classes)).astype('float32'), name = 'train_set', borrow=True) X_valid_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'valid_set', borrow=True) X_test_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'test_set', borrow=True) # build the model from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave from keras.layers.core import Dense, Activation, TimeDistributedDense from keras.initializations import normal, identity x = T.tensor3() y = T.matrix() name_init = 'uniform' n_h = 2450; L1 = LSTMgrave(output_dim = n_h, init = 'uniform', batchsize = batchsize, inner_init = 'uniform',input_shape = (None, nb_classes), return_sequences=True); name_model= 'lstm_shallowgrave_' + str(n_h) + name_init + '0.01'+ '_batchsize' + str(batchsize) + '_numframe' + str(numframe) # RNN name_act = 'tanh'; name_init = 'uniform' #n_h=2048;L1 = RNN_shallow(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_tanh" + str(n_h) + "_"+name_act+ name_init + '0.1' #n_h = 2048;L1 = SimpleRNN(output_dim = n_h, init = 'uniform', inner_init = 'uniform', activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_shallow"+str(n_h)+name_act+ name_init + '0.05' #n_h = 4096;L1 = RNN_utanh(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_utanh_2_0_0" + str(n_h) + "_"+name_act+ name_init +'0.01' n_h = 2048; in_act = 'tanh';L1 = LSTM_uugrave(output_dim = n_h, batchsize = batchsize, init = 'uniform', inner_init = 'uniform', input_shape = (None, nb_classes), return_sequences=True); name_model= 'lstm_u_grave'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01' + '_batchsize' + str(batchsize) + '_numframe' + str(numframe) #n_h = 1200; in_act = 'tanh';L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform', input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_stack2'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01' #n_h = 700; L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L3 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L4 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' #n_h = 700; L5 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 
'uniform',input_shape = (None, n_h), return_sequences=True); name_model= '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03' D1 = TimeDistributedDense(nb_classes);D1._input_shape = [None, None, n_h] O = Activation('softmax') #layers = [L1, L2, L3, L4, L5, D1, O] layers = [L1, D1, O] #layers = [L1, L2, D1, O] load_model = True if load_model: #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb') #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb') f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl', 'rb') layers = pickle.load(f_model) f_model.close() name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6' #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy' L1 = layers[0] out = x params = [] for l in layers: if not load_model: l.build() l.input = out params += l.params if l == L1: out = l.get_output()[0] h0 = l.get_output()[0] c0 = l.get_output()[1] else: out = l.get_output() # compute the loss loss = -T.mean(T.log(out)[:,:numframe-1,:] *x[:,1:,:]) logperp_valid = T.mean(-T.log2(T.sum(out[:,:numframe_test-1,:]*x[:,1:,:],axis=2))) logperp_train = T.mean(-T.log2(T.sum(out[:,:numframe-1,:]*x[:,1:,:],axis=2))) # set optimizer from keras.constraints import identity as ident from keras.optimizers import RMSprop, SGD, Adam lr_ = 2*1e-6 clipnorm_ = 10000 rmsprop = RMSprop(lr=lr_, clipnrom = clipnorm_) sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_) adam = Adam(lr=lr_) #opt = sgd; name_opt = 'sgd'+str(lr_); clip_flag = False #opt = rmsprop; name_opt = 'rmsprop'+str(lr_) opt = adam; name_opt = 'adam' + str(lr_); clip_flag = False if clip_flag: name_opt = name_opt + '_clip'+str(clipnorm_) #param update for regular parameters constraints = [ident() for p in params] updates = opt.get_updates(params, constraints, loss) index = T.iscalar() f_train = theano.function([index], [loss, h0, c0], updates = updates, givens={x:X_train_shared[:,index*numframe : (index+1)*numframe, :]}) # perplexity function f_perp_valid = theano.function([], [logperp_valid, h0, c0], givens={x:X_valid_shared}) f_perp_test = theano.function([], [logperp_valid, h0, c0], givens={x:X_test_shared}) #f_perp_valid = theano.function([index], [logperp_valid], givens={x:X_valid_shared[index*bsize_test : (index+1)*bsize_test]}) #f_perp_test = theano.function([index], [logperp_valid], givens={x:X_test_shared[index*bsize_test : (index+1)*bsize_test]}) def perp_valid(): logperp_acc = 0 n = 0 L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for k in xrange(X_valid.shape[1]/numframe_test): X_valid_shared.set_value(X_valid[:, k*numframe_test:(k+1)*numframe_test, :]) perp, h0, c0 = f_perp_valid() logperp_acc += 
perp L1.H0.set_value(h0[:,-1,:]) L1.C0.set_value(c0[:,-1,:]) n += 1 return (logperp_acc/n) def perp_test(): logperp_acc = 0 n = 0 L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for k in xrange(X_test.shape[1]/numframe_test): X_test_shared.set_value(X_test[:, k*numframe_test:(k+1)*numframe_test, :]) perp, h0, c0 = f_perp_test() logperp_acc += perp L1.H0.set_value(h0[:,-1,:]) L1.C0.set_value(c0[:,-1,:]) n += 1 return (logperp_acc/n) #def perp_valid(): # logperp_acc = 0 # n = 0 # for k in xrange(X_valid_np.shape[0]/(bsize_test*numframe_test)): # X_valid_shared.set_value(onehot(X_valid_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205))) # for i in xrange(X_valid_shared.get_value().shape[0]/bsize_test): # logperp_acc += f_perp_valid(i) # n += 1 # return (logperp_acc/n) #def perp_test(): # logperp_acc = 0 # n = 0 # for k in xrange(X_test_np.shape[0]/(bsize_test*numframe_test)): # X_test_shared.set_value(onehot(X_test_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205))) # for i in xrange(X_test_shared.get_value().shape[0]/bsize_test): # logperp_acc += f_perp_test(i) # n += 1 # return (logperp_acc/n) ######## testmodel ######## #test_score = perp_valid() #pdb.set_trace() epoch_ = 9000 perpmat = np.zeros((epoch_, 3)) t_start = time.time() name = 'wiki100'+ name_model + '_' + name_opt if load_model: name = name_model_load #perpmat = np.load(name_perpmat_load) #only_block = False #if only_block: # name = name + 'random_only_block' #else: # name = name + 'random_per_row_in_block' name = name+'inorder' blocksize = batchsize*blocklength bestscore = 100000000 for epoch in xrange(epoch_): for k in xrange(X_train_np.shape[0]/blocksize): t_s = time.time() print "reloading " + str(k) + " th train patch..." #if only_block: # pos = np.random.randint(0, X_train_np.shape[0]-blocksize) # X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205)) #else: # pos = np.random.randint(0, X_train_np.shape[0]-blocklength, batchsize) # tmp = np.zeros((batchsize, blocklength, 205)).astype('float32') # for j in xrange(batchsize): # tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength]) # X_train_shared.set_value(tmp) X_train_shared.set_value(onehot(X_train_np[k*blocksize: (k+1)*blocksize]).reshape(batchsize, blocklength, 205)) print "reloading finished, time cost: " + str(time.time()-t_s) L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32')) L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32')) for i in xrange(blocklength/numframe): loss, h0, c0 = f_train(i) L1.H0.set_value(h0[:,-1,:]) L1.C0.set_value(c0[:,-1,:]) if i%10 == 0: t_end = time.time() print "Time consumed: " + str(t_end - t_start) + " secs." t_start = time.time() print "Epoch "+ str(epoch)+" " + name + ": The training loss in batch " + str(k*(blocklength/numframe)+i) +" is: " + str(loss) + "." 
if k%6 == 0: #save results m = epoch*X_train_np.shape[0]/(blocksize*6) +k/6 perpmat[m][0], perpmat[m][1] = 0, perp_valid() perpmat[m][2] = perp_test() np.save('/data/lisatmp4/zhangsa/rnn_trans/results/' + name +'_withtest.npy', perpmat) #save model if perpmat[m][1] < bestscore: bestscore = perpmat[m][1] f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/' + name + '_withtest.pkl', 'wb+') pickle.dump(layers, f_model) f_model.close() print "Epoch "+ str(epoch)+ " " + name + ": The valid perp is: " + str(perpmat[m][1]) \ + ", test perp is: " + str(perpmat[m][2]) + "."
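Note that perp_valid() and perp_test() above return the mean of -log2 p per character, i.e. bits-per-character (BPC), not a perplexity; if a perplexity figure is wanted it is simply 2 ** BPC, e.g.:

# illustrative number: a validation BPC of 1.6 bits/char corresponds to
print(2.0 ** 1.6)   # a per-character perplexity of about 3.03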
outputs_info=[H0, None] ) #[h_ts_fb, y_predicted_fb], _ = T.scan(forward_step, # sequences=[x], # outputs_info=[h0, None] # ) logprobs = y_predicted[TT.arange(Y.shape[0]), TT.transpose(Y), TT.reshape(TT.arange(n_minibatches),(n_minibatches,1))] DENOM_th = TT.diag(1/TT.sum(logprobs>0, axis=1).astype('float32')) cross_entropy = TT.sum(TT.dot(DENOM_th,logprobs)) / n_minibatches #cross_entropy = -TT.mean(TT.log2(TT.nonzero_values(y_predicted[TT.arange(Y.shape[0]), TT.transpose(Y), TT.reshape(TT.arange(n_minibatches),(n_minibatches,1))]))) #cross_entropy_fb = -TT.mean(TT.log2(y_predicted_fb)[TT.arange(y.shape[0]), y]) cross_entropy_fb = -TT.log2(y_t_fb)[y] params = [W_in, W_rec, W_out] theta_updates = {W_in: W_in_theta_update, W_rec: W_rec_theta_update, W_out: W_out_theta_update} g_params = [] for param in params: g_params.append(TT.grad(cross_entropy, param)) T.pp(TT.grad(cross_entropy, param)) updates = [] for param, grad in zip(params, g_params): theta_update = theta_updates[param] upd = mom * theta_update - lr * grad updates.append((theta_updates[param], upd)) updates.append((param, param + upd,))
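A self-contained sketch of the classical-momentum update pattern used in the loop above (velocity <- mom * velocity - lr * grad, then param <- param + velocity), here with a toy quadratic cost; all names below are illustrative, not taken from the original script.

import numpy as np
import theano
import theano.tensor as TT

floatX = theano.config.floatX
lr, mom = 0.01, 0.9
W = theano.shared(np.ones((3, 3), dtype=floatX), name='W')
vel = theano.shared(np.zeros((3, 3), dtype=floatX), name='vel')  # plays the role of the theta_update buffer
x = TT.matrix('x')
cost = TT.sum(TT.dot(x, W) ** 2)                 # toy cost
grad = TT.grad(cost, W)
new_vel = mom * vel - lr * grad                  # momentum step
updates = [(vel, new_vel), (W, W + new_vel)]     # apply it to the parameter
step = theano.function([x], cost, updates=updates)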
def __init__(self, inp_shape, output_num, training_size, stride=(4, 2), untie_biases=False): # setup shared vars self.state = theano.shared(np.zeros((1, inp_shape[1], inp_shape[2], inp_shape[3]), dtype=theano.config.floatX)) self.training_states = theano.shared(np.zeros((training_size, inp_shape[1], inp_shape[2], inp_shape[3]), dtype=theano.config.floatX)) self.training_actions = theano.shared(np.zeros(training_size, dtype=np.int32)) self.training_rewards = theano.shared(np.zeros(training_size, dtype=theano.config.floatX)) network_dic = create_A3C(inp_shape, output_num, stride=stride, untie_biases=untie_biases) self.l_in = network_dic['l_in'] self.l_hid1 = network_dic['l_hid1'] self.l_hid2 = network_dic['l_hid2'] self.l_hid3 = network_dic['l_hid3'] self.l_policy = network_dic['l_policy'] self.l_value = network_dic['l_value'] # network output vars policy_output = lasagne.layers.get_output(self.l_policy, inputs=self.state) value_output = lasagne.layers.get_output(self.l_value, inputs=self.state) # setup training vars and loss training_policy_output = lasagne.layers.get_output(self.l_policy, inputs=self.training_states) training_value_output = lasagne.layers.get_output(self.l_value, inputs=self.training_states) # log(prediction, action taken) * (R - Value(states)) # one_hot_true = T.zeros_like(training_policy_output) # one_hot_true = T.set_subtensor(one_hot_true[T.arange(self.training_actions.shape[0]), self.training_actions], 1) # rewrite categorical crossentropy here because the lasagne/theano function sums the result and I need per step # categorical_crossentropy = -T.sum(one_hot_true * T.log(training_policy_output), axis=1) entropy = 0.01 * -T.sum(training_policy_output * T.log2(training_policy_output), axis=1) value_diff_rewards = (self.training_rewards - training_value_output[:, 0] + entropy) # sum is to aggregate over the nsteps policy_loss = T.sum(T.log(training_policy_output[:, self.training_actions]) * value_diff_rewards) value_loss = T.sum((self.training_rewards - training_value_output[:, 0])**2) # get layer parms policy_params = lasagne.layers.get_all_params(self.l_policy) value_params = lasagne.layers.get_all_params(self.l_value) params = policy_params + self.l_value.get_params() # get grads policy_grads = T.grad(policy_loss, policy_params) value_grads = T.grad(value_loss, value_params) # combine grads for the non-output layers combine_grads = policy_grads[0:-2] for grad_ind in range(len(value_grads)-2): combine_grads[grad_ind] += value_grads[grad_ind] # add grads for policy and value layers grads = combine_grads + policy_grads[-2:] + value_grads[-2:] # add loss to return in grads list grads.append(policy_loss) grads.append(value_loss) # updates self.w1_update = theano.shared(np.zeros(self.l_hid1.W.eval().shape, dtype=theano.config.floatX)) self.w2_update = theano.shared(np.zeros(self.l_hid2.W.eval().shape, dtype=theano.config.floatX)) if untie_biases: self.b1_update = theano.shared(np.zeros(self.l_hid1.b.eval().shape, dtype=theano.config.floatX)) self.b2_update = theano.shared(np.zeros(self.l_hid2.b.eval().shape, dtype=theano.config.floatX)) else: self.b1_update = theano.shared(np.zeros(self.l_hid1.b.eval().shape, dtype=theano.config.floatX)) self.b2_update = theano.shared(np.zeros(self.l_hid2.b.eval().shape, dtype=theano.config.floatX)) self.w3_update = theano.shared(np.zeros(self.l_hid3.W.eval().shape, dtype=theano.config.floatX)) self.b3_update = theano.shared(np.zeros(self.l_hid3.b.eval().shape, dtype=theano.config.floatX)) self.l_policy_w_update = 
theano.shared(np.zeros(self.l_policy.W.eval().shape, dtype=theano.config.floatX)) self.l_policy_b_update = theano.shared(np.zeros(self.l_policy.b.eval().shape, dtype=theano.config.floatX)) self.l_value_w_update = theano.shared(np.zeros(self.l_value.W.eval().shape, dtype=theano.config.floatX)) self.l_value_b_update = theano.shared(np.zeros(self.l_value.b.eval().shape, dtype=theano.config.floatX)) network_updates = [self.w1_update, self.b1_update, self.w2_update, self.b2_update, self.w3_update, self.b3_update, self.l_policy_w_update, self.l_policy_b_update, self.l_value_w_update, self.l_value_b_update] theano_updates = lasagne.updates.rmsprop(network_updates, params, 0.0001) self._get_policy_output = theano.function([], policy_output) self._get_value_output = theano.function([], value_output) self._gradient_step = theano.function([], updates=theano_updates) self._get_grads = theano.function([], outputs=grads) self.accumulated_grads = None
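A hedged standalone sketch of the entropy bonus computed above: for a policy distribution p over actions, -sum_a p_a * log2 p_a is the per-step entropy (in bits, because of log2); scaling it by a small coefficient (0.01 above) and adding it to the advantage discourages premature collapse to a deterministic policy. The names and numbers below are illustrative.

import numpy as np
import theano
import theano.tensor as T

policy = T.matrix('policy')     # (steps, actions), each row sums to 1
beta = 0.01                     # entropy coefficient, as in the loss above
entropy_bonus = beta * -T.sum(policy * T.log2(policy), axis=1)
f = theano.function([policy], entropy_bonus)
print(f(np.array([[0.25, 0.25, 0.25, 0.25],   # uniform -> 0.01 * 2 bits = 0.02
                  [0.97, 0.01, 0.01, 0.01]],  # peaked  -> close to 0
                 dtype=theano.config.floatX)))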
def norm_entropy(dist): return entropy(dist) / T.log2(dist.shape[-1])
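Assuming entropy(dist) here is the Shannon entropy in bits (-sum p log2 p over the last axis), dividing by T.log2(dist.shape[-1]) rescales it by its maximum possible value, log2 K for a K-way distribution, so the result lies in [0, 1] regardless of the number of classes. A small illustrative check:

import numpy as np
import theano
import theano.tensor as T

dist = T.matrix('dist')
ent = -T.sum(dist * T.log2(dist), axis=-1)   # assumed definition of entropy()
norm_ent = ent / T.log2(dist.shape[-1])
f = theano.function([dist], norm_ent)
print(f(np.array([[0.25, 0.25, 0.25, 0.25]], dtype=theano.config.floatX)))  # uniform -> [1.0]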
import theano import theano.tensor as T """ A bunch of loss definitions for ease """ # zero-one loss (argmax taken over the class axis so it works per example) zero_one_loss = lambda x, y: T.sum(T.neq(T.argmax(x, axis=-1), y)) # log loss or cross-entropy error (in bits, via log2) cross_entropy = lambda x, y: -T.mean(T.log2(x[T.arange(0, y.shape[0]), y])) # mean squared error mse = lambda x, y: T.mean(T.square(x - y))
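A hedged usage sketch for the definitions above: the lambdas build symbolic graphs, so they are typically wrapped in a compiled function. Here x holds per-class probabilities and y integer labels; the data below is made up.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')          # (batch, classes) probabilities
y = T.ivector('y')         # integer class labels
loss_fn = theano.function([x, y], [zero_one_loss(x, y), cross_entropy(x, y)])
probs = np.array([[0.5, 0.25, 0.25],
                  [0.125, 0.75, 0.125]], dtype=theano.config.floatX)
labels = np.array([0, 1], dtype='int32')
print(loss_fn(probs, labels))   # 0 misclassifications; mean of 1 and ~0.415 bits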
def negative_log_likelihood(self, y): # take the logarithm with base 2 return -T.mean(T.log2(self.p_w_given_h)[T.arange(y.shape[0]), y])
def accumCost(pred,xW,m,c_sum,ppl_sum): pred = tensor.nnet.softmax(pred) c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m) ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m) return c_sum, ppl_sum
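A sketch of how an accumulator like accumCost is typically driven by theano.scan: the two running sums enter through outputs_info and the last time-slice of each scan output carries the totals. All names below (preds, xW_ids, mask_t, n_samples) are illustrative and assume a fixed batch size.

import numpy as np
import theano
import theano.tensor as tensor

n_samples = 4                      # assumed fixed batch size
preds = tensor.tensor3('preds')    # (time, batch, vocab) unnormalised scores
xW_ids = tensor.lmatrix('xW_ids')  # (time, batch) target word ids
mask_t = tensor.matrix('mask_t')   # (time, batch), 1 for real tokens, 0 for padding

def accum(pred, xw, m, c_sum, ppl_sum):
    p = tensor.nnet.softmax(pred)
    c_sum = c_sum + tensor.log(p[tensor.arange(n_samples), xw] + 1e-20) * m
    ppl_sum = ppl_sum - tensor.log2(p[tensor.arange(n_samples), xw] + 1e-10) * m
    return c_sum, ppl_sum

(c_sums, ppl_sums), _ = theano.scan(
    accum,
    sequences=[preds, xW_ids, mask_t],
    outputs_info=[tensor.zeros((n_samples,)), tensor.zeros((n_samples,))])
total_logprob, total_bits = c_sums[-1], ppl_sums[-1]  # per-sentence totals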
def kullback_leibler(dist1, dist2): logged = T.log2(dist1 / dist2) # let 0 * log(0) -> 0 #logged = T.set_subtensor(logged[T.eq(dist1, 0).nonzero()], 0) return (dist1 * logged).sum(axis=1)
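A hedged usage note for kullback_leibler above: with log2 the divergence is measured in bits, and, as the commented-out line hints, exact zeros in dist1 make 0 * log2(0) evaluate to nan rather than the conventional 0, so inputs are usually clipped or smoothed first. A small standalone check:

import numpy as np
import theano
import theano.tensor as T

d1 = T.matrix('d1')
d2 = T.matrix('d2')
eps = 1e-7                                  # guard against zeros in either distribution
d1c = T.clip(d1, eps, 1.0)
d2c = T.clip(d2, eps, 1.0)
kl_bits = (d1c * T.log2(d1c / d2c)).sum(axis=1)
f = theano.function([d1, d2], kl_bits)
p = np.array([[0.5, 0.5]], dtype=theano.config.floatX)
q = np.array([[0.25, 0.75]], dtype=theano.config.floatX)
print(f(p, q))   # 0.5*log2(2) + 0.5*log2(2/3) ~= 0.2075 bits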