def __init__(self, rng, input, nhistory, feature, n_feat, n_in, n_out,
             N=4096, W=None, sparse=None, activation=None):
    self.input = input
    if W is None:
        W_values = numpy.asarray(rng.uniform(
            low=-numpy.sqrt(6. / (N * n_in + n_out)),
            high=numpy.sqrt(6. / (N * n_in + n_out)),
            size=(N * n_in, n_out)), dtype=theano.config.floatX)
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4
        W = theano.shared(value=W_values, name='W', borrow=True)
    else:
        W = theano.shared(value=W, name='W', borrow=True)
    self.W = W

    lin_output = T.dot(self.input, self.W)
    for history in nhistory:
        lin_output = T.concatenate((lin_output, T.dot(history, self.W)), axis=1)
    if n_feat == 0:
        self.output = lin_output
    else:
        self.output = T.concatenate((lin_output, feature), axis=1)

    # parameters of the model
    self.params = [self.W]
def create_prediction(self):  # perform one full multi-step prediction pass
    gfs = self.gfs
    pm25in = self.pm25in
    # initial forward pass
    x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2],
                       pm25in[:, 0], pm25in[:, 1], self.cnt[:, :, 0]], axis=1)
    if self.celltype == RNN:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(
                             layer.hidden_size, name="RNN.initial_hidden_state")),
                             x.shape[0], axis=0)
                         if x.ndim > 1
                         else create_shared(layer.hidden_size,
                                            name="RNN.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    if self.celltype == LSTM:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(
                             layer.hidden_size * 2, name="LSTM.initial_hidden_state")),
                             x.shape[0], axis=0)
                         if x.ndim > 1
                         else create_shared(layer.hidden_size * 2,
                                            name="LSTM.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    self.layerstatus = self.model.forward(x, init_hiddens)
    # results.shape? 40*1
    self.results = self.layerstatus[-1]
    if self.steps > 1:
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, 1], gfs[:, 2], gfs[:, 3], pm25in[:, 1],
                           self.results, self.cnt[:, :, 1]], axis=1),
            self.layerstatus)
        self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
        # the remaining steps-2 forward passes
        for i in xrange(2, self.steps):
            self.layerstatus = self.model.forward(
                T.concatenate([gfs[:, i], gfs[:, i + 1], gfs[:, i + 2],
                               T.shape_padright(self.results[:, i - 2]),
                               T.shape_padright(self.results[:, i - 1]),
                               self.cnt[:, :, i]], axis=1),
                self.layerstatus)  # need T.shape_padright???
            self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
    return self.results
def transform(self, inputs):
    '''Transform the inputs for this layer into an output for the layer.

    Parameters
    ----------
    inputs : dict of theano expressions
        Symbolic inputs to this layer, given as a dictionary mapping string
        names to Theano expressions. See :func:`base.Layer.connect`.

    Returns
    -------
    outputs : dict of theano expressions
        Theano expressions representing the output from the layer. This
        layer type produces an "out" output that concatenates the outputs
        from its underlying workers. If present, it also concatenates the
        "pre" and "cell" outputs from the underlying workers. Finally, it
        passes along the individual outputs from its workers using "fw"
        and "bw" prefixes for forward and backward directions.
    updates : list of update pairs
        A list of state updates to apply inside a theano function.
    '''
    fout, fupd = self.forward.transform(inputs)
    bout, bupd = self.backward.transform(inputs)
    outputs = dict(out=TT.concatenate([fout['out'], bout['out']], axis=2))
    if 'pre' in fout:
        outputs['pre'] = TT.concatenate([fout['pre'], bout['pre']], axis=2)
    if 'cell' in fout:
        outputs['cell'] = TT.concatenate([fout['cell'], bout['cell']], axis=2)
    for k, v in fout.items():
        outputs['fw_{}'.format(k)] = v
    for k, v in bout.items():
        outputs['bw_{}'.format(k)] = v
    return outputs, fupd + bupd
def recurrence(sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc,
               c_tm1_dec, mu_z_t, mu_x_tm1, coeff_x_tm1, v):
    # error input
    v_hat = v - T.sum(
        coeff_x_tm1.dimshuffle(0, 'x', 1) *
        (mu_x_tm1 + (T.exp(b_sig_x) * sample_x_t).reshape(
            (batch_size, n_visible * n_gmm))
         ).reshape((batch_size, n_visible, n_gmm)),
        axis=-1)
    r_t = T.concatenate([v, v_hat], axis=1)

    # v_enc = [r_t, h_tm1_dec]
    v_enc = T.concatenate([r_t, h_tm1_dec], axis=1)

    # Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
    i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) +
                             T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
    f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) +
                             T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
    c_t_enc = (f_t_enc * c_tm1_enc) + (i_t_enc * T.tanh(
        T.dot(v_enc, Wvc_enc) + T.dot(h_tm1_enc, Whc_enc) + bc_enc))
    o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) +
                             T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
    h_t_enc = o_t_enc * T.tanh(c_t_enc)

    # Get z_t
    mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z) + b_mu_z
    # sigma_z_t = T.dot(h_t_enc, Wh_enc_sig_z) + b_sig_z
    # sample = theano_rng.normal(size=mew_t.shape, avg=0, std=1, dtype=theano.config.floatX)
    z_t = mu_z_t + (T.exp(b_sig_z) * sample_z_t).reshape((batch_size, n_z))

    # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t)
    i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) +
                             T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
    f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) +
                             T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t, Wzf_dec))
    c_t_dec = (f_t_dec * c_tm1_dec) + (i_t_dec * T.tanh(
        T.dot(z_t, Wzc_dec) + T.dot(h_tm1_dec, Whc_dec) + bc_dec))
    o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) +
                             T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
    h_t_dec = o_t_dec * T.tanh(c_t_dec)

    # Get w_t
    mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
    coeff_x_t = T.nnet.softmax(T.dot(h_t_dec, Wh_dec_coeff_x) + b_coeff_x)
    # sigma_x_t = sigma_x_tm1 + T.dot(h_t_dec, Wh_dec_sigma_x) + b_sig_x

    return [h_t_enc, h_t_dec, c_t_enc, c_t_dec, mu_z_t, mu_x_t, coeff_x_t]
def get_uhs_operator(uhs, depth, n_hidden, rhos):
    """
    :param uhs:
    :param depth:
    :param n_hidden:
    :param rhos: can be shared variable or constant of shape (depth, )!!
    :return:
    """
    # Will use a Fourier matrix (will be O(n^2)...)
    # Doesn't seem to slow things down much though!
    exp_phases = [T.cos(uhs), T.sin(uhs)]
    neg_exp_phases = [T.cos(uhs[:, ::-1]), -T.sin(uhs[:, ::-1])]
    ones_ = [T.ones((depth, 1), dtype=theano.config.floatX),
             T.zeros((depth, 1), dtype=theano.config.floatX)]

    rhos_reshaped = T.reshape(rhos, (depth, 1), ndim=2)
    rhos_reshaped = T.addbroadcast(rhos_reshaped, 1)

    eigvals_re = rhos_reshaped * T.concatenate(
        (ones_[0], exp_phases[0], -ones_[0], neg_exp_phases[0]), axis=1)
    eigvals_im = rhos_reshaped * T.concatenate(
        (ones_[1], exp_phases[1], -ones_[1], neg_exp_phases[1]), axis=1)

    phase_array = -2 * np.pi * np.outer(np.arange(n_hidden), np.arange(n_hidden)) / n_hidden
    f_array_re_val = np.cos(phase_array) / n_hidden
    f_array_im_val = np.sin(phase_array) / n_hidden
    f_array_re = theano.shared(f_array_re_val.astype(theano.config.floatX), name="f_arr_re")
    f_array_im = theano.shared(f_array_im_val.astype(theano.config.floatX), name="f_arr_im")

    a_k = T.dot(eigvals_re, f_array_re) + T.dot(eigvals_im, f_array_im)
    uhs_op = rep_vec(a_k, n_hidden, n_hidden)  # shape (depth, 2 * n_hidden - 1)

    return uhs_op
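# --- Illustration (not part of the original code) ---------------------------
# A minimal numpy sketch of the Fourier identity used in get_uhs_operator:
# for a real circulant operator, the first column a_k is recovered from its
# eigenvalues lam via a_k = Re(lam) @ F_re + Im(lam) @ F_im, with F_re/F_im
# built exactly as f_array_re/f_array_im above.
import numpy as np

n_hidden = 8
phase = -2 * np.pi * np.outer(np.arange(n_hidden), np.arange(n_hidden)) / n_hidden
f_re, f_im = np.cos(phase) / n_hidden, np.sin(phase) / n_hidden

a = np.random.RandomState(0).randn(n_hidden)   # first column of a circulant matrix
lam = np.fft.fft(a)                            # its eigenvalues
a_rec = lam.real.dot(f_re) + lam.imag.dot(f_im)
assert np.allclose(a_rec, a)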
def predict(self, new_data, batch_size, pool_size):
    """
    predict for new data
    """
    img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3])
    conv_out = conv.conv2d(input=new_data, filters=self.W,
                           filter_shape=self.filter_shape, image_shape=img_shape)
    pool_list = []
    if self.non_linear == "tanh":
        conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle("x", 0, "x", "x"))
        # pad_len = int(self.max_window_len/2)
        # right_pad_len = int(self.filter_shape[2]/2)
        # index_shift = pad_len-right_pad_len
        index_shift = int(self.filter_shape[2] / 2)
        for i in xrange(batch_size):
            # partition sentence via pool size
            e1pos = pool_size[i, 0] + index_shift
            e2pos = pool_size[i, 1] + index_shift
            # if T.gt(e1pos, 0):
            #     p1 = conv_out_tanh[i, :, :e1pos, :]
            # else:
            #     p1 = conv_out_tanh[i, :, 0, :]
            p1 = conv_out_tanh[i, :, :e1pos, :]
            p2 = conv_out_tanh[i, :, e1pos:e2pos, :]
            p3 = conv_out_tanh[i, :, e2pos:, :]
            p1_pool_out = T.max(p1, axis=1)
            p2_pool_out = T.max(p2, axis=1)
            p3_pool_out = T.max(p3, axis=1)
            temp = T.concatenate([p1_pool_out, p2_pool_out, p3_pool_out], axis=1)
            pool_list.append(temp.dimshuffle("x", 0, 1))
    else:
        pass
    output = T.concatenate(pool_list, axis=0)
    return output
def output_probabilistic(self, m_w_previous, v_w_previous):
    if (self.non_linear):
        m_in = self.m_w - m_w_previous
        v_in = self.v_w
        # We compute the mean and variance after the exponential activation
        # exp(-lam * x**2), applied to a Gaussian input
        lam = self.lam
        v_1 = 1 + 2 * lam * v_in
        v_1_inv = v_1 ** -1
        s_1 = T.prod(v_1, axis=1) ** -0.5
        v_2 = 1 + 4 * lam * v_in
        v_2_inv = v_2 ** -1
        s_2 = T.prod(v_2, axis=1) ** -0.5
        v_inv = v_in ** -1
        exponent1 = m_in ** 2 * (1 - v_1_inv) * v_inv
        exponent1 = T.sum(exponent1, axis=1)
        exponent2 = m_in ** 2 * (1 - v_2_inv) * v_inv
        exponent2 = T.sum(exponent2, axis=1)
        m_a = s_1 * T.exp(-0.5 * exponent1)
        v_a = s_2 * T.exp(-0.5 * exponent2) - m_a ** 2
        return (m_a, v_a)
    else:
        m_w_previous_with_bias = T.concatenate([m_w_previous, T.alloc(1, 1)], 0)
        v_w_previous_with_bias = T.concatenate([v_w_previous, T.alloc(0, 1)], 0)
        m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
        v_linear = (T.dot(self.v_w, v_w_previous_with_bias) +
                    T.dot(self.m_w ** 2, v_w_previous_with_bias) +
                    T.dot(self.v_w, m_w_previous_with_bias ** 2)) / self.n_inputs
        return (m_linear, v_linear)
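# --- Illustration (not part of the original code) ---------------------------
# The closed form above is the Gaussian moment of an exponential activation:
# for x ~ N(m, v),  E[exp(-lam*x^2)] = (1+2*lam*v)**-0.5 * exp(-lam*m^2/(1+2*lam*v)),
# which matches s_1 * exp(-0.5 * exponent1) per dimension. A Monte-Carlo check:
import numpy as np

m, v, lam = 0.7, 0.4, 1.2
xs = np.random.RandomState(0).normal(m, np.sqrt(v), 10 ** 6)
mc = np.exp(-lam * xs ** 2).mean()
cf = (1 + 2 * lam * v) ** -0.5 * np.exp(-lam * m ** 2 / (1 + 2 * lam * v))
print(mc, cf)   # the two agree to roughly three decimal places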
def _best_path_decode(activations):
    """Calculate the CTC best-path decoding for a given activation sequence.
    In the returned matrix, shorter sequences are padded with -1s."""
    # For each timestep, get the highest output
    decoding = T.argmax(activations, axis=2)

    # prev_outputs[time][example] == decoding[time - 1][example]
    prev_outputs = T.concatenate([T.alloc(_BLANK, 1, decoding.shape[1]),
                                  decoding], axis=0)[:-1]
    # Filter all repetitions to zero (blanks are already zero)
    decoding = decoding * T.neq(decoding, prev_outputs)

    # Calculate how many blanks each sequence has relative to longest sequence
    blank_counts = T.eq(decoding, 0).sum(axis=0)
    min_blank_count = T.min(blank_counts, axis=0)
    max_seq_length = decoding.shape[0] - min_blank_count  # used later
    padding_needed = blank_counts - min_blank_count

    # Generate the padding matrix by ... doing tricky things
    max_padding_needed = T.max(padding_needed, axis=0)
    padding_needed = padding_needed.dimshuffle('x', 0).repeat(max_padding_needed, axis=0)
    padding = T.arange(max_padding_needed).dimshuffle(0, 'x').repeat(decoding.shape[1], axis=1)
    padding = PADDING * T.lt(padding, padding_needed)

    # Apply the padding
    decoding = T.concatenate([decoding, padding], axis=0)

    # Remove zero values
    nonzero_vals = decoding.T.nonzero_values()
    decoding = T.reshape(nonzero_vals, (decoding.shape[1], max_seq_length)).T

    return decoding
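# --- Illustration (not part of the original code) ---------------------------
# A pure-numpy reference of CTC best-path decoding for a single sequence,
# assuming blank index 0 as in the symbolic version above: take the argmax
# per timestep, collapse repeats, then drop blanks.
import numpy as np

def best_path_decode_np(activations, blank=0):
    best = activations.argmax(axis=1)            # (time,)
    return [p for i, p in enumerate(best)
            if p != blank and (i == 0 or p != best[i - 1])]

probs = np.array([[0.6, 0.3, 0.1],    # blank
                  [0.1, 0.8, 0.1],    # label 1
                  [0.2, 0.7, 0.1],    # label 1 again -> collapsed
                  [0.7, 0.2, 0.1],    # blank
                  [0.1, 0.2, 0.7]])   # label 2
print(best_path_decode_np(probs))     # -> [1, 2]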
def _create_maximum_activation_update(output, record, streamindex, topn):
    """
    Calculates update of the topn maximums for one batch of outputs.
    """
    dims, maximums, indices, snapshot = record
    counters = tensor.tile(tensor.shape_padright(
        tensor.arange(output.shape[0]) + streamindex), (1, output.shape[1]))
    if len(dims) == 1:
        # output is a 2d tensor, (cases, units) -> activation
        tmax = output
        # counters is a 2d tensor broadcastable (cases, units) -> case_index
        tind = counters
    else:
        # output is a 4d tensor: fmax flattens it to 3d
        fmax = output.flatten(ndim=3)
        # fargmax is a 2d tensor containing rolled maximum locations
        fargmax = fmax.argmax(axis=2)
        # fetch the maximum. tmax is 2d, (cases, units) -> activation
        tmax = _apply_index(fmax, fargmax, axis=2)
        # targmax is a tuple that separates rolled-up location into (x, y)
        targmax = divmod(fargmax, dims[2])
        # tind is a 3d tensor (cases, units, 3) -> case_index, maxloc
        # this will match indices which is a 3d tensor also
        tind = tensor.stack((counters, ) + targmax, axis=2)
    cmax = tensor.concatenate((maximums, tmax), axis=0)
    cind = tensor.concatenate((indices, tind), axis=0)
    cargsort = (-cmax).argsort(axis=0)[:topn]
    newmax = _apply_perm(cmax, cargsort, axis=0)
    newind = _apply_perm(cind, cargsort, axis=0)
    updates = [(maximums, newmax), (indices, newind)]
    if snapshot:
        csnap = tensor.concatenate((snapshot, output), axis=0)
        newsnap = _apply_perm(csnap, cargsort, axis=0)
        updates.append((snapshot, newsnap))
    return updates
def create_TrainFunc_tranPES(simfn, embeddings, marge=0.5, alpha=1., beta=1.):
    # parse the embedding data
    embedding = embeddings[0]   # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive')  # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')
    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T    # batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T
    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn,
                   T.concatenate([hpmat, tpmat], axis=1)
                    .reshape((hpmat.shape[0], 2, hpmat.shape[1]))
                    .dimshuffle(0, 2, 1),
                   hpmat, rpmat, tpmat)
    negh = tranPES3(simfn,
                    T.concatenate([hnmat, tpmat], axis=1)
                     .reshape((hnmat.shape[0], 2, hnmat.shape[1]))
                     .dimshuffle(0, 2, 1),
                    hnmat, rpmat, tpmat)
    negt = tranPES3(simfn,
                    T.concatenate([hpmat, tnmat], axis=1)
                     .reshape((hpmat.shape[0], 2, hpmat.shape[1]))
                     .dimshuffle(0, 2, 1),
                    hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb * emb_grad
    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad
    updates = OrderedDict({embedding.E: New_embedding,
                           lembedding.E: New_rembedding})

    return theano.function(list_in,
                           [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                           updates=updates, on_unused_input='ignore')
def get_bivariate_normal_spec():
    X1, X2, mu, sigma = [T.scalar('X1'), T.scalar('X2'),
                         T.vector('mu'), T.matrix('sigma')]
    xs = T.concatenate([X1.dimshuffle('x'), X2.dimshuffle('x')])
    GaussianDensitySpec = FunctionSpec(
        variables=[X1, X2, mu, sigma],
        output_expression=-0.5 * T.dot(
            T.dot((xs - mu).T, nlinalg.matrix_inverse(sigma)),
            (xs - mu)))
    return GaussianDensitySpec
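# --- Illustration (not part of the original code) ---------------------------
# Numpy cross-check of the quadratic form built above: the output expression
# is -0.5 * (x - mu)^T Sigma^{-1} (x - mu), i.e. the log of the unnormalised
# bivariate Gaussian density.
import numpy as np

x = np.array([1.0, 2.0])
mu = np.array([0.5, 1.5])
sigma = np.array([[1.0, 0.3],
                  [0.3, 2.0]])
d = x - mu
log_unnorm = -0.5 * d.dot(np.linalg.inv(sigma)).dot(d)
print(log_unnorm)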
def forward(self, x, hc):
    """
    :param x: 1D: batch, 2D: self.n_in
    :param hc: 1D: batch, 2D: self.n_out * (self.order+1)
    :return:
    """
    order, n_in, n_out, activation = self.order, self.n_in, self.n_out, self.activation
    layers = self.internal_layers
    if hc.ndim > 1:
        h_tm1 = hc[:, n_out * order:]
    else:
        h_tm1 = hc[n_out * order:]

    lst = []
    for i in range(order):
        if hc.ndim > 1:
            c_i_tm1 = hc[:, n_out * i: n_out * i + n_out]
        else:
            c_i_tm1 = hc[n_out * i: n_out * i + n_out]
        if i == 0:
            c_i_t = layers[i].forward(x)
        else:
            c_i_t = c_im1_tm1 + layers[i].forward(x)
        lst.append(c_i_t)
        c_im1_tm1 = c_i_tm1

    h_t = activation(c_i_t + self.bias)
    lst.append(h_t)

    if hc.ndim > 1:
        return T.concatenate(lst, axis=1)
    else:
        return T.concatenate(lst)
def recurrence(sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc,
               c_tm1_dec, mu_z_t, sigma_z_t, mu_x_tm1, sigma_x_tm1, v):
    if v is not None:
        # error input
        v_hat = v - (mu_x_tm1 + (sigma_x_tm1 *
                                 sample_x_t.reshape((batch_size, n_visible))))
        r_t = T.concatenate([v, v_hat], axis=1)
    else:
        # error input
        v_hat = mu_x_tm1 - (mu_x_tm1 + (sigma_x_tm1 *
                                        sample_x_t.reshape((batch_size, n_visible))))
        r_t = T.concatenate([mu_x_tm1, v_hat], axis=1)

    # v_enc = [r_t, h_tm1_dec]
    v_enc = T.concatenate([r_t, h_tm1_dec], axis=1)

    # Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
    i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) +
                             T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
    f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) +
                             T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
    c_t_enc = (f_t_enc * c_tm1_enc) + (i_t_enc * T.tanh(
        T.dot(v_enc, Wvc_enc) + T.dot(h_tm1_enc, Whc_enc) + bc_enc))
    o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) +
                             T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
    h_t_enc = o_t_enc * T.tanh(c_t_enc)

    # Get z_t
    mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z) + b_mu_z
    sigma_z_t = sigma_b + T.nnet.softplus(T.dot(h_t_enc, Wh_enc_sig_z) + b_sig_z)
    # sample = theano_rng.normal(size=mew_t.shape, avg=0, std=1, dtype=theano.config.floatX)
    z_t = mu_z_t + (sigma_z_t * sample_z_t.reshape((batch_size, n_z)))

    # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t)
    i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) +
                             T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
    f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) +
                             T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t, Wzf_dec))
    c_t_dec = (f_t_dec * c_tm1_dec) + (i_t_dec * T.tanh(
        T.dot(z_t, Wzc_dec) + T.dot(h_tm1_dec, Whc_dec) + bc_dec))
    o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) +
                             T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
    h_t_dec = o_t_dec * T.tanh(c_t_dec)

    # Get w_t
    mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
    sigma_x_t = sigma_b + T.nnet.softplus(T.dot(h_t_dec, Wh_dec_sig_x) + b_sig_x)

    return [h_t_enc, h_t_dec, c_t_enc, c_t_dec, mu_z_t, sigma_z_t, mu_x_t, sigma_x_t]
def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
    weightedC1 = T.dot(relationProbs, self.C1.dimshuffle(1, 0))
    weightedC2 = T.dot(relationProbs, self.C2.dimshuffle(1, 0))
    left1 = self.leftMostFactorization(batchSize=l, args=args1, wC1=weightedC1)
    right1 = self.rightMostFactorization(batchSize=l, args=args2, wC2=weightedC2)
    one = left1 + right1
    u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
    logScoresP = T.log(T.nnet.sigmoid(u))
    allScores = logScoresP
    allScores = T.concatenate([allScores, entropy, entropy])

    negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
    negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
    negative1 = self.negLeftMostFactorization(batchSize=l, negEmbed=negembed1, wC1=weightedC1)
    negative2 = self.negRightMostFactorization(batchSize=l, negEmbed=negembed2, wC2=weightedC2)

    negOne = negative1.dimshuffle(1, 0) + right1
    negTwo = negative2.dimshuffle(1, 0) + left1
    g = T.concatenate([negOne + self.Ab[neg1], negTwo + self.Ab[neg2]])
    logScores = T.log(T.nnet.sigmoid(-g))
    allScores = T.concatenate([allScores, logScores.flatten()])
    return allScores
def forward(self, x, hc):
    """
    :param x: the input vector or matrix
    :param hc: the vector/matrix of [ c_tm1, h_tm1 ], i.e. hidden state and
        visible state concatenated together
    :return: [ c_t, h_t ] as a single concatenated vector/matrix
    """
    n_in, n_out, activation = self.n_in, self.n_out, self.activation

    if hc.ndim > 1:
        c_tm1 = hc[:, :n_out]
        h_tm1 = hc[:, n_out:]
    else:
        c_tm1 = hc[:n_out]
        h_tm1 = hc[n_out:]

    in_t = self.in_gate.forward(x, h_tm1)
    forget_t = self.forget_gate.forward(x, h_tm1)
    out_t = self.out_gate.forward(x, h_tm1)

    c_t = forget_t * c_tm1 + in_t * self.input_layer.forward(x, h_tm1)
    h_t = out_t * T.tanh(c_t)

    if hc.ndim > 1:
        return T.concatenate([c_t, h_t], axis=1)
    else:
        return T.concatenate([c_t, h_t])
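# --- Illustration (not part of the original code) ---------------------------
# Packing [c, h] into one vector lets theano.scan thread a single state
# tensor through time; the layer splits it on entry and re-concatenates on
# exit. The slicing convention, in numpy terms:
import numpy as np

n_out = 4
hc = np.arange(2 * n_out, dtype=float)    # what scan carries between steps
c_tm1, h_tm1 = hc[:n_out], hc[n_out:]     # the split performed in forward()
hc_next = np.concatenate([c_tm1, h_tm1])  # the re-packed return value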
def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
    argembed1 = self.A[args1]
    argembed2 = self.A[args2]
    weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])

    one = self.factorization(batchSize=l, argsEmbA=argembed1,
                             argsEmbB=argembed2, wC=weightedC)  # [l,n]
    u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
    logScoresP = T.log(T.nnet.sigmoid(u))
    allScores = logScoresP
    allScores = T.concatenate([allScores, entropy, entropy])

    negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
    negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
    negOne = self.negFactorization1(batchSize=l, negEmbA=negembed1,
                                    argsEmbB=argembed2, wC=weightedC)
    negTwo = self.negFactorization2(batchSize=l, argsEmbA=argembed1,
                                    negEmbB=negembed2, wC=weightedC)

    g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
                       negTwo + self.Ab[neg2].dimshuffle(1, 0)])
    logScores = T.log(T.nnet.sigmoid(-g))
    allScores = T.concatenate([allScores, logScores.flatten()])
    return allScores
def filter_and_prob(inpt, transition, emission,
                    visible_noise_mean, visible_noise_cov,
                    hidden_noise_mean, hidden_noise_cov,
                    initial_hidden, initial_hidden_cov):
    step = forward_step(
        transition, emission,
        visible_noise_mean, visible_noise_cov,
        hidden_noise_mean, hidden_noise_cov)

    hidden_mean_0 = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0)
    hidden_cov_0 = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1)
    f0, F0, ll0 = step(inpt[0], hidden_mean_0, hidden_cov_0)
    replace = {hidden_noise_mean: initial_hidden,
               hidden_noise_cov: initial_hidden_cov}
    f0 = theano.clone(f0, replace)
    F0 = theano.clone(F0, replace)
    ll0 = theano.clone(ll0, replace)

    (f, F, ll), _ = theano.scan(
        step,
        sequences=inpt[1:],
        outputs_info=[f0, F0, None])

    ll = ll.sum(axis=0)

    f = T.concatenate([T.shape_padleft(f0), f])
    F = T.concatenate([T.shape_padleft(F0), F])
    ll += ll0

    return f, F, ll
def gru_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = GRU(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            # h0: 1D: Batch, 2D: n_h
            h0 = T.zeros((batch, n_h), dtype=theano.config.floatX)
        else:
            layer = GRU(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D: n_h
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]

        xr = T.dot(layer_input, layer.W_xr)
        xz = T.dot(layer_input, layer.W_xz)
        xh = T.dot(layer_input, layer.W_xh)

        h, _ = theano.scan(fn=layer.forward, sequences=[xr, xz, xh], outputs_info=[h0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
def get_output_for(self, inputs, **kwargs):
    """
    Updates stack given input, stack controls and output in the inputs array
    """
    # unpack inputs
    input_val, prev_stack, controls = inputs
    assert input_val.ndim == 2

    # cast shapes
    controls = controls.reshape([-1, 3, 1, 1])
    input_val = insert_dim(input_val, 1)
    zeros_at_the_top = insert_dim(T.zeros_like(prev_stack[:, 0]), 1)

    # unpack controls
    a_push, a_pop, a_no_op = controls[:, 0], controls[:, 1], controls[:, 2]

    # a version of the stack shifted down, with the top element removed (pop)
    stack_down = T.concatenate([prev_stack[:, 1:], zeros_at_the_top], axis=1)

    # a version of the stack shifted up, with the new input on top (push)
    stack_up = T.concatenate([input_val, prev_stack[:, :-1]], axis=1)

    # new stack
    new_stack = a_no_op * prev_stack + a_push * stack_up + a_pop * stack_down
    return new_stack
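# --- Illustration (not part of the original code) ---------------------------
# A numpy sketch of the three soft stack actions above (assumed shapes
# (batch, depth, dim)): push prepends the new element, pop discards the top,
# and the controls blend the three candidate stacks.
import numpy as np

prev = np.arange(6, dtype=float).reshape(1, 3, 2)   # (batch=1, depth=3, dim=2)
x = np.full((1, 1, 2), 9.0)                         # element to push
stack_down = np.concatenate([prev[:, 1:], np.zeros((1, 1, 2))], axis=1)  # pop
stack_up = np.concatenate([x, prev[:, :-1]], axis=1)                     # push
a_push, a_pop, a_no_op = 1.0, 0.0, 0.0              # hard "push" for clarity
new_stack = a_no_op * prev + a_push * stack_up + a_pop * stack_down
print(new_stack[0])   # top row is now [9., 9.]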
def _pad_blanks(queryseq, blank_symbol, queryseq_mask=None):
    """
    Pad queryseq and corresponding queryseq_mask with blank symbol
    :param queryseq (L, B)
    :param queryseq_mask (L, B)
    :param blank_symbol scalar
    :return queryseq_padded, queryseq_mask_padded, both with shape (2L+1, B)
    """
    # for queryseq
    queryseq_extended = queryseq.dimshuffle(1, 0, 'x')                     # (L, B) -> (B, L, 1)
    blanks = tensor.zeros_like(queryseq_extended) + blank_symbol           # (B, L, 1)
    concat = tensor.concatenate([queryseq_extended, blanks], axis=2)       # concat.shape = (B, L, 2)
    # res.shape = (2L, B); the reshape interlaces the last two dimensions
    res = concat.reshape((concat.shape[0], concat.shape[1] * concat.shape[2])).T
    beginning_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol      # (1, B)
    queryseq_padded = tensor.concatenate([beginning_blanks, res], axis=0)  # (2L+1, B)
    # for queryseq_mask
    if queryseq_mask is not None:
        queryseq_mask_extended = queryseq_mask.dimshuffle(1, 0, 'x')       # (L, B) -> (B, L, 1)
        concat = tensor.concatenate([queryseq_mask_extended,
                                     queryseq_mask_extended], axis=2)      # concat.shape = (B, L, 2)
        res = concat.reshape((concat.shape[0], concat.shape[1] * concat.shape[2])).T
        beginning_blanks = tensor.ones((1, res.shape[1]), dtype=floatX)
        queryseq_mask_padded = tensor.concatenate([beginning_blanks, res], axis=0)
    else:
        queryseq_mask_padded = None
    return queryseq_padded, queryseq_mask_padded
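# --- Illustration (not part of the original code) ---------------------------
# Numpy sketch of the interleaving trick above (assumed blank symbol 0,
# batch-major for readability): stacking [label, blank] pairs and reshaping
# yields l1, 0, l2, 0, ..., and a prepended blank gives the standard CTC
# target 0, l1, 0, l2, ..., 0 of length 2L+1.
import numpy as np

labels = np.array([[3, 1, 4]])                             # (B, L)
pairs = np.stack([labels, np.zeros_like(labels)], axis=2)  # (B, L, 2)
interleaved = pairs.reshape(labels.shape[0], -1)           # (B, 2L)
padded = np.concatenate([np.zeros((1, 1), labels.dtype), interleaved], axis=1)
print(padded)   # [[0 3 0 1 0 4 0]]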
def recur(self, ms_j, mt_jm1, mscut_j, mtcut_jm1,
          ssrcpos_js, vsrcpos_js, starpos_js, vtarpos_js):
    # cnn encoding
    ngms_j, uttms_j = self.sCNN.encode(ms_j, mscut_j)
    ngmt_jm1, uttmt_jm1 = self.tCNN.encode(mt_jm1, mtcut_jm1)

    # padding dummy vector
    ngms_j = T.concatenate([ngms_j, T.zeros_like(ngms_j[-1:, :])], axis=0)
    ngmt_jm1 = T.concatenate([ngmt_jm1, T.zeros_like(ngmt_jm1[-1:, :])], axis=0)

    # source features
    ssrcemb_js = T.sum(ngms_j[ssrcpos_js, :], axis=0)
    vsrcemb_js = T.sum(ngms_j[vsrcpos_js, :], axis=0)
    src_js = T.concatenate([ssrcemb_js, vsrcemb_js, uttms_j], axis=0)

    # target features
    staremb_js = T.sum(ngmt_jm1[starpos_js, :], axis=0)
    vtaremb_js = T.sum(ngmt_jm1[vtarpos_js, :], axis=0)
    tar_js = T.concatenate([staremb_js, vtaremb_js, uttmt_jm1], axis=0)

    # update g_j
    g_j = T.dot(self.Whb, T.nnet.sigmoid(
        T.dot(src_js, self.Wfbs) + T.dot(tar_js, self.Wfbt) + self.B0)).dimshuffle('x')

    # update b_j
    g_j = T.concatenate([g_j, self.B], axis=0)
    b_j = T.nnet.softmax(g_j)[0, :]

    return b_j
def apply(self, source_sentence, source_sentence_mask):
    """Creates the final list of annotations.

    Args:
        source_sentence (Variable): Source sentence with words in
                                    vector representation.
        source_sentence_mask (Variable): Source mask

    Returns:
        Variable. source annotations
    """
    # Time as first dimension
    base_representations, base_mask = self.base_encoder.apply(
        source_sentence, source_sentence_mask)
    annotations = []
    masks = []
    if self.add_direct:
        annotations.append(base_representations)
        masks.append(base_mask)
    for annotator in self.annotators:
        ann, mask = annotator.apply(base_representations, base_mask)
        annotations.append(ann)
        masks.append(mask)
    return tensor.concatenate(annotations), tensor.concatenate(masks)
def __init__(self, input_ngram, input_sm, vocab_size, emb_dim, num_section,
             linear_W_emb=None, fix_emb=False, nonlinear=None, activation=None):
    global rng
    global init_range
    if linear_W_emb is None:
        # randomly initialize
        linear_W_emb = np.asarray(rng.uniform(
            low=-init_range, high=init_range, size=(vocab_size, emb_dim)),
            dtype=theano.config.floatX)
    else:
        # use the given model parameter
        given_vocab_size, given_emb_dim = linear_W_emb.shape
        assert(given_vocab_size == vocab_size and given_emb_dim == emb_dim)

    # shared variables
    self.W_emb = theano.shared(value=linear_W_emb, name='W_emb')

    # stack vectors
    input_ngram = T.cast(input_ngram, 'int32')
    input_sm = T.cast(input_sm, 'int32')

    # output is a matrix where each row corresponds to a context_size embedding
    # vector, and the row count equals the batch size
    # output dimensions: batch_size * ((context_size + 1) * emb_dim)
    output_local = self.W_emb[input_ngram[:, :-1].flatten()].reshape(
        (input_ngram.shape[0], emb_dim * (input_ngram.shape[1] - 1)))  # self.W_emb.shape[1]

    sentence_lengths = input_sm[:, 0]
    sentence_matrix = input_sm[:, 1:]

    sentence_num = sentence_matrix.shape[0]
    global_length = sentence_matrix.shape[1]
    section_length = T.cast(T.ceil(global_length / float(num_section)), 'int32')

    # For the first section
    sentence_embeddings = T.mean(
        self.W_emb[sentence_matrix[:, :section_length].flatten()].reshape(
            (sentence_num, section_length, emb_dim)), axis=1)

    # For the rest of the sections
    for i in xrange(1, num_section):
        current_section = T.mean(
            self.W_emb[sentence_matrix[:, i * section_length:(i + 1) * section_length]
                       .flatten()].reshape((sentence_num, section_length, emb_dim)),
            axis=1)
        sentence_embeddings = T.concatenate([sentence_embeddings, current_section],
                                            axis=1)

    # get the sentence index for each ngram vector, and transform it to 0-based
    sentence_indeces = input_ngram[:, -1]
    base_index = sentence_indeces[0]
    sentence_indeces = sentence_indeces - base_index

    # the last column of output should be a weighted sum of the sentence vectors
    output_global = sentence_embeddings[sentence_indeces.flatten()].reshape(
        (sentence_indeces.shape[0], emb_dim * num_section))

    # handle the non-linear layer
    if nonlinear is None or activation is None:
        self.output = T.concatenate([output_local, output_global], axis=1)
        # params is the word embedding matrix
        self.params = [self.W_emb] if not fix_emb else []
    else:
        self.non_linear_params, non_linear_output_global = addNonlinearLayer(
            output_global, emb_dim * num_section, nonlinear, activation)
        self.output = T.concatenate([output_local, non_linear_output_global], axis=1)
        self.params = [self.W_emb] + self.non_linear_params if not fix_emb \
            else self.non_linear_params
def _join_global_RVs(global_RVs, global_order):
    if len(global_RVs) == 0:
        inarray_global = None
        uw_global = None
        replace_global = {}
        c_g = 0
    else:
        joined_global = tt.concatenate([v.ravel() for v in global_RVs])
        uw_global = tt.vector('uw_global')
        uw_global.tag.test_value = np.concatenate(
            [joined_global.tag.test_value, joined_global.tag.test_value])

        inarray_global = joined_global.type('inarray_global')
        inarray_global.tag.test_value = joined_global.tag.test_value

        # Replace RVs with reshaped subvectors of the joined vector.
        # The order of global_order is the same as that of global_RVs.
        subvecs = [reshape_t(inarray_global[slc], shp).astype(dtyp)
                   for _, slc, shp, dtyp in global_order.vmap]
        replace_global = {v: subvec for v, subvec in zip(global_RVs, subvecs)}

        # Weight vector
        cs = [c for _, c in global_RVs.items()]
        oness = [tt.ones(v.ravel().tag.test_value.shape) for v in global_RVs]
        c_g = tt.concatenate([c * ones for c, ones in zip(cs, oness)])

    return inarray_global, uw_global, replace_global, c_g
def _join_local_RVs(local_RVs, local_order):
    if len(local_RVs) == 0:
        inarray_local = None
        uw_local = None
        replace_local = {}
        c_l = 0
    else:
        joined_local = tt.concatenate([v.ravel() for v in local_RVs])
        uw_local = tt.vector('uw_local')
        uw_local.tag.test_value = np.concatenate(
            [joined_local.tag.test_value, joined_local.tag.test_value])

        inarray_local = joined_local.type('inarray_local')
        inarray_local.tag.test_value = joined_local.tag.test_value

        get_var = {var.name: var for var in local_RVs}
        replace_local = {
            get_var[var]: reshape_t(inarray_local[slc], shp).astype(dtyp)
            for var, slc, shp, dtyp in local_order.vmap
        }

        # Weight vector
        cs = [c for _, (_, c) in local_RVs.items()]
        oness = [tt.ones(v.ravel().tag.test_value.shape) for v in local_RVs]
        c_l = tt.concatenate([c * ones for c, ones in zip(cs, oness)])

    return inarray_local, uw_local, replace_local, c_l
def get_unfolding_cost(self):
    ''' computes the unfolding reconstruction cost (more than 2 inputs) '''
    x = T.reshape(self.x, (-1, self.n_vector))
    yi = x[0]
    # fold: repeatedly encode the running code together with the next input
    for i in range(1, self.num):
        xi = T.concatenate((yi, x[i]))
        yi = self.get_hidden_values(xi)
    # Save the deepest hidden value as the output vector
    self.vector = copy.deepcopy(yi)

    # unfold: repeatedly decode back into pairs, collecting reconstructions
    tmp = []
    for i in range(1, self.num):
        zi = self.get_reconstructed(yi)
        t = T.reshape(zi, (2, self.n_vector))
        tmp.append(t[1])
        yi = t[0]
    tmp.append(yi)
    tmp.reverse()

    x = self.x
    z = T.concatenate(tmp)
    # cross-entropy cost should be modified here.
    L = -T.sum((0.5 * x + 0.5) * T.log(0.5 * z + 0.5) +
               (-0.5 * x + 0.5) * T.log(-0.5 * z + 0.5))
    # squared cost:
    # L = -T.sum((x - z) ** 2)
    cost = T.mean(L) + 0.01 * (self.W ** 2).sum()  # cost for a minibatch
    return cost
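# --- Illustration (not part of the original code) ---------------------------
# The cost above assumes tanh-scaled data in [-1, 1]: p = 0.5*x + 0.5 maps
# targets and reconstructions into [0, 1] before the binary cross-entropy.
# In numpy terms:
import numpy as np

x = np.array([-0.8, 0.2])          # target in [-1, 1]
z = np.array([-0.6, 0.4])          # reconstruction in (-1, 1)
p, q = 0.5 * x + 0.5, 0.5 * z + 0.5
L = -np.sum(p * np.log(q) + (1 - p) * np.log(1 - q))
print(L)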
def diag_gauss(inpt):
    """Transfer function to turn an array into sufficient statistics of a
    diagonal Gaussian.

    The first half of the input will be left unchanged, the second will be
    squared. The "split" into halves is performed along the second axis.

    Parameters
    ----------
    inpt : Theano tensor
        Array of shape ``(n, d)`` or ``(t, n, d)``.

    Returns
    -------
    output : Theano variable.
        Transformed input. Same shape as ``inpt``.
    """
    half = inpt.shape[-1] // 2
    if inpt.ndim == 3:
        mean, var = inpt[:, :, :half], inpt[:, :, half:]
        res = T.concatenate([mean, var ** 2 + 1e-8], axis=2)
    else:
        mean, var = inpt[:, :half], inpt[:, half:]
        res = T.concatenate([mean, var ** 2 + 1e-8], axis=1)
    return res
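# --- Illustration (not part of the original code) ---------------------------
# Equivalent numpy behaviour of diag_gauss for the 2-d case: the second half
# of the last axis is squared (plus a small constant) so downstream code can
# treat it as a strictly positive variance.
import numpy as np

inpt = np.array([[1.0, 2.0, 3.0, -4.0]])     # (n, d) with d = 4
mean, var = inpt[:, :2], inpt[:, 2:]
out = np.concatenate([mean, var ** 2 + 1e-8], axis=1)
print(out)   # [[ 1.  2.  9.00000001 16.00000001]]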
def recurrence(sample_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec,
               w_tm1, mew_t, sigma_t, v):
    # error input
    v_hat = v - T.nnet.sigmoid(w_tm1)
    r_t = T.concatenate([v, v_hat], axis=1)

    # v_enc = [r_t, h_tm1_dec]
    v_enc = T.concatenate([r_t, h_tm1_dec], axis=1)

    # Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
    i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) +
                             T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
    f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) +
                             T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
    c_t_enc = (f_t_enc * c_tm1_enc) + (i_t_enc * T.tanh(
        T.dot(v_enc, Wvc_enc) + T.dot(h_tm1_enc, Whc_enc) + bc_enc))
    o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) +
                             T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
    h_t_enc = o_t_enc * T.tanh(c_t_enc)

    # Get z_t
    mew_t = T.dot(h_t_enc, Wh_enc_mew)
    sigma_t = T.dot(h_t_enc, Wh_enc_sig)
    # sample = theano_rng.normal(size=mew_t.shape, avg=0, std=1, dtype=theano.config.floatX)
    z_t = mew_t + (T.exp(sigma_t) * sample_t)

    # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t)
    i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) +
                             T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
    f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) +
                             T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t, Wzf_dec))
    c_t_dec = (f_t_dec * c_tm1_dec) + (i_t_dec * T.tanh(
        T.dot(z_t, Wzc_dec) + T.dot(h_tm1_dec, Whc_dec) + bc_dec))
    o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) +
                             T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
    h_t_dec = o_t_dec * T.tanh(c_t_dec)

    # Get w_t
    w_t = w_tm1 + T.dot(h_t_dec, Wh_dec_w)

    return [h_t_enc, h_t_dec, c_t_enc, c_t_dec, w_t, mew_t, sigma_t]
def _build(det_dropout):
    all_out_probs = []
    for encoding, lstmstack, encoded_melody, relative_pos in zip(
            self.encodings, self.lstmstacks, encoded_melodies, relative_posns):
        activations = lstmstack.do_preprocess_scan(
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            relative_position=relative_pos,
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            last_output=T.concatenate(
                [T.tile(encoding.initial_encoded_form(), (n_batch, 1, 1)),
                 encoded_melody[:, :-1, :]], 1),
            deterministic_dropout=det_dropout)
        out_probs = encoding.decode_to_probs(activations, relative_pos,
                                             self.bounds.lowbound,
                                             self.bounds.highbound)
        all_out_probs.append(out_probs)
    reduced_out_probs = functools.reduce((lambda x, y: x * y), all_out_probs)
    if self.normalize_artic_only:
        non_artic_probs = reduced_out_probs[:, :, :2]
        artic_probs = reduced_out_probs[:, :, 2:]
        non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True)
        artic_sum = T.sum(artic_probs, 2, keepdims=True)
        norm_artic_probs = artic_probs * (1 - non_artic_sum) / artic_sum
        norm_out_probs = T.concatenate([non_artic_probs, norm_artic_probs], 2)
    else:
        normsum = T.sum(reduced_out_probs, 2, keepdims=True)
        normsum = T.maximum(normsum, constants.EPSILON)
        norm_out_probs = reduced_out_probs / normsum
    return Encoding.compute_loss(norm_out_probs, correct_notes, True)
def _setOutputs(self):
    outs = []
    for l in self.network.inConnections[self]:
        outs.append(l.outputs)
    self.outputs = tt.concatenate(outs, axis=1)
    self.testOutputs = tt.concatenate(outs, axis=1)
def best_right_path_cost(pred, mask, token, blank):
    '''
    best right path cost of multiple sentences
    :param pred: (T, nb, voca_size+1) (4,1,3)
    :param mask: (nb, T)
    # :param pred_len: (nb,) pred_len of prediction (1)
    :param token: (nb, U) -1 for NIL (1,2)
    :param blank: (1)
    :return: best_right_path_cost (nb,)
    :return: argmin_token (nb, T) best path, -1 for null
    '''
    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))
    EPS = theano.shared(np.float32(35))
    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape((nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones((nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T, token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = \
        T.tile((T.eye(length) + T.eye(length, k=1)), (nb, 1, 1)) + \
        T.tile(T.eye(length, k=2), (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha
    alpha = T.ones_like(token_with_blank, dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, :2], pred[0, :, :2])  # (nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    [log_probability, argmin_pos_1], _ = theano.scan(
        lambda curr, accum: (
            (accum[:, :, None] + recurrence_relation).min(axis=1) + curr,
            (accum[:, :, None] + recurrence_relation).argmin(axis=1)),
        sequences=[pred[1:]],
        outputs_info=[alpha, None])

    # why pred_len-2?
    labels_1 = log_probability[pred_len - 2, T.arange(nb), 2 * token_len - 1]  # (nb,)
    labels_2 = log_probability[pred_len - 2, T.arange(nb), 2 * token_len]      # (nb,)
    concat_labels = T.concatenate([labels_1[:, None], labels_2[:, None]], axis=-1)
    argmin_labels = concat_labels.argmin(axis=-1)
    cost = concat_labels.min(axis=-1)

    min_path = T.ones((t - 1, nb), dtype=intX) * -1  # -1 for null
    min_path = T.set_subtensor(min_path[pred_len - 2, T.arange(nb)],
                               2 * token_len - 1 + argmin_labels)  # (T-1, nb)
    min_full_path, _ = theano.scan(
        lambda m_path, argm_pos, m_full_path: argm_pos[
            T.arange(nb),
            T.maximum(m_path, m_full_path).astype('int32')].astype('int32'),
        sequences=[min_path[::-1], argmin_pos_1[::-1]],
        outputs_info=[min_path[-1]])
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :]),
                               axis=0)  # (T, nb)
    argmin_token = token_with_blank[T.arange(nb)[None, :], argmin_pos]

    return cost, (argmin_token.transpose((1, 0)) * mask + mask - 1).astype('int32')
    # intermediates: alpha, log_probability, argmin_pos_1, argmin_labels,
    # min_path, min_full_path, argmin_pos, token_with_blank, argmin_token
def bayes_estimate_cell(k, adm, eadm, coh, ecoh, alph=False, atype='joint'):
    """
    Function to estimate the parameters of the flexural model at a single
    cell location of the input grids.

    :type k: :class:`~numpy.ndarray`
    :param k: 1D array of wavenumbers
    :type adm: :class:`~numpy.ndarray`
    :param adm: 1D array of wavelet admittance
    :type eadm: :class:`~numpy.ndarray`
    :param eadm: 1D array of error on wavelet admittance
    :type coh: :class:`~numpy.ndarray`
    :param coh: 1D array of wavelet coherence
    :type ecoh: :class:`~numpy.ndarray`
    :param ecoh: 1D array of error on wavelet coherence
    :type alph: bool, optional
    :param alph: Whether or not to estimate parameter ``alpha``
    :type atype: str, optional
    :param atype: Whether to use the admittance (`'admit'`), coherence
        (`'coh'`) or both (`'joint'`)

    :return: (tuple): Tuple containing:
        * ``trace`` : :class:`~pymc3.backends.base.MultiTrace`
            Posterior samples from the MCMC chains
        * ``summary`` : :class:`~pandas.core.frame.DataFrame`
            Summary statistics from Posterior distributions
        * ``map_estimate`` : dict
            Container for Maximum a Posteriori (MAP) estimates
    """
    with pm.Model() as model:

        # k is an array - needs to be passed as distribution
        k_obs = pm.Normal('k', mu=k, sigma=1., observed=k)

        # Prior distributions
        Te = pm.Uniform('Te', lower=1., upper=250.)
        F = pm.Uniform('F', lower=0., upper=0.9999)

        if alph:
            # Prior distribution of `alpha`
            alpha = pm.Uniform('alpha', lower=0., upper=np.pi)
            admit_exp, coh_exp = real_xspec_functions_alpha(k_obs, Te, F, alpha)
        else:
            admit_exp, coh_exp = real_xspec_functions_noalpha(k_obs, Te, F)

        # Select type of analysis to perform
        if atype == 'admit':
            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=eadm, sigma=1., observed=eadm)
            # Likelihood of observations
            admit_obs = pm.Normal('admit_obs', mu=admit_exp, sigma=sigma,
                                  observed=adm)
        elif atype == 'coh':
            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=ecoh, sigma=1., observed=ecoh)
            # Likelihood of observations
            coh_obs = pm.Normal('coh_obs', mu=coh_exp, sigma=sigma,
                                observed=coh)
        elif atype == 'joint':
            # Define uncertainty as concatenated arrays
            ejoint = np.array([eadm, ecoh]).flatten()
            # Define array of observations and expected values as
            # concatenated arrays
            joint = np.array([adm, coh]).flatten()
            joint_exp = tt.flatten(tt.concatenate([admit_exp, coh_exp]))
            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=ejoint, sigma=1., observed=ejoint)
            # Likelihood of observations
            joint_obs = pm.Normal('admit_coh_obs', mu=joint_exp, sigma=sigma,
                                  observed=joint)

        # Sample the Posterior distribution
        trace = pm.sample(cf.draws, tune=cf.tunes, cores=cf.cores)

        # Get maximum a posteriori (MAP) estimate
        map_estimate = pm.find_MAP()

        # Get summary
        summary = pm.summary(trace)

    return trace, summary, map_estimate
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
             dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):

    print "==> not used params in DMN class:", kwargs.keys()
    self.train_list_raw = train_list_raw
    self.test_list_raw = test_list_raw
    self.png_folder = png_folder
    self.batch_size = batch_size
    self.dropout = dropout
    self.l2 = l2
    self.mode = mode
    self.batch_norm = batch_norm
    self.num_units = rnn_num_units

    self.input_var = T.tensor4('input_var')
    self.answer_var = T.ivector('answer_var')

    print "==> building network"
    example = np.random.uniform(size=(self.batch_size, 1, 128, 858),
                                low=0.0, high=1.0).astype(np.float32)       #########
    answer = np.random.randint(low=0, high=176, size=(self.batch_size,))    #########

    network = layers.InputLayer(shape=(None, 1, 128, 858),
                                input_var=self.input_var)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # CONV-RELU-POOL 1
    network = layers.Conv2DLayer(incoming=network, num_filters=16,
                                 filter_size=(7, 7), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=2, ignore_border=False)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 2
    network = layers.Conv2DLayer(incoming=network, num_filters=32,
                                 filter_size=(5, 5), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=2, ignore_border=False)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    # CONV-RELU-POOL 3
    network = layers.Conv2DLayer(incoming=network, num_filters=32,
                                 filter_size=(3, 3), stride=1,
                                 nonlinearity=rectify)
    print layers.get_output(network).eval({self.input_var: example}).shape
    network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3),
                                    stride=2, ignore_border=False)
    print layers.get_output(network).eval({self.input_var: example}).shape
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)

    self.params = layers.get_all_params(network, trainable=True)
    output = layers.get_output(network)
    num_channels = 32
    filter_W = 104
    filter_H = 13
    # NOTE: these constants are shapes of the last pool layer; they could be
    # symbolic, but explicit values are better for optimizations

    channels = []
    for channel_index in range(num_channels):
        channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))

    rnn_network_outputs = []
    for channel_index in range(num_channels):
        rnn_input_var = channels[channel_index]

        # InputLayer
        network = layers.InputLayer(shape=(None, filter_W, filter_H),
                                    input_var=rnn_input_var)
        # GRULayer
        network = layers.GRULayer(incoming=network, num_units=self.num_units,
                                  only_return_final=True)
        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        # add params
        self.params += layers.get_all_params(network, trainable=True)
        rnn_network_outputs.append(layers.get_output(network))

    all_output_var = T.concatenate(rnn_network_outputs, axis=1)
    print all_output_var.eval({self.input_var: example}).shape

    # InputLayer
    network = layers.InputLayer(shape=(None, self.num_units * num_channels),
                                input_var=all_output_var)

    # DENSE 1
    network = layers.DenseLayer(incoming=network, num_units=512,
                                nonlinearity=rectify)
    if (self.batch_norm):
        network = layers.BatchNormLayer(incoming=network)
    if (self.dropout > 0):
        network = layers.dropout(network, self.dropout)
    print layers.get_output(network).eval({self.input_var: example}).shape

    # Last layer: classification
    network = layers.DenseLayer(incoming=network, num_units=176,
                                nonlinearity=softmax)
    print layers.get_output(network).eval({self.input_var: example}).shape

    self.params += layers.get_all_params(network, trainable=True)
    self.prediction = layers.get_output(network)

    # print "==> param shapes", [x.eval().shape for x in self.params]

    self.loss_ce = lasagne.objectives.categorical_crossentropy(
        self.prediction, self.answer_var).mean()
    if (self.l2 > 0):
        self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(
            self.params, lasagne.regularization.l2)
    else:
        self.loss_l2 = 0
    self.loss = self.loss_ce + self.loss_l2

    # updates = lasagne.updates.adadelta(self.loss, self.params)
    updates = lasagne.updates.momentum(self.loss, self.params,
                                       learning_rate=0.003)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.answer_var],
        outputs=[self.prediction, self.loss])
def make_backprop_scan(self, error_signal,
                       extra_cost_inputs=None,
                       compute_embedding_gradients=True):
    """
    Args:
        error_signal: The external gradient d(cost)/d(stack top). A Theano
            batch of size `batch_size * model_dim`.
    """
    assert hasattr(self, "stack_2_ptrs"), \
        ("self._make_scan (forward pass) must be defined before "
         "self.make_backprop_scan is called")

    # We need to add extra updates to the `_zero_updates` member, so we
    # must be called before `_zero_updates` is read.
    assert self._zero is None, \
        ("Can only install backprop on a fresh ThinStack. Don't call "
         "ThinStack.zero before setting up backprop.")

    if (compute_embedding_gradients
            and self._embedding_projection_network not in [None, util.IdentityLayer]):
        raise ValueError(
            "Do not support backprop for both an embedding projection "
            "layer and individual embeddings.")

    if self.use_input_batch_norm:
        raise ValueError(
            "Thin-stack backprop not supported with input batch-norm. Jon "
            "worked on BN gradients for 3 days without success, and then "
            "dropped it.")

    if extra_cost_inputs is None:
        extra_cost_inputs = []

    wrt, f_proj_delta, f_push_delta, f_merge_delta = \
        self._make_backward_graphs(extra_cost_inputs)
    wrt_shapes = [wrt_i.get_value().shape for wrt_i in wrt]

    # Build shared variables for accumulating wrt deltas.
    wrt_vars = [theano.shared(np.zeros(wrt_shape, dtype=np.float32),
                              name=self._prefix + "bwd/wrt/%s" % wrt_i)
                for wrt_i, wrt_shape in zip(wrt, wrt_shapes)]
    # All of these need to be zeroed out in between batches.
    self._zero_updates += wrt_vars

    # Also accumulate embedding gradients separately
    if compute_embedding_gradients:
        dE = theano.shared(np.zeros(self.embeddings.get_value().shape,
                                    dtype=np.float32),
                           name=self._prefix + "bwd/wrt/embeddings")
        self._zero_updates.append(dE)
    else:
        # Make dE a dummy variable.
        dE = T.zeros((1,))

    # Useful batch zero-constants.
    zero_stack = T.zeros((self.batch_size, self.model_dim))
    zero_extra_inps = [T.zeros((self.batch_size, extra_shape[-1]))
                       for extra_shape in self.recurrence.extra_outputs]
    # Zero Jacobian matrices for masked reductions during backprop.
    # May not be used.
    zero_jac_wrts = [T.zeros((self.batch_size,) + wrt_shape)
                     for wrt_shape in wrt_shapes]

    DUMMY = util.zeros_nobroadcast((1,))

    batch_size = self.batch_size
    batch_range = T.arange(batch_size)
    stack_shift = T.cast(batch_range, theano.config.floatX)
    buffer_shift = T.cast(batch_range * self.seq_length, theano.config.floatX)

    def lookup(t_f, stack_fwd, stack_2_ptrs_t, buffer_cur_t,
               stack_bwd_t, extra_bwd):
        """Retrieve all relevant bwd inputs/outputs at time `t`."""
        grad_cursor = t_f * batch_size + stack_shift
        main_grad = cuda_util.AdvancedSubtensor1Floats("B_maingrad")(
            stack_bwd_t, grad_cursor)
        extra_grads = tuple([
            cuda_util.AdvancedSubtensor1Floats("B_extragrad_%i" % i)(
                extra_bwd_i, grad_cursor)
            for i, extra_bwd_i in enumerate(extra_bwd)])

        # Find the timesteps of the two elements involved in the potential
        # merge at this timestep.
        t_c1 = (t_f - 1.0) * batch_size + stack_shift
        t_c2 = stack_2_ptrs_t

        # Find the two elements involved in the potential merge.
        c1 = cuda_util.AdvancedSubtensor1Floats("B_stack1")(stack_fwd, t_c1)
        c2 = cuda_util.AdvancedSubtensor1Floats("B_stack2")(stack_fwd, t_c2)

        buffer_top_t = cuda_util.AdvancedSubtensor1Floats("B_buffer_top")(
            self.buffer_t, buffer_cur_t + buffer_shift)

        # Retrieve extra inputs from auxiliary stack(s).
        extra_inps_t = tuple([
            cuda_util.AdvancedSubtensor1Floats("B_extra_inp_%i" % i)(
                extra_inp_i, t_c1)
            for extra_inp_i in self.final_aux_stacks])

        inputs = (c1, c2, buffer_top_t) + extra_inps_t
        grads = (main_grad,) + extra_grads
        return t_c1, t_c2, inputs, grads

    def step_b(# sequences
               t_f, transitions_t_f, stack_2_ptrs_t, buffer_cur_t,
               # accumulators
               dE,
               # rest (incl. outputs_info, non_sequences)
               *rest):
        # Separate the accum arguments from the non-sequence arguments.
        n_wrt = len(wrt_shapes)
        n_extra_bwd = len(self.recurrence.extra_outputs)
        wrt_deltas = rest[:n_wrt]
        stack_bwd_t = rest[n_wrt]
        extra_bwd = rest[n_wrt + 1:n_wrt + 1 + n_extra_bwd]
        id_buffer, stack_final = \
            rest[n_wrt + 1 + n_extra_bwd:n_wrt + 1 + n_extra_bwd + 2]

        # At first iteration, drop the external error signal into the main
        # backward stack.
        stack_bwd_next = ifelse(T.eq(t_f, self.seq_length),
                                T.set_subtensor(stack_bwd_t[-self.batch_size:],
                                                error_signal),
                                stack_bwd_t)

        # Retrieve all relevant inputs/outputs at this timestep.
        t_c1, t_c2, inputs, grads = \
            lookup(t_f, stack_final, stack_2_ptrs_t, buffer_cur_t,
                   stack_bwd_next, extra_bwd)
        main_grad = grads[0]

        # Calculate deltas for this timestep.
        m_delta_inp, m_delta_wrt = f_merge_delta(inputs, grads)
        # NB: main_grad is not passed to push function.
        p_delta_inp, p_delta_wrt = f_push_delta(inputs, grads[1:])

        # Check that delta function outputs match (at least in number).
        assert len(m_delta_inp) == len(p_delta_inp), \
            "%i %i" % (len(m_delta_inp), len(p_delta_inp))
        assert len(m_delta_wrt) == len(p_delta_wrt), \
            "%i %i" % (len(m_delta_wrt), len(p_delta_wrt))
        assert len(m_delta_inp) == 3 + len(self.aux_stacks), \
            "%i %i" % (len(m_delta_inp), 3 + len(self.aux_stacks))
        assert len(m_delta_wrt) == len(wrt)

        # Retrieve embedding indices on buffer at this timestep.
        # (Necessary for sending embedding gradients.)
        buffer_ids_t = cuda_util.AdvancedSubtensor1Floats("B_buffer_ids")(
            id_buffer, buffer_cur_t + buffer_shift)

        # Prepare masks for op-wise gradient accumulation.
        # TODO: Record actual transitions (e.g. for model 1S and higher)
        # and repeat those here
        mask = transitions_t_f
        masks = [mask, mask.dimshuffle(0, "x"), mask.dimshuffle(0, "x", "x")]

        # Insert gradients for the embedding projection network as well.
        if f_proj_delta is not None:
            # Look up raw buffer top for this timestep -- i.e., buffer top
            # *before* the op at this timestep was performed. This was the
            # input to the projection network at this timestep.
            proj_input = cuda_util.AdvancedSubtensor1Floats("B_raw_buffer_top")(
                self._raw_buffer_t, buffer_cur_t + buffer_shift)
            proj_inputs = (proj_input,)
            if self.use_input_dropout:
                embedding_dropout_mask = cuda_util.AdvancedSubtensor1Floats(
                    "B_buffer_dropout")(
                    self._embedding_dropout_masks, buffer_cur_t + buffer_shift)
                proj_inputs = (proj_input, embedding_dropout_mask)

            # Compute separate graphs based on gradient from above.
            # NB: We discard the delta_inp return here. The delta_inp
            # should actually be passed back to the raw embedding
            # parameters, but we don't have any reason to support this in
            # practice. (Either we backprop to embeddings or project them
            # and learn the projection -- not both.)
            if m_delta_inp[2] is not None:
                _, m_proj_delta_wrt = f_proj_delta(proj_inputs,
                                                   (m_delta_inp[2],))
                m_delta_wrt = util.merge_update_lists(m_delta_wrt,
                                                      m_proj_delta_wrt)

            # If we pushed (moved the buffer top onto the stack), the
            # gradient from above is a combination of the accumulated stack
            # gradient (main_grad) and any buffer top deltas from the push
            # function (e.g. tracking LSTM gradient).
            embedding_grad = main_grad
            if p_delta_inp[2] is not None:
                embedding_grad += p_delta_inp[2]

            _, p_proj_delta_wrt = f_proj_delta(proj_inputs,
                                               (embedding_grad,))
            p_delta_wrt = util.merge_update_lists(p_delta_wrt,
                                                  p_proj_delta_wrt)

        # Accumulate inp deltas, switching over push/merge decision.
        stacks = (stack_bwd_next, stack_bwd_next,
                  (compute_embedding_gradients and dE) or None)
        cursors = (t_c1, t_c2,
                   (compute_embedding_gradients and buffer_ids_t) or None)
        # Handle potential aux bwd stacks.
        stacks += extra_bwd
        cursors += ((t_c1,)) * len(extra_bwd)

        new_stacks = {}
        for stack, cursor, m_delta, p_delta in zip(stacks, cursors,
                                                   m_delta_inp, p_delta_inp):
            if stack is None or cursor is None:
                continue
            elif m_delta is None and p_delta is None:
                # Disconnected gradient.
                continue

            base = new_stacks.get(stack, stack)
            mask_i = masks[(m_delta or p_delta).ndim - 1]
            if m_delta is None:
                delta = (1. - mask_i) * p_delta
            elif p_delta is None:
                delta = mask_i * m_delta
            else:
                delta = mask_i * m_delta + (1. - mask_i) * p_delta

            # Run subtensor update on associated structure using the
            # current cursor.
            new_stack = cuda_util.AdvancedIncSubtensor1Floats(inplace=True)(
                base, delta, cursor)
            new_stacks[stack] = new_stack

        # Accumulate wrt deltas, switching over push/merge decision.
        new_wrt_deltas = {}
        wrt_data = enumerate(zip(wrt, zero_jac_wrts, wrt_deltas,
                                 m_delta_wrt, p_delta_wrt))
        for i, (wrt_var, wrt_zero, accum_delta, m_delta, p_delta) in wrt_data:
            if m_delta is None and p_delta is None:
                # Disconnected gradient.
                continue

            # Check that tensors returned by delta functions match shape
            # expectations.
            assert m_delta is None or accum_delta.ndim == m_delta.ndim - 1, \
                "%s %i %i" % (wrt_var.name, accum_delta.ndim, m_delta.ndim)
            assert p_delta is None or accum_delta.ndim == p_delta.ndim - 1, \
                "%s %i %i" % (wrt_var.name, accum_delta.ndim, p_delta.ndim)

            mask_i = masks[(m_delta or p_delta).ndim - 1]
            if m_delta is None:
                delta = T.switch(mask_i, wrt_zero, p_delta)
            elif p_delta is None:
                delta = T.switch(mask_i, m_delta, wrt_zero)
            else:
                delta = T.switch(mask_i, m_delta, p_delta)
            # TODO: Is this at all efficient? (Bring back GPURowSwitch?)
            delta = delta.sum(axis=0)
            # TODO: we want this to be inplace
            new_wrt_deltas[accum_delta] = accum_delta + delta

        # On push ops, backprop the stack_bwd error onto the embedding
        # projection network / embedding parameters.
        # TODO make sparse?
        if compute_embedding_gradients:
            new_stacks[dE] = cuda_util.AdvancedIncSubtensor1Floats(inplace=True)(
                new_stacks.get(dE, dE), (1. - masks[1]) * main_grad,
                buffer_ids_t)

        updates = dict(new_wrt_deltas.items() + new_stacks.items())
        updates = util.prepare_updates_dict(updates)

        return updates

    # TODO: These should come from forward pass -- not fixed -- in model
    # 1S, etc.
    transitions_f = T.cast(self.transitions.dimshuffle(1, 0),
                           dtype=theano.config.floatX)
    ts_f = T.cast(T.arange(1, self.seq_length + 1),
                  dtype=theano.config.floatX)

    # Representation of buffer using embedding indices rather than values
    id_buffer = T.cast(self.X.flatten(), theano.config.floatX)

    # Build sequence of buffer pointers, where buf_ptrs[i] indicates the
    # buffer pointer values *before* computation at timestep *i* proceeds.
    # (This means we need to slice off the last actual buf_ptr output and
    # prepend a dummy.)
    buf_ptrs = T.concatenate([T.zeros((1, batch_size,)),
                              self.buf_ptrs[:-1]], axis=0)

    sequences = [ts_f, transitions_f, self.stack_2_ptrs, buf_ptrs]
    outputs_info = []
    # Shared variables: Accumulated wrt deltas and bwd stacks.
    non_sequences = [dE] + wrt_vars
    non_sequences += [self.stack_bwd] + self.aux_bwd_stacks
    # More auxiliary data
    non_sequences += [id_buffer, self.final_stack]

    # More helpers (not referenced directly in code, but we need to include
    # them as non-sequences to satisfy scan strict mode)
    aux_data = [self.stack, self.buffer_t] + self.aux_stacks + self.final_aux_stacks
    aux_data += [self.X, self.transitions, self._raw_buffer_t]
    if self.use_input_dropout:
        aux_data.append(self._embedding_dropout_masks)
    aux_data += self._vs.vars.values() + extra_cost_inputs
    if self.premise_stack_tops:
        aux_data.append(self.premise_stack_tops)
    non_sequences += list(set(aux_data))

    bscan_ret, self.bscan_updates = theano.scan(
        step_b, sequences, outputs_info, non_sequences,
        go_backwards=True,
        n_steps=self.seq_length,
        # strict=True,
        name=self._prefix + "stack_bwd")

    self.gradients = {wrt_i: self.bscan_updates.get(wrt_var)
                      for wrt_i, wrt_var in zip(wrt, wrt_vars)}
    if compute_embedding_gradients:
        self.embedding_gradients = self.bscan_updates[dE]
def get_output_for(self, input, **kwargs): x, y = input if y.ndim == 1: y = T.extra_ops.to_one_hot(y, self.num_cls) assert y.ndim == 2 return T.concatenate([x, y], axis=1)
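# A small usage sketch of the layer above (num_cls assumed to be 10): integer
# labels are expanded to one-hot rows and appended to the feature matrix.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
out = T.concatenate([x, T.extra_ops.to_one_hot(y, 10)], axis=1)
concat_onehot = theano.function([x, y], out)
# concat_onehot(np.ones((2, 3), dtype=theano.config.floatX),
#               np.array([1, 4], dtype='int32')).shape -> (2, 13)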
def c_6layer_mnist_imputation(seed=0, ctype='cva', pertub_type=3, pertub_prob=6, pertub_prob1=14, visualization_times=20, denoise_times=200, predir=None, n_batch=144, dataset='mnist.pkl.gz', batch_size=500): """ Missing data imputation """ #cp->cd->cpd->cd->c nkerns = [32, 32, 64, 64, 64] drops = [0, 0, 0, 0, 0, 1] #skerns=[5, 3, 3, 3, 3] #pools=[2, 1, 1, 2, 1] #modes=['same']*5 n_hidden = [500, 50] drop_inverses = [ 1, ] # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28 if dataset == 'mnist.pkl.gz': dim_input = (28, 28) colorImg = False logdir = 'results/imputation/' + ctype + '/mnist/' + ctype + '_6layer_mnist_' + str( pertub_type) + '_' + str(pertub_prob) + '_' + str( pertub_prob1) + '_' + str(denoise_times) + '_' logdir += str(int(time.time())) + '/' if not os.path.exists(logdir): os.makedirs(logdir) print predir with open(logdir + 'hook.txt', 'a') as f: print >> f, predir train_set_x, test_set_x, test_set_x_pertub, pertub_label, pertub_number = datapy.load_pertub_data( dirs='data_imputation/', pertub_type=pertub_type, pertub_prob=pertub_prob, pertub_prob1=pertub_prob1) datasets = datapy.load_data_gpu(dataset, have_matrix=True) _, _, _ = datasets[0] valid_set_x, _, _ = datasets[1] _, _, _ = datasets[2] # compute number of minibatches for training, validation and testing n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') x_pertub = T.matrix( 'x_pertub') # the data is presented as rasterized images p_label = T.matrix('p_label') random_z = T.matrix('random_z') drop = T.iscalar('drop') drop_inverse = T.iscalar('drop_inverse') activation = nonlinearity.relu rng = np.random.RandomState(seed) rng_share = theano.tensor.shared_randomstreams.RandomStreams(0) input_x = x_pertub.reshape((batch_size, 1, 28, 28)) recg_layer = [] cnn_output = [] #1 recg_layer.append( ConvMaxPool.ConvMaxPool(rng, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2), border_mode='valid', activation=activation)) if drops[0] == 1: cnn_output.append(recg_layer[-1].drop_output(input=input_x, drop=drop, rng=rng_share)) else: cnn_output.append(recg_layer[-1].output(input=input_x)) #2 recg_layer.append( ConvMaxPool.ConvMaxPool(rng, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) if drops[1] == 1: cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share)) else: cnn_output.append(recg_layer[-1].output(cnn_output[-1])) #3 recg_layer.append( ConvMaxPool.ConvMaxPool(rng, image_shape=(batch_size, nkerns[1], 12, 12), filter_shape=(nkerns[2], nkerns[1], 3, 3), poolsize=(2, 2), border_mode='valid', activation=activation)) if drops[2] == 1: cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share)) else: cnn_output.append(recg_layer[-1].output(cnn_output[-1])) #4 recg_layer.append( ConvMaxPool.ConvMaxPool(rng, image_shape=(batch_size, nkerns[2], 5, 5), filter_shape=(nkerns[3], nkerns[2], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) if drops[3] == 1: cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share)) 
else: cnn_output.append(recg_layer[-1].output(cnn_output[-1])) #5 recg_layer.append( ConvMaxPool.ConvMaxPool(rng, image_shape=(batch_size, nkerns[3], 5, 5), filter_shape=(nkerns[4], nkerns[3], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) if drops[4] == 1: cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share)) else: cnn_output.append(recg_layer[-1].output(cnn_output[-1])) mlp_input_x = cnn_output[-1].flatten(2) activations = [] #1 recg_layer.append( FullyConnected.FullyConnected(rng=rng, n_in=5 * 5 * nkerns[-1], n_out=n_hidden[0], activation=activation)) if drops[-1] == 1: activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share)) else: activations.append(recg_layer[-1].output(input=mlp_input_x)) #stochastic layer recg_layer.append( GaussianHidden.GaussianHidden(rng=rng, input=activations[-1], n_in=n_hidden[0], n_out=n_hidden[1], activation=None)) z = recg_layer[-1].sample_z(rng_share) gene_layer = [] z_output = [] random_z_output = [] #1 gene_layer.append( FullyConnected.FullyConnected(rng=rng, n_in=n_hidden[1], n_out=n_hidden[0], activation=activation)) z_output.append(gene_layer[-1].output(input=z)) random_z_output.append(gene_layer[-1].output(input=random_z)) #2 gene_layer.append( FullyConnected.FullyConnected(rng=rng, n_in=n_hidden[0], n_out=5 * 5 * nkerns[-1], activation=activation)) if drop_inverses[0] == 1: z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share)) random_z_output.append(gene_layer[-1].drop_output( input=random_z_output[-1], drop=drop_inverse, rng=rng_share)) else: z_output.append(gene_layer[-1].output(input=z_output[-1])) random_z_output.append( gene_layer[-1].output(input=random_z_output[-1])) input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5)) input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5)) #1 gene_layer.append( UnpoolConvNon.UnpoolConvNon(rng, image_shape=(batch_size, nkerns[-1], 5, 5), filter_shape=(nkerns[-2], nkerns[-1], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) z_output.append(gene_layer[-1].output(input=input_z)) random_z_output.append(gene_layer[-1].output_random_generation( input=input_random_z, n_batch=n_batch)) #2 gene_layer.append( UnpoolConvNon.UnpoolConvNon(rng, image_shape=(batch_size, nkerns[-2], 5, 5), filter_shape=(nkerns[-3], nkerns[-2], 3, 3), poolsize=(2, 2), border_mode='full', activation=activation)) z_output.append(gene_layer[-1].output(input=z_output[-1])) random_z_output.append(gene_layer[-1].output_random_generation( input=random_z_output[-1], n_batch=n_batch)) #3 gene_layer.append( UnpoolConvNon.UnpoolConvNon(rng, image_shape=(batch_size, nkerns[-3], 12, 12), filter_shape=(nkerns[-4], nkerns[-3], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) z_output.append(gene_layer[-1].output(input=z_output[-1])) random_z_output.append(gene_layer[-1].output_random_generation( input=random_z_output[-1], n_batch=n_batch)) #4 gene_layer.append( UnpoolConvNon.UnpoolConvNon(rng, image_shape=(batch_size, nkerns[-4], 12, 12), filter_shape=(nkerns[-5], nkerns[-4], 3, 3), poolsize=(1, 1), border_mode='same', activation=activation)) z_output.append(gene_layer[-1].output(input=z_output[-1])) random_z_output.append(gene_layer[-1].output_random_generation( input=random_z_output[-1], n_batch=n_batch)) #5 stochastic layer # for the last layer, the nonlinearity should be sigmoid to produce the mean of a Bernoulli gene_layer.append( UnpoolConvNon.UnpoolConvNon(rng,
image_shape=(batch_size, nkerns[-5], 12, 12), filter_shape=(1, nkerns[-5], 5, 5), poolsize=(2, 2), border_mode='full', activation=nonlinearity.sigmoid)) z_output.append(gene_layer[-1].output(input=z_output[-1])) random_z_output.append(gene_layer[-1].output_random_generation( input=random_z_output[-1], n_batch=n_batch)) gene_layer.append( NoParamsBernoulliVisiable.NoParamsBernoulliVisiable( #rng=rng, #mean=z_output[-1], #data=input_x, )) logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x) # 4-D tensor of random generation random_x_mean = random_z_output[-1] random_x = gene_layer[-1].sample_x(rng_share, random_x_mean) x_denoised = z_output[-1].flatten(2) x_denoised = p_label * x + (1 - p_label) * x_denoised mse = ((x - x_denoised)**2).sum() / pertub_number params = [] for g in gene_layer: params += g.params for r in recg_layer: params += r.params train_activations = theano.function( inputs=[index], outputs=T.concatenate(activations, axis=1), givens={ x_pertub: train_set_x[index * batch_size:(index + 1) * batch_size], drop: np.cast['int32'](0) }) valid_activations = theano.function( inputs=[index], outputs=T.concatenate(activations, axis=1), givens={ x_pertub: valid_set_x[index * batch_size:(index + 1) * batch_size], drop: np.cast['int32'](0) }) test_activations = theano.function(inputs=[x_pertub], outputs=T.concatenate(activations, axis=1), givens={drop: np.cast['int32'](0)}) imputation_model = theano.function( inputs=[index, x_pertub], outputs=[x_denoised, mse], givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], p_label: pertub_label[index * batch_size:(index + 1) * batch_size], drop: np.cast['int32'](0), drop_inverse: np.cast['int32'](0) }) ################## # Pretrain MODEL # ################## model_epoch = 600 if os.environ.has_key('model_epoch'): model_epoch = int(os.environ['model_epoch']) if predir is not None: color.printBlue('... setting parameters') color.printBlue(predir) if model_epoch == -1: pre_train = np.load(predir + 'best-model.npz') else: pre_train = np.load(predir + 'model-' + str(model_epoch) + '.npz') pre_train = pre_train['model'] if ctype == 'cva': for (para, pre) in zip(params, pre_train): para.set_value(pre) elif ctype == 'cmmva': for (para, pre) in zip(params, pre_train[:-2]): para.set_value(pre) else: exit() else: exit() ############### # TRAIN MODEL # ############### print '... 
training' epoch = 0 n_visualization = 100 output = np.ones((n_visualization, visualization_times + 2, 784)) output[:, 0, :] = test_set_x.get_value()[:n_visualization, :] output[:, 1, :] = test_set_x_pertub.get_value()[:n_visualization, :] image = paramgraphics.mat_to_img(output[:, 0, :].T, dim_input, colorImg=colorImg) image.save(logdir + 'data.png', 'PNG') image = paramgraphics.mat_to_img(output[:, 1, :].T, dim_input, colorImg=colorImg) image.save(logdir + 'data_pertub.png', 'PNG') tmp = test_set_x_pertub.get_value() while epoch < denoise_times: epoch = epoch + 1 this_mse = 0 for i in xrange(n_test_batches): d, m = imputation_model(i, tmp[i * batch_size:(i + 1) * batch_size]) tmp[i * batch_size:(i + 1) * batch_size] = np.asarray(d) this_mse += m if epoch <= visualization_times: output[:, epoch + 1, :] = tmp[:n_visualization, :] print epoch, this_mse with open(logdir + 'hook.txt', 'a') as f: print >> f, epoch, this_mse image = paramgraphics.mat_to_img(tmp[:n_visualization, :].T, dim_input, colorImg=colorImg) image.save(logdir + 'procedure-' + str(epoch) + '.png', 'PNG') np.savez(logdir + 'procedure-' + str(epoch), tmp=tmp) image = paramgraphics.mat_to_img((output.reshape(-1, 784)).T, dim_input, colorImg=colorImg, tile_shape=(n_visualization, 22)) image.save(logdir + 'output.png', 'PNG') np.savez(logdir + 'output', output=output) # save original train features and denoise test features for i in xrange(n_train_batches): if i == 0: train_features = np.asarray(train_activations(i)) else: train_features = np.vstack( (train_features, np.asarray(train_activations(i)))) for i in xrange(n_valid_batches): if i == 0: valid_features = np.asarray(valid_activations(i)) else: valid_features = np.vstack( (valid_features, np.asarray(valid_activations(i)))) for i in xrange(n_test_batches): if i == 0: test_features = np.asarray( test_activations(tmp[i * batch_size:(i + 1) * batch_size])) else: test_features = np.vstack( (test_features, np.asarray( test_activations(tmp[i * batch_size:(i + 1) * batch_size])))) np.save(logdir + 'train_features', train_features) np.save(logdir + 'valid_features', valid_features) np.save(logdir + 'test_features', test_features)
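# A minimal numpy sketch of the imputation loop above: observed pixels
# (p_label == 1) stay clamped to the data, while perturbed pixels are
# repeatedly overwritten by the model's reconstruction, as in x_denoised.
import numpy as np

def impute_step(x, x_recon, p_label):
    return p_label * x + (1. - p_label) * x_recon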
def exprs(inpt_mean, inpt_var, in_to_hidden, hidden_to_hiddens, hidden_to_out, hidden_biases, hidden_var_scales_sqrt, initial_hiddens, recurrents, out_bias, out_var_scale_sqrt, hidden_transfers, out_transfer, in_to_out=None, skip_to_outs=None, p_dropouts=None, hotk_inpt=False): """Return a dictionary containing Theano expressions for various components of a recurrent network with variance propagation. Parameters ---------- inpt_mean : Theano variable Represents the mean of the input sequences as a sequence tensor. inpt_var : Theano variable Represents the variance of the input sequences as a sequence tensor. It can be a scalar as well. (E.g. 1e-8 if no variance is desired at this point.) in_to_hidden : Theano variable Matrix representing the map from input to the first hidden layer. hidden_to_hiddens : list of Theano variables List of matrices representing the maps between the hiddens. hidden_to_out : Theano variable Matrix representing the map from the last hidden layer to the output layer. hidden_biases : list of Theano variables Biases for the hidden layers. hidden_var_scales_sqrt : Theano variable Biases for the variances. See ``forward_layer`` for an exact description of what it does. initial_hiddens : list of Theano variables List of vectors representing the initial hidden states. recurrents : list of Theano variables List of matrices representing the recurrent weight matrices. out_bias : Theano variable Bias vector of the output layer. hidden_transfers : list of functions or strings List of transfer functions for the layers. Each element is either a function that given a mean and a variance sequence tensor produces equally shaped mean and variance tensors or a string pointing to a function in ``breze.arch.component.varprop.transfer``. out_transfer : function or string Function or string of the form described for ``hidden_transfers``. p_dropouts : list of scalars Each element in this list represents the probability to drop out an individual unit in the corresponding layer. The list should contain N+1 items, where N is the number of hidden layers. If N+2 items are contained, the last element is used to drop out units from hidden to out, while the one before is used to drop out units from hidden to hidden. Returns ------- exprs : dictionary Map of strings to Theano expressions.
Keys are: - ``hidden_in_mean_*``: pre-synaptic mean of layer, - ``hidden_in_var_*``: pre-synaptic variance of layer, - ``hidden_mean_*``: post-synaptic mean of layer, - ``hidden_var_*``: post-synaptic variance of layer, - ``inpt_mean``: mean of the input, - ``inpt_var``: variance of the input, - ``output_in_mean``: pre-synaptic mean of output, - ``output_in_var``: pre-synaptic variance of output, - ``output_mean``: post-synaptic mean of output, - ``output_var``: post-synaptic variance of output, - ``output``: concatenation of mean and variance of output """ # TODO add skip to outs docs # TODO: add pooling # TODO: add leaky integration exprs = {} f_hiddens = [lookup(i, transfer) for i in hidden_transfers] f_output = lookup(out_transfer, transfer) if inpt_var.ndim != 3: # Scalar inpt_var = T.ones_like(inpt_mean) * inpt_var if hotk_inpt: hmi, hvi, hmo, hvo = int_forward_layer(inpt_mean, inpt_var, in_to_hidden, hidden_biases[0], hidden_var_scales_sqrt[0], f_hiddens[0], p_dropouts[0]) else: hmi, hvi, hmo, hvo = forward_layer(inpt_mean, inpt_var, in_to_hidden, hidden_biases[0], hidden_var_scales_sqrt[0], f_hiddens[0], p_dropouts[0]) hmi_rec, hvi_rec, hmo_rec, hvo_rec = recurrent_layer( hmi, hvi, recurrents[0], f_hiddens[0], initial_hiddens[0], p_dropouts[1]) exprs.update({ 'hidden_in_mean_0': hmi_rec, 'hidden_in_var_0': hvi_rec, 'hidden_mean_0': hmo_rec, 'hidden_var_0': hvo_rec }) zipped = zip(hidden_to_hiddens, hidden_biases[1:], hidden_var_scales_sqrt[1:], recurrents[1:], f_hiddens[1:], initial_hiddens[1:], p_dropouts[1:]) for i, (w, b, vb, r, t, j, d) in enumerate(zipped): hmo_rec_m1, hvo_rec_m1 = hmo_rec, hvo_rec hmi, hvi, hmo, hvo = forward_layer(hmo_rec_m1, hvo_rec_m1, w, b, vb, t, d) hmi_rec, hvi_rec, hmo_rec, hvo_rec = recurrent_layer( hmi, hvi, r, t, j, d) exprs.update({ 'hidden_in_mean_%i' % (i + 1): hmi, 'hidden_in_var_%i' % (i + 1): hvi, 'hidden_mean_%i' % (i + 1): hmo, 'hidden_var_%i' % (i + 1): hvo }) output_in_mean, output_in_var, _, _ = forward_layer( hmo_rec, hvo_rec, hidden_to_out, out_bias, hidden_var_scales_sqrt[-1], lambda x, y: (x, y), p_dropouts[-1]) if in_to_out is not None: output_mean_inc, output_var_inc, _, _ = forward_layer( inpt_mean, inpt_var, in_to_out, T.zeros_like(out_bias), T.ones_like(out_bias), lambda x, y: (x, y), p_dropouts[0]) output_in_mean += output_mean_inc output_in_var += output_var_inc if skip_to_outs is not None: for i, s in enumerate(skip_to_outs): output_mean_inc, output_var_inc, _, _ = forward_layer( exprs['hidden_mean_%i' % i], exprs['hidden_var_%i' % i], s, T.zeros_like(out_bias), T.ones_like(out_bias), lambda x, y: (x, y), p_dropouts[i + 1]) output_in_mean += output_mean_inc output_in_var += output_var_inc output_mean, output_var = f_output(output_in_mean, output_in_var) # TODO: raise not implemented for out scale exprs.update({ 'inpt_mean': inpt_mean, 'inpt_var': inpt_var, 'output_in_mean': output_in_mean, 'output_in_var': output_in_var, 'output_mean': output_mean, 'output_var': output_var, 'output': T.concatenate([output_mean, output_var], axis=2), }) return exprs
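# A hedged sketch of the variance propagation that forward_layer builds on:
# under an independence assumption across units, a linear map transforms the
# mean by W and the variance by the elementwise square of W.
import theano.tensor as T

def linear_varprop(mean, var, W, b):
    return T.dot(mean, W) + b, T.dot(var, W ** 2)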
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"): tparams_d = tparams_all[0] tparams_g = tparams_all[1] #rng = numpy.random.RandomState(4567) trng = RandomStreams(SEED) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(x_, m_, h_, c_): preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \ tensor.dot(h_, tparams_g[_p(prefix, 'U')]) i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')])) f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')])) o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')])) c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')])) c = f * c_ + i * c h = o * tensor.tanh(c) s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb'])) #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb']) x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), tparams_d['Wemb']) x_out = s.argmax(axis=1) m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_ #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')] return x_out, x_t, m, h, c ############################################################################################## rval, updates = theano.scan(_step, outputs_info=[None, input_state, tensor.alloc(numpy_floatX(1.), input_state.shape[0]), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])], name=_p(prefix, '_layers'), n_steps=maxlen) #proj_0 = rval[1]#tensor.tanh(rval[0]) m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX) #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2 if(tensor.gt(maxlen, 4) == 1): x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0]) x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :], x2), axis=0) m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0]) m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], m2), axis=0) xt2 = tparams_d['Wemb'][x2] return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22) else: return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
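# A small sketch of the gate layout that _slice reads above: the fused
# pre-activation stores the input, forget and output gates plus the cell
# candidate side by side, each block of width dim.
import theano.tensor as tensor

def split_lstm_gates(preact, dim):
    i = tensor.nnet.sigmoid(preact[:, 0 * dim:1 * dim])
    f = tensor.nnet.sigmoid(preact[:, 1 * dim:2 * dim])
    o = tensor.nnet.sigmoid(preact[:, 2 * dim:3 * dim])
    c = tensor.tanh(preact[:, 3 * dim:4 * dim])
    return i, f, o, c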
def __init__(self, We, params): lstm_layers_num = 1 en_hidden_size = We.shape[1] self.eta = params.eta self.num_labels = params.num_labels self.en_hidden_size = en_hidden_size self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = params.lstm_layers_num self._train = None self._utter = None self.params = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] self.hos = [] self.Cos = [] encoderInputs = tensor.imatrix() decoderInputs, decoderTarget = tensor.imatrices(2) encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4) self.lookuptable = theano.shared(We) #### the last one is for the start symbol self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared( name="Linear", value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size, self.num_labels), borrow=True) self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform( 2 * en_hidden_size, self.de_hidden_size), borrow=True) self.hidden_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0., dtype=theano.config.floatX), borrow=True) #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias] #concatenate self.params += [self.linear, self.de_lookuptable ] #the initial hidden state of decoder lstm is zeros #(max_sent_size, batch_size, hidden_size) state_below = self.lookuptable[encoderInputs.flatten()].reshape( (encoderInputs.shape[0], encoderInputs.shape[1], self.en_hidden_size)) for _ in range(self.lstm_layers_num): enclstm_f = LSTM(self.en_hidden_size) enclstm_b = LSTM(self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask) hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask) hs = tensor.concatenate([hs_f, hs_b], axis=2) Cs = tensor.concatenate([Cs_f, Cs_b], axis=2) hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1) Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += tensor.alloc( np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), self.Cos += tensor.alloc( np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), state_below = hs Encoder = state_below state_below = self.de_lookuptable[decoderInputs.flatten()].reshape( (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co) ##### Here we include the representation from the encoder decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2) ei, di, dt = tensor.imatrices(3) #place holders em, dm, tf, di0 = tensor.fmatrices(4) ##################################################### ##################################################### linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) softmax_outputs, updates = theano.scan( fn=lambda x: tensor.nnet.softmax(x), sequences=[linear_outputs], ) def
_NLL(pred, y, m): return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y]) costs, _ = theano.scan( fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask]) loss = costs.sum() / decoderMask.sum() + params.L2 * sum( lasagne.regularization.l2(x) for x in self.params) updates = lasagne.updates.adam(loss, self.params, self.eta) #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9) ################################################### #### using the ground truth when training ################################################## self._train = theano.function(inputs=[ei, em, di, dm, dt], outputs=[loss, softmax_outputs], updates=updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt }) ######################################################################### ### For scheduled sampling ######################################################################### ###### always use the previous prediction as the next input def _step2(ctx_, state_, hs_, Cs_): hs, Cs = [], [] token_idxs = tensor.cast(state_.argmax(axis=-1), "int32") msk_ = tensor.fill( (tensor.zeros_like(token_idxs, dtype="float32")), 1) msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, encoderInputs.shape[1], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable( Cs) state_below0 = state_below0.reshape( (encoderInputs.shape[1], self.de_hidden_size)) state_below0 = tensor.concatenate([ctx_, state_below0], axis=1) newpred = tensor.dot(state_below0, self.linear) state_below = tensor.nnet.softmax(newpred) return state_below, hs, Cs hs0, Cs0 = tensor.as_tensor_variable( self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=encoderInputs.shape[0]) train_predict = train_outputs[0] train_costs, _ = theano.scan( fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask]) train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum( lasagne.regularization.l2(x) for x in self.params) train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta) train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) self._train2 = theano.function( inputs=[ei, em, di0, dm, dt], outputs=[train_loss, train_predict], updates=train_updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0, decoderMask: dm, decoderTarget: dt } #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf} ) listof_token_idx = train_predict.argmax(axis=-1) self._utter = theano.function(inputs=[ei, em, di0], outputs=listof_token_idx, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0 })
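# A minimal numpy sketch of the feedback used by _step2 above: the previous
# softmax output is collapsed to its argmax token, which then indexes the
# decoder lookup table to form the next input.
import numpy as np

def next_decoder_input(prev_softmax, de_lookuptable):
    token_idxs = prev_softmax.argmax(axis=-1)
    return de_lookuptable[token_idxs]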
def Build_Model(tparams_all, options): trng = RandomStreams(SEED) # Discriminator x0 = tensor.matrix('x0', dtype='int32') #SIP Path x1 = tensor.matrix('x1', dtype='int32') #RelSrc Path x3 = tensor.matrix('x3', dtype='int32') #Cue Path mask0 = tensor.matrix('mask0', dtype=config.floatX) mask1 = tensor.matrix('mask1', dtype=config.floatX) mask3 = tensor.matrix('mask3', dtype=config.floatX) x0_d_y_fake = tensor.vector('x0_d_y_fake', dtype='int32') x1_d_y_fake = tensor.vector('x1_d_y_fake', dtype='int32') x3_d_y_fake = tensor.vector('x3_d_y_fake', dtype='int32') y0 = tensor.vector('y0', dtype='int32') y1 = tensor.vector('y1', dtype='int32') # Generator x_noise_0 = tensor.matrix('x_noise_0', dtype=config.floatX) x_noise_1 = tensor.matrix('x_noise_1', dtype=config.floatX) x_noise_3 = tensor.matrix('x_noise_3', dtype=config.floatX) #x0_g_y_fake = tensor.vector('x0_g_fake', dtype='int32') #x1_g_y_fake = tensor.vector('x1_g_fake', dtype='int32') #x3_g_y_fake = tensor.vector('x3_g_fake', dtype='int32') maxlen_0 = tensor.scalar(name='maxlen_0', dtype='int32') maxlen_1 = tensor.scalar(name='maxlen_1', dtype='int32') maxlen_3 = tensor.scalar(name='maxlen_3', dtype='int32') ################### dropout_ratio = tensor.scalar(name='dropout_ratio') dropout_decay_ratio = tensor.scalar(name='dropout_decay_ratio') tparams_d = tparams_all[0] tparams_g = tparams_all[1] ##################################### # Discriminator p_0 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x0], mask=mask0, options=options) p_1 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x1], mask=mask1, options=options) p_3 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x3[2:,:]], mask=mask3, options=options) proj_0 = tensor.concatenate((p_0, p_1), axis=1) proj_1 = tensor.concatenate((tparams_d['CueTemb'][x3[0, :]], tparams_d['Lemb'][x3[1, :]], p_3), axis=1) proj_0 = proj_0 * dropout_mask_1D(proj_0, 1, dropout_ratio, trng) * dropout_decay_ratio proj_1 = proj_1 * dropout_mask_1D(proj_1, 1, dropout_ratio, trng) * dropout_decay_ratio pred_0 = tensor.nnet.softmax(tensor.dot(proj_0, tparams_d['Ws0']) + tparams_d['bs0']) pred_1 = tensor.nnet.softmax(tensor.dot(proj_1, tparams_d['Ws1']) + tparams_d['bs1']) x0_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_0, tparams_d['Ws_fake'])) x1_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_1, tparams_d['Ws_fake'])) x3_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_3, tparams_d['Ws_fake'])) f_D_pred_prob = theano.function(inputs=[x0, x1, x3, mask0, mask1, mask3, dropout_ratio, dropout_decay_ratio], outputs=[pred_0.max(axis=1), pred_1.max(axis=1)], name='f_D_pred_prob') f_D_pred = theano.function(inputs=[x0, x1, x3, mask0, mask1, mask3, dropout_ratio, dropout_decay_ratio], outputs=[pred_0.argmax(axis=1), pred_1.argmax(axis=1)], name='f_D_pred') off = 1e-8 d_cost = - 1./3.*tensor.mean(tensor.log(x0_d_fake_pred[tensor.arange(x0_d_y_fake.shape[0]), x0_d_y_fake] + off)) + \ - 1./3.*tensor.mean(tensor.log(x1_d_fake_pred[tensor.arange(x1_d_y_fake.shape[0]), x1_d_y_fake] + off)) + \ - 1./3.*tensor.mean(tensor.log(x3_d_fake_pred[tensor.arange(x3_d_y_fake.shape[0]), x3_d_y_fake] + off)) + \ - 1./2.*tensor.mean(tensor.log(pred_0[tensor.arange(y0.shape[0]), y0] + off)) + \ - 1./2.*tensor.mean(tensor.log(pred_1[tensor.arange(y1.shape[0]), y1] + off)) ############################################################## # Generator xn_0 = x_noise_0 * tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1] xn_1 = x_noise_1 * tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1] xn_3 = x_noise_3 * 
tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1] x0_g, x0_g_emb, x0_g_mask = lstm_decoder_layer(tparams_all, xn_0, options, maxlen_0, 0.9) x1_g, x1_g_emb, x1_g_mask = lstm_decoder_layer(tparams_all, xn_1, options, maxlen_1, 0.9) x3_g, x3_g_emb, x3_g_mask = lstm_decoder_layer(tparams_all, xn_3, options, maxlen_3, 0.7) p_g0 = lstm_layer(tparams_d, input_state=x0_g_emb, mask=x0_g_mask, options=options) p_g1 = lstm_layer(tparams_d, input_state=x1_g_emb, mask=x1_g_mask, options=options) p_g3 = lstm_layer(tparams_d, input_state=x3_g_emb, mask=x3_g_mask, options=options) f_G_produce = theano.function(inputs=[x_noise_0, x_noise_1, x_noise_3, maxlen_0, maxlen_1, maxlen_3, y0, y1], outputs=[x0_g.astype('int32'), x1_g.astype('int32'), x3_g.astype('int32'), x0_g_mask, x1_g_mask, x3_g_mask], name='f_G_produce') g_cost = (((p_0 - p_g0)**2).sum(axis=1).mean() + ((p_1 - p_g1)**2).sum(axis=1).mean() + ((p_3 - p_g3)**2).sum(axis=1).mean()) / 3. return [x0,x1,x3],[mask0, mask1, mask3],[x0_d_y_fake, x1_d_y_fake, x3_d_y_fake], [y0, y1], \ [x_noise_0, x_noise_1, x_noise_3], \ [maxlen_0, maxlen_1, maxlen_3], \ f_D_pred_prob, f_D_pred, f_G_produce, \ [dropout_ratio, dropout_decay_ratio], \ d_cost, g_cost
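# A hedged sketch of the generator objective above: a feature-matching cost
# that pulls the discriminator's representation of generated paths toward
# its representation of the real paths.
import theano.tensor as tensor

def feature_matching_cost(p_real, p_fake):
    return ((p_real - p_fake) ** 2).sum(axis=1).mean()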
def build_and_train_rbf(self, X, Y): y_onehot = self.class_to_onehot(Y) n_dims = y_onehot.shape[1] centers = self.compute_centers(X) x = T.dmatrix() y = T.imatrix() #bias, centers, sigmas, weights template = [ n_dims, centers.shape, self.l1_size, (self.l1_size, n_dims) ] #initialize and train RBF network model = theano_rbfnet(input=x, n_cents=self.l1_size, centers=centers, n_dims=n_dims, reg=self.penalty) cost = model.neg_log_likelihood(y) g_b = T.grad(cost, model.b) g_c = T.grad(cost, model.c) g_s = T.grad(cost, model.s) g_w = T.grad(cost, model.w) g_params = T.concatenate( [g_b.flatten(), g_c.flatten(), g_s.flatten(), g_w.flatten()]) getcost = theano.function([x, y], outputs=cost) getdcost = theano.function([x, y], outputs=g_params) def cost_fcn(params, inputs, targets): model.set_params(params, template) x = inputs y = targets return getcost(x, y) def cost_grad(params, inputs, targets): model.set_params(params, template) x = inputs y = targets return getdcost(x, y) args = climin.util.iter_minibatches([X, y_onehot], self.batch_size, [0, 0]) batch_args = itertools.repeat(([X, y_onehot], {})) args = ((i, {}) for i in args) init_params = model.get_params(template) opt_sgd = climin.GradientDescent(init_params, cost_fcn, cost_grad, steprate=0.1, momentum=0.99, args=args, momentum_type="nesterov") opt_ncg = climin.NonlinearConjugateGradient(init_params, cost_fcn, cost_grad, args=batch_args) opt_lbfgs = climin.Lbfgs(init_params, cost_fcn, cost_grad, args=batch_args) #choose the optimizer if self.optimizer == 'sgd': optimizer = opt_sgd elif self.optimizer == 'ncg': optimizer = opt_ncg else: optimizer = opt_lbfgs #do the actual training. costs = [] for itr_info in optimizer: if itr_info['n_iter'] > self.max_iters: break costs.append(itr_info['loss']) model.set_params(init_params, template) return model, costs
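# A minimal sketch of the gradient-flattening pattern above, which climin
# requires: every parameter gradient is raveled and concatenated into one
# vector in a fixed template order.
import theano.tensor as T

def pack_grads(cost, params):
    return T.concatenate([T.grad(cost, p).flatten() for p in params])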
def __init__(self, We_initial, words, memsize, rel, relsize, Rel_init, LC, LW, eta, margin, usepeep, acti): self.LC = LC self.LW = LW self.margin = margin self.memsize = memsize self.usepeep = usepeep self.relsize = relsize self.words = words self.rel = rel self.a1 = np.zeros((35, relsize, relsize)) for k in range(35): for i in range(self.a1.shape[1]): for j in range(self.a1.shape[2]): if (i == j): self.a1[k][i][j] = 1 else: self.a1[k][i][j] = 0 self.Rel = theano.shared(Rel_init).astype(theano.config.floatX) self.iden = theano.shared(self.a1) self.we = theano.shared(We_initial).astype(theano.config.floatX) g1batchindices = T.imatrix() g2batchindices = T.imatrix() p1batchindices = T.imatrix() p2batchindices = T.imatrix() g1mask = T.tensor3() g2mask = T.tensor3() p1mask = T.tensor3() p2mask = T.tensor3() g1length = T.imatrix() g2length = T.imatrix() p1length = T.imatrix() p2length = T.imatrix() target = T.dmatrix() g1mask = T.patternbroadcast(g1mask, broadcastable=[False, False, True]) g2mask = T.patternbroadcast(g2mask, broadcastable=[False, False, True]) p1mask = T.patternbroadcast(p1mask, broadcastable=[False, False, True]) p2mask = T.patternbroadcast(p2mask, broadcastable=[False, False, True]) We0 = T.dmatrix() l_in = lasagne.layers.InputLayer((None, None)) l_mask = lasagne.layers.InputLayer(shape=(None, None)) l_emb = lasagne.layers.EmbeddingLayer( l_in, input_size=self.we.get_value().shape[0], output_size=self.we.get_value().shape[1], W=self.we) l_out = l_emb embg1 = lasagne.layers.get_output(l_emb, { l_in: g1batchindices, l_mask: g1mask }) embg2 = lasagne.layers.get_output(l_emb, { l_in: g2batchindices, l_mask: g2mask }) embp1 = lasagne.layers.get_output(l_emb, { l_in: p1batchindices, l_mask: p1mask }) embp2 = lasagne.layers.get_output(l_emb, { l_in: p2batchindices, l_mask: p2mask }) embg1 = embg1 * g1mask embg1_sum = T.sum(embg1, axis=1) embg1_len = T.patternbroadcast(g1length, broadcastable=[False, True]) embg1_mean = embg1_sum / embg1_len embg1_mean = embg1_mean.reshape([-1, self.memsize]) embg2 = embg2 * g2mask embg2_sum = T.sum(embg2, axis=1) embg2_len = T.patternbroadcast(g2length, broadcastable=[False, True]) embg2_mean = embg2_sum / embg2_len embg2_mean = embg2_mean.reshape([-1, self.memsize]) embp1 = embp1 * p1mask embp1_sum = T.sum(embp1, axis=1) embp1_len = T.patternbroadcast(p1length, broadcastable=[False, True]) embp1_mean = embp1_sum / embp1_len embp1_mean = embp1_mean.reshape([-1, self.memsize]) embp2 = embp2 * p2mask embp2_sum = T.sum(embp2, axis=1) embp2_len = T.patternbroadcast(p2length, broadcastable=[False, True]) embp2_mean = embp2_sum / embp2_len embp2_mean = embp2_mean.reshape([-1, self.memsize]) ############################################################# r = T.ivector() p3 = T.ivector() r0 = self.Rel[r] r1 = self.Rel[p3] self.a2 = np.random.uniform(low=-0.2, high=0.2, size=[memsize, relsize]) self.a3 = np.random.uniform(low=-0.2, high=0.2, size=[ relsize, ]) self.w = theano.shared(self.a2) self.b = theano.shared(self.a3) embg1_rel = T.tanh( T.dot(embg1_mean, self.w) + self.b.dimshuffle('x', 0)) embg2_rel = T.tanh( T.dot(embg2_mean, self.w) + self.b.dimshuffle('x', 0)) embp1_rel = T.tanh( T.dot(embp1_mean, self.w) + self.b.dimshuffle('x', 0)) embp2_rel = T.tanh( T.dot(embp2_mean, self.w) + self.b.dimshuffle('x', 0)) g1g2 = T.batched_dot(embg1_rel, r0) g1g2 = T.batched_dot(g1g2, embg2_rel) p1g1_neg = T.batched_dot(embp1_rel, r0) p1g1_neg = T.batched_dot(p1g1_neg, embg2_rel) p2g2_neg = T.batched_dot(embg1_rel, r0) p2g2_neg = T.batched_dot(p2g2_neg, embp2_rel) 
g1g2_neg = T.batched_dot(embg1_rel, r1) g1g2_neg = T.batched_dot(g1g2_neg, embg2_rel) g1g2 = T.nnet.sigmoid(g1g2).reshape([-1, 1]) p1g1_neg = T.nnet.sigmoid(p1g1_neg).reshape([-1, 1]) p2g2_neg = T.nnet.sigmoid(p2g2_neg).reshape([-1, 1]) g1g2_neg = T.nnet.sigmoid(g1g2_neg).reshape([-1, 1]) lsm = T.concatenate([g1g2, p1g1_neg, p2g2_neg, g1g2_neg], axis=0) #updates network_params = lasagne.layers.get_all_params(l_out, trainable=True) network_params.append(self.w) network_params.append(self.b) self.all_params = network_params self.all_params.append(self.Rel) self.all_params.append(self.we) #feedforward self.feedforward_function = theano.function( [g1batchindices, g1mask, g1length], embg1_rel, on_unused_input='warn') # self.softMax=theano.function([ar],outputs=softmaxOutput2) #cost_function softmaxOutput = lsm.clip(1e-7, 1.0 - 1e-7) # softmaxOutput2=lsm2.clip(1e-7,1.0 - 1e-7) loss = lasagne.objectives.binary_crossentropy(softmaxOutput, target) loss = lasagne.objectives.aggregate(loss, mode='mean') l2_penalty1 = lasagne.regularization.apply_penalty(network_params, l2) cost_new = (1000 * loss) + (self.LC * l2_penalty1) + ( self.LW * lasagne.regularization.l2(r0 - self.iden[r])) #train_function updates = lasagne.updates.adagrad(cost_new, self.all_params, learning_rate=eta) self.train_function = theano.function([ g1batchindices, g2batchindices, p1batchindices, p2batchindices, g1mask, g2mask, p1mask, p2mask, g1length, g2length, p1length, p2length, r, We0, p3, target ], [cost_new, loss], updates=updates, on_unused_input='warn')
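# A hedged sketch of the bilinear scoring used above: a pair of projected
# embeddings is scored through its per-example relation matrix with two
# batched dot products.
import theano.tensor as T

def bilinear_score(e1, R, e2):
    # e1: (batch, d), R: (batch, d, d), e2: (batch, d) -> (batch,)
    return T.nnet.sigmoid(T.batched_dot(T.batched_dot(e1, R), e2))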
def edhmm_fit(inp, nans, n_subs, last): # inp - array containing responses, outcomes, and a switch variable which turns off update in the presence of nans # nans - bool array pointing towards locations of nan responses and outcomes # n_subs - int value, total number of subjects (each subject is fitted to a different parameter value) # last - int value, negative value denoting number of last trials to exclude from parameter estimation # e.g. setting last = -35 excludes the last 35 trials from parameter estimation. #define the hierarchical parametric model for ED-HMM d_max = 200 #maximal value for state duration with Model() as edhmm: d = tt.arange( d_max) #vector of possible duration values from zero to d_max - 1 d = tt.tile(d, (n_subs, 1)) P = tt.ones((2, 2)) - tt.eye(2) #permutation matrix #set prior state probability theta0 = tt.ones(n_subs) / 2 #set hierarchical prior for delta parameter of prior beliefs p_0(d) dtau = HalfCauchy('dtau', beta=1) dloc = HalfCauchy('dloc', beta=dtau, shape=(n_subs, )) delta = Deterministic('delta', dloc / (1 + dloc)) #set hierarchical prior for r parameter of prior beliefs p_0(d) rtau = HalfCauchy('rtau', beta=1) rloc = HalfCauchy('rloc', beta=rtau, shape=(n_subs, )) r = Deterministic('r', 1 + rloc) #compute prior beliefs over state durations for the given r and delta binomln = tt.gammaln(d + r[:, None]) - tt.gammaln(d + 1) - tt.gammaln( r[:, None]) pd0 = tt.nnet.softmax(binomln + d * log(1 - delta[:, None]) + r[:, None] * log(delta[:, None])) #set joint probability distribution joint0 = tt.stack([theta0[:, None] * pd0, (1 - theta0)[:, None] * pd0]).dimshuffle(1, 0, 2) #set hierarchical priors for response noises btau = HalfCauchy('btau', beta=1) bloc = HalfCauchy('bloc', beta=btau, shape=(n_subs, )) beta = Deterministic('beta', 1 / bloc) #set hierarchical priors for initial beliefs about reward probability mtau = HalfCauchy('mtau', beta=4) mloc = HalfCauchy('mloc', beta=mtau, shape=(n_subs, 2)) muA = Deterministic('muA', mloc[:, 0] / (1 + mloc[:, 0])) muB = Deterministic('muB', 1 / (1 + mloc[:, 1])) init = tt.stacklists([[10*muA, 10*(1-muA)], \ [10*muB, 10*(1-muB)]]).dimshuffle(2,0,1) #compute the posterior beliefs over states, durations, and reward probabilities (post, _) = scan(edhmm_model, sequences=[inp], outputs_info=[init, joint0], non_sequences=[pd0, P, range(n_subs)], name='edhmm') #get posterior reward probability and state probability a0 = init[None, :, :, 0] b0 = init[None, :, :, 1] a = tt.concatenate([a0, post[0][:-1, :, :, 0]]) b = tt.concatenate([b0, post[0][:-1, :, :, 1]]) mu = Deterministic('mu', a / (a + b)) theta = Deterministic('theta', tt.concatenate([theta0[None, :], \ post[1][:-1].sum(axis=-1)[:,:,0]])[:,:,None]) #compute choice-dependent expected reward probability mean = (theta * mu + (1 - theta) * mu.dot(P)) #compute expected utility U = Deterministic('U', 2 * mean - 1) #set hierarchical prior for response biases ctau = HalfCauchy('ctau', beta=1) cloc = HalfCauchy('cloc', beta=ctau, shape=(n_subs, )) c0 = Deterministic('c0', cloc / (1 + cloc)) #compute response noise and response bias modulated expected free energy G = Deterministic( 'G', beta[None, :, None] * U + log([c0, 1 - c0]).T[None, :, :]) #compute response probability for the pre-reversal and the reversal phase of the experiment nzero = tt.nonzero(~nans[:last]) p = Deterministic('p', tt.nnet.softmax(G[:last][nzero])) #set observation likelihood of responses responses = inp[:last, :, 0][~nans[:last]] Categorical('obs', p=p, observed=responses)
#fit the model with edhmm: approx = fit(method='advi', n=50000, progressbar=True) return approx
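# A minimal pymc3 sketch of the hierarchical prior construction used in
# both fitters here: a group-level HalfCauchy scale, per-subject positive
# locations, and the x / (1 + x) squashing onto the unit interval.
from pymc3 import Model, HalfCauchy, Deterministic

n_subs = 4
with Model() as prior_sketch:
    tau = HalfCauchy('tau', beta=1)
    loc = HalfCauchy('loc', beta=tau, shape=(n_subs,))
    unit = Deterministic('unit', loc / (1 + loc))  # maps (0, inf) to (0, 1)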
def down_q(input, train, w): #if name == '1': # print input.tag.test_value # prior h = down_nl1(input, w) #h = T.printing.Print('h1'+name)(h) h = down_conv1(h, w) #h = T.printing.Print('h2'+name)(h) logqs = 0 # posterior if posterior in ['up_diag','up_iaf1','up_iaf2','up_iaf1_nl','up_iaf2_nl']: z = qz[0].sample logqs = qz[0].logps elif posterior == 'down_diag': rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:] rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:] _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd) z = _qz.sample logqs = _qz.logps elif posterior == 'down_tim': assert prior == 'diag' pz_mean = h[:,n_h2:n_h2+n_z,:,:] pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:] qz_prec = 1./T.exp(qz[0].logvar) pz_prec = 1./T.exp(2*pz_logsd) rz_prec = qz_prec + pz_prec rz_mean = (pz_prec/rz_prec) * pz_mean + (qz_prec/rz_prec) * qz[0].mean _qz = N.rand.gaussian_diag(rz_mean, -T.log(rz_prec)) z = _qz.sample logqs = _qz.logps elif posterior == 'down_iaf1': rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:] rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:] _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd) z = _qz.sample logqs = _qz.logps # ARW transform arw_mean = posterior_conv1(z, w) arw_mean *= .1 z = (z - arw_mean) elif posterior == 'down_iaf2': rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:] rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:] _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd) z = _qz.sample logqs = _qz.logps # ARW transform arw_mean_logsd = posterior_conv1(z, w) arw_mean = arw_mean_logsd[:,::2,:,:] arw_logsd = arw_mean_logsd[:,1::2,:,:] arw_mean *= .1 arw_logsd *= .1 z = (z - arw_mean) / T.exp(arw_logsd) logqs += arw_logsd elif posterior in ['down_iaf1_nl','down_iaf1_deep']: rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:] rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:] _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd) z = _qz.sample logqs = _qz.logps # ARW transform down_context = h[:,n_conv_down_prior+2*n_z:n_conv_down_prior+2*n_z+n_h2,:,:] context = up_context[0] + down_context arw_mean = posterior_conv1(z, context, w) arw_mean *= .1 z = (z - arw_mean) elif posterior in ['down_iaf2_nl','down_iaf2_nl2','down_iaf2_deep']: rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:] rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:] _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd) z = _qz.sample # logqs = _qz.logps # specifically we block the gradient here logqs = _qz.logps_pd # ARW transform down_context = h[:,n_conv_down_prior+2*n_z:n_conv_down_prior+2*n_z+n_h2,:,:] context = up_context[0] + down_context arw_mean, arw_logsd = posterior_conv1(z, context, w) arw_mean *= .1 arw_logsd *= .1 z = (z - arw_mean) / T.exp(arw_logsd) logqs += arw_logsd if posterior == 'down_iaf2_nl2': arw_mean, arw_logsd = posterior_conv2(z, context, w) arw_mean *= .1 arw_logsd *= .1 z = (z - arw_mean) / T.exp(arw_logsd) logqs += arw_logsd # Prior if prior == 'diag': pz_mean = h[:,n_h2:n_h2+n_z,:,:] pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:] logps = N.rand.gaussian_diag(pz_mean, 2*pz_logsd, z).logps elif prior == 'diag2': logps = N.rand.gaussian_diag(0*z, 0*z, z).logps pz_mean = h[:,n_h2:n_h2+n_z,:,:] pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:] z = pz_mean + z * T.exp(pz_logsd) elif prior == 'made': made_context = h[:,n_h2:2*n_h2,:,:] made_mean, made_logsd = prior_conv1(z, 
made_context, w) made_mean *= .1 made_logsd *= .1 logps = N.rand.gaussian_diag(made_mean, 2*made_logsd, z).logps elif prior == 'bernoulli': assert posterior == 'down_bernoulli' pz_p = bernoulli_p(h[:,n_h2:n_h2+n_z,:,:]) logps = z01 * T.log(pz_p) + (1.-z01) * T.log(1.-pz_p) else: raise Exception() h_det = h[:,:n_h2,:,:] h = T.concatenate([h_det, z], axis=1) if downsample: if downsample_type == 'nn': input = N.conv.upsample2d_nearest_neighbour(input) elif downsample_type == 'conv': input = down_conv3(input, w) output = input + .1 * down_conv2(down_nl2(h, w), w) return output, logqs - logps
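# A hedged sketch of the IAF-style update applied in the posterior branches
# above: the sample is shifted and rescaled by context-conditioned
# statistics, and the log-density picks up the log-determinant term.
import theano.tensor as T

def iaf_step(z, logqs, arw_mean, arw_logsd):
    z = (z - arw_mean) / T.exp(arw_logsd)
    return z, logqs + arw_logsd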
def apply_detector(W, jet, n_jets): map_i = [] for start, end in zip(range(0, 16, n_jets), range(4, 16+4, n_jets)): map_i.append(rectify(T.dot(W, jet[start:end]))) return T.concatenate(map_i, axis=0)
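# A small sketch of the window layout above (n_jets = 4 assumed): the zip
# yields the index windows (0, 4), (4, 8), (8, 12), (12, 16), so each
# length-4 jet slice is mapped through W and rectified before concatenation.
windows = zip(range(0, 16, 4), range(4, 16 + 4, 4))
# -> [(0, 4), (4, 8), (8, 12), (12, 16)]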
def build_and_train_nnet(self, X, Y): y_onehot = self.class_to_onehot(Y) n_in = X.shape[1] n_nodes = self.l1_size n_out = y_onehot.shape[1] x = T.dmatrix() y = T.imatrix() #bias1, bias2, weights1, weights2 template = [(n_nodes, ), (n_out, ), (n_in, n_nodes), (n_nodes, n_out)] #initialize nnet model = nnet(input=x, n_in=n_in, n_nodes=n_nodes, n_out=n_out) cost = model.neg_log_likelihood(y) g_b1 = T.grad(cost, model.b1) g_b2 = T.grad(cost, model.b2) g_w1 = T.grad(cost, model.w1) g_w2 = T.grad(cost, model.w2) g_params = T.concatenate( [g_b1.flatten(), g_b2.flatten(), g_w1.flatten(), g_w2.flatten()]) getcost = theano.function([x, y], outputs=cost) getdcost = theano.function([x, y], outputs=g_params) def cost_fcn(params, inputs, targets): model.set_params(params, template) x = inputs y = targets return getcost(x, y) def cost_grad(params, inputs, targets): model.set_params(params, template) x = inputs y = targets return getdcost(x, y) args = climin.util.iter_minibatches([X, y_onehot], self.batch_size, [0, 0]) batch_args = itertools.repeat(([X, y_onehot], {})) args = ((i, {}) for i in args) init_params = model.get_params(template) opt_sgd = climin.GradientDescent(init_params, cost_fcn, cost_grad, steprate=0.01, momentum=0.99, args=args, momentum_type="nesterov") opt_ncg = climin.NonlinearConjugateGradient(init_params, cost_fcn, cost_grad, args=batch_args) opt_lbfgs = climin.Lbfgs(init_params, cost_fcn, cost_grad, args=batch_args) #choose the optimizer if self.optimizer == 'sgd': optimizer = opt_sgd elif self.optimizer == 'ncg': optimizer = opt_ncg else: optimizer = opt_lbfgs #do the actual training. costs = [] for itr_info in optimizer: if itr_info['n_iter'] > self.max_iters: break costs.append(itr_info['loss']) model.set_params(init_params, template) return model, costs
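# A minimal sketch of the climin argument plumbing both trainers share:
# iter_minibatches yields endless (inputs, targets) slices for SGD, while
# the full-batch optimizers receive the whole dataset repeated.
import itertools
import numpy as np
import climin.util

X = np.random.randn(100, 5)
y_onehot = np.eye(3)[np.random.randint(0, 3, 100)]
args = ((batch, {}) for batch in
        climin.util.iter_minibatches([X, y_onehot], 10, [0, 0]))
batch_args = itertools.repeat(([X, y_onehot], {}))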
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, crowd_dim, n_crowds, training=True, crowd_reg=1.0, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) #crowd embed dim = number of tags #if crowd_dim: # crowd_dim = n_tags # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') crowd_ids = T.ivector(name='crowd_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score if crowd_dim: crowd_layer = EmbeddingLayer(n_crowds, crowd_dim, name='crowd_layer') crowd_scores = T.switch(T.neq(is_train, 0), crowd_layer.link(crowd_ids), 0 * crowd_layer.link(crowd_ids)) #final_output = T.switch(T.neq(is_train, 0), T.concatenate([final_output, crowd_scores], axis = 1), final_output) final_output = T.concatenate([final_output, crowd_scores], axis=1) #final_layer_test = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', # activation=(None if crf else 'softmax')) final_layer = HiddenLayer(word_lstm_dim + crowd_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) #tags_scores = T.switch(T.neq(is_train, 0), final_layer_train.link(final_output), final_layer_test.link(final_output)) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], 
padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) if crowd_dim: self.add_component(crowd_layer) params.extend(crowd_layer.params) cost = cost + (crowd_reg * crowd_layer.params[0] * crowd_layer.params[0]).mean() # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] if crowd_dim: eval_inputs.append(crowd_ids) train_inputs.append(crowd_ids) # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} #return immediate function #return theano.function(train_inputs, tags_scores, on_unused_input='ignore', \ # givens=({is_train: np.cast['int32'](1)} if dropout else {})) # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
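# A hedged numpy sketch of the CRF padding above: tag scores get two extra
# columns for the begin/end states plus begin and end rows, before forward
# scores all paths against the (n_tags + 2) x (n_tags + 2) transitions.
import numpy as np

def pad_observations(tags_scores, small=-1000.):
    s_len, n_tags = tags_scores.shape
    b_s = np.array([[small] * n_tags + [0, small]], dtype=np.float32)
    e_s = np.array([[small] * n_tags + [small, 0]], dtype=np.float32)
    obs = np.concatenate(
        [tags_scores, small * np.ones((s_len, 2), dtype=np.float32)], axis=1)
    return np.concatenate([b_s, obs, e_s], axis=0)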
def durw_fit(inp, nans, n_subs, last): # inp - array containing responses, outcomes, and a switch variable which turns off update in the presence of nans # nans - bool array pointing towards locations of nan responses and outcomes # n_subs - int value, total number of subjects (each subject is fitted to a different parameter value) # last - int value, negative value denoting number of last trials to exclude from parameter estimation # e.g. setting last = -35 excludes the last 35 trials from parameter estimation. #define the hierarchical parametric model for DU-RW with Model() as durw: #set hierarchical priors for learning rates atau = HalfCauchy('atau', beta=1) aloc = HalfCauchy('aloc', beta=atau, shape=(n_subs, )) alpha = Deterministic('alpha', aloc / (1 + aloc)) #set hierarchical priors for coupling strengths ktau = HalfCauchy('ktau', beta=1) kloc = HalfCauchy('kloc', beta=ktau, shape=(n_subs, )) kappa = Deterministic('kappa', kloc / (1 + kloc)) #set hierarchical priors for response noises btau = HalfCauchy('btau', beta=1) bloc = HalfCauchy('bloc', beta=btau, shape=(n_subs, )) beta = Deterministic('beta', 1 / bloc) #set hierarchical priors for initial choice values mtau = HalfCauchy('mtau', beta=1) mlocA = HalfCauchy('mlocA', beta=mtau, shape=(n_subs, )) mlocB = HalfCauchy('mlocB', beta=mtau, shape=(n_subs, )) muA = Deterministic('muA', mlocA / (1 + mlocA)) muB = Deterministic('muB', 1 / (1 + mlocB)) V0 = tt.stacklists([2 * muA - 1, 2 * muB - 1]).T #compute the choice values (Q, _) = scan(durw_model, sequences=[inp], outputs_info=V0, non_sequences=[alpha, kappa, range(n_subs)], name='rw') V0 = Deterministic('V0', V0[None, :, :]) V = Deterministic('V', tt.concatenate([V0, Q[:-1]])) #set hierarchical prior for response biases ctau = HalfCauchy('ctau', beta=1) cloc = HalfCauchy('cloc', beta=ctau, shape=(n_subs, )) c0 = Deterministic('c0', cloc / (1 + cloc)) #compute response noise and response bias modulated response values G = Deterministic( 'G', beta[None, :, None] * V + log([c0, 1 - c0]).T[None, :, :]) #compute response probability for the pre-reversal and the reversal phase of the experiment nzero = tt.nonzero(~nans[:last]) p = Deterministic('p', tt.nnet.softmax(G[:last][nzero])) #set observation likelihood of responses Categorical('obs', p=p, observed=inp[:last, :, 0][~nans[:last]]) #fit the model with durw: approx = fit(method='advi', n=50000, progressbar=True) return approx
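# A minimal sketch of the softmax response model shared with edhmm_fit:
# choice values are scaled by a per-subject inverse temperature beta and
# shifted by a log response bias before the softmax over the two options.
import theano.tensor as tt

def response_logits(V, beta, c0):
    # V: (time, subjects, 2), beta and c0: (subjects,)
    bias = tt.log(tt.stack([c0, 1 - c0]).T)[None, :, :]
    return beta[None, :, None] * V + bias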
def __init__(self, rng, input_source, input_target, label_source, batch_size,
             struct, coef, train=False, init_params=None):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input_source: theano.tensor.TensorType
    :param input_source: symbolic variable that describes the "Source Domain"
        input of the architecture (one minibatch)

    :type input_target: theano.tensor.TensorType
    :param input_target: symbolic variable that describes the "Target Domain"
        input of the architecture (one minibatch)

    :type struct: class NN_struct
    :param struct: defines the structure of each NN
    """

    if train:
        batch_size[0] = batch_size[0] * coef.L
        batch_size[1] = batch_size[1] * coef.L
        tmp_S = input_source
        tmp_T = input_target
        tmp_l = label_source
        for i in range(coef.L - 1):
            tmp_S = T.concatenate([tmp_S, input_source], axis=0)
            tmp_T = T.concatenate([tmp_T, input_target], axis=0)
            tmp_l = T.concatenate([tmp_l, label_source], axis=0)
        input_source = tmp_S
        input_target = tmp_T
        label_source = tmp_l
        L = coef.L
    else:
        L = 1
    self.L = L

    self.struct = struct
    encoder1_struct = struct.encoder1
    encoder2_struct = struct.encoder2
    encoder3_struct = struct.encoder3
    encoder4_struct = struct.encoder4
    encoder5_struct = struct.encoder5
    decoder1_struct = struct.decoder1
    decoder2_struct = struct.decoder2
    decoder3_struct = struct.decoder3
    DoC_struct = struct.DomainClassifier

    alpha = coef.alpha
    beta = coef.beta
    optimize = coef.optimize

    if init_params is None:
        init_params = VLDF_ANN_params()

    #------------------------------------------------------------------------
    # Encoder 1 Neural Network: presents q_\phi({z_y}_n | x_n, d_n)
    zero_v_S = T.zeros([batch_size[0], 1], dtype=theano.config.floatX)
    zero_v_T = T.zeros([batch_size[1], 1], dtype=theano.config.floatX)
    one_v_S = T.ones([batch_size[0], 1], dtype=theano.config.floatX)
    one_v_T = T.ones([batch_size[1], 1], dtype=theano.config.floatX)

    d_source = T.concatenate([zero_v_S, one_v_S], axis=1)
    xd_source = T.concatenate([input_source, d_source], axis=1)
    d_target = T.concatenate([one_v_T, zero_v_T], axis=1)
    xd_target = T.concatenate([input_target, d_target], axis=1)

    self.Encoder1 = nn.Gaussian_MLP(
        rng=rng, input_source=xd_source, input_target=xd_target,
        struct=encoder1_struct, batch_size=batch_size,
        params=init_params.EC1_params, name='Encoder1')

    zy_dim = encoder1_struct.mu.layer_dim[-1]
    self.EC_zy_S_mu = self.Encoder1.S_mu
    self.EC_zy_S_log_sigma = self.Encoder1.S_log_sigma
    self.EC_zy_S_sigma = T.exp(self.EC_zy_S_log_sigma)
    self.EC_zy_T_mu = self.Encoder1.T_mu
    self.EC_zy_T_log_sigma = self.Encoder1.T_log_sigma
    self.EC_zy_T_sigma = T.exp(self.EC_zy_T_log_sigma)
    self.zy_S = self.Encoder1.S_output
    self.zy_T = self.Encoder1.T_output

    self.Encoder1_params = self.Encoder1.params
    self.Encoder1_learning_rate = self.Encoder1.learning_rate
    self.Encoder1_decay = self.Encoder1.decay

    #------------------------------------------------------------------------
    # Encoder 5 Neural Network: presents q_\phi(y_n | {z_y}_n)
    self.Encoder5_pi = nn.NN_Block(
        rng=rng, input_source=self.zy_S, input_target=self.zy_T,
        struct=encoder5_struct, params=init_params.EC5_params,
        name='Encoder5_pi')

    # Sample layer
    self.EC_5_CSL_target = nn.CatSampleLayer(
        pi=self.Encoder5_pi.output_target,
        n_in=encoder5_struct.layer_dim[-1],
        batch_size=batch_size[1])

    y_dim = encoder5_struct.layer_dim[-1]
    self.EC_y_S_pi = self.Encoder5_pi.output_source
    self.EC_y_T_pi = self.Encoder5_pi.output_target
    self.y_T = self.EC_5_CSL_target.output

    self.Encoder5_params = self.Encoder5_pi.params
    self.Encoder5_learning_rate = self.Encoder5_pi.learning_rate
    self.Encoder5_decay = self.Encoder5_pi.decay

    #------------------------------------------------------------------------
    # Encoder 3 Neural Network: presents q_\phi({a_y}_n | {z_y}_n, y_n)
    # Input append
    zyy_source = T.concatenate([self.zy_S, label_source], axis=1)
    zyy_target = T.concatenate([self.zy_T, self.y_T], axis=1)

    self.Encoder3 = nn.Gaussian_MLP(
        rng=rng, input_source=zyy_source, input_target=zyy_target,
        struct=encoder3_struct, batch_size=batch_size,
        params=init_params.EC3_params, name='Encoder3')

    ay_dim = encoder3_struct.mu.layer_dim[-1]
    self.EC_ay_S_mu = self.Encoder3.S_mu
    self.EC_ay_S_log_sigma = self.Encoder3.S_log_sigma
    self.EC_ay_S_sigma = T.exp(self.EC_ay_S_log_sigma)
    self.EC_ay_T_mu = self.Encoder3.T_mu
    self.EC_ay_T_log_sigma = self.Encoder3.T_log_sigma
    self.EC_ay_T_sigma = T.exp(self.EC_ay_T_log_sigma)
    self.ay_S = self.Encoder3.S_output
    self.ay_T = self.Encoder3.T_output

    self.Encoder3_params = self.Encoder3.params
    self.Encoder3_learning_rate = self.Encoder3.learning_rate
    self.Encoder3_decay = self.Encoder3.decay

    #------------------------------------------------------------------------
    # Encoder 2 Neural Network: presents q_\phi({z_d}_n | x_n, d_n)
    self.Encoder2 = nn.Gaussian_MLP(
        rng=rng, input_source=xd_source, input_target=xd_target,
        struct=encoder2_struct, batch_size=batch_size,
        params=init_params.EC2_params, name='Encoder2')

    zd_dim = encoder2_struct.mu.layer_dim[-1]
    self.EC_zd_S_mu = self.Encoder2.S_mu
    self.EC_zd_S_log_sigma = self.Encoder2.S_log_sigma
    self.EC_zd_S_sigma = T.exp(self.EC_zd_S_log_sigma)
    self.EC_zd_T_mu = self.Encoder2.T_mu
    self.EC_zd_T_log_sigma = self.Encoder2.T_log_sigma
    self.EC_zd_T_sigma = T.exp(self.EC_zd_T_log_sigma)
    self.zd_S = self.Encoder2.S_output
    self.zd_T = self.Encoder2.T_output

    self.Encoder2_params = self.Encoder2.params
    self.Encoder2_learning_rate = self.Encoder2.learning_rate
    self.Encoder2_decay = self.Encoder2.decay

    #------------------------------------------------------------------------
    # Encoder 4 Neural Network: presents q_\phi({a_d}_n | {z_d}_n, d_n)
    # Input append
    zdd_source = T.concatenate([self.zd_S, d_source], axis=1)
    zdd_target = T.concatenate([self.zd_T, d_target], axis=1)

    self.Encoder4 = nn.Gaussian_MLP(
        rng=rng, input_source=zdd_source, input_target=zdd_target,
        struct=encoder4_struct, batch_size=batch_size,
        params=init_params.EC4_params, name='Encoder4')

    ad_dim = encoder4_struct.mu.layer_dim[-1]
    self.EC_ad_S_mu = self.Encoder4.S_mu
    self.EC_ad_S_log_sigma = self.Encoder4.S_log_sigma
    self.EC_ad_S_sigma = T.exp(self.EC_ad_S_log_sigma)
    self.EC_ad_T_mu = self.Encoder4.T_mu
    self.EC_ad_T_log_sigma = self.Encoder4.T_log_sigma
    self.EC_ad_T_sigma = T.exp(self.EC_ad_T_log_sigma)
    self.ad_S = self.Encoder4.S_output
    self.ad_T = self.Encoder4.T_output

    self.Encoder4_params = self.Encoder4.params
    self.Encoder4_learning_rate = self.Encoder4.learning_rate
    self.Encoder4_decay = self.Encoder4.decay

    #------------------------------------------------------------------------
    # Decoder 1 Neural Network: presents p_\theta(x_n | {z_y}_n, {z_d}_n)
    zyzd_source = T.concatenate([self.zy_S, self.zd_S], axis=1)
    zyzd_target = T.concatenate([self.zy_T, self.zd_T], axis=1)

    self.Decoder1 = nn.Gaussian_MLP(
        rng=rng, input_source=zyzd_source, input_target=zyzd_target,
        struct=decoder1_struct, batch_size=batch_size,
        params=init_params.DC1_params, name='Decoder1')

    x_dim = decoder1_struct.mu.layer_dim[-1]
    self.DC_x_S_mu = self.Decoder1.S_mu
    self.DC_x_S_log_sigma = self.Decoder1.S_log_sigma
    self.DC_x_S_sigma = T.exp(self.DC_x_S_log_sigma)
    self.DC_x_T_mu = self.Decoder1.T_mu
    self.DC_x_T_log_sigma = self.Decoder1.T_log_sigma
    self.DC_x_T_sigma = T.exp(self.DC_x_T_log_sigma)

    self.Decoder1_params = self.Decoder1.params
    self.Decoder1_learning_rate = self.Decoder1.learning_rate
    self.Decoder1_decay = self.Decoder1.decay

    #------------------------------------------------------------------------
    # Decoder 2 Neural Network: presents p_\theta({z_y}_n | {a_y}_n, y_n)
    ayy_source = T.concatenate([self.ay_S, label_source], axis=1)
    ayy_target = T.concatenate([self.ay_T, self.y_T], axis=1)

    self.Decoder2 = nn.Gaussian_MLP(
        rng=rng, input_source=ayy_source, input_target=ayy_target,
        struct=decoder2_struct, batch_size=batch_size,
        params=init_params.DC2_params, name='Decoder2')

    self.DC_zy_S_mu = self.Decoder2.S_mu
    self.DC_zy_S_log_sigma = self.Decoder2.S_log_sigma
    self.DC_zy_S_sigma = T.exp(self.DC_zy_S_log_sigma)
    self.DC_zy_T_mu = self.Decoder2.T_mu
    self.DC_zy_T_log_sigma = self.Decoder2.T_log_sigma
    self.DC_zy_T_sigma = T.exp(self.DC_zy_T_log_sigma)

    self.Decoder2_params = self.Decoder2.params
    self.Decoder2_learning_rate = self.Decoder2.learning_rate
    self.Decoder2_decay = self.Decoder2.decay

    #------------------------------------------------------------------------
    # Decoder 3 Neural Network: presents p_\theta({z_d}_n | {a_d}_n, d_n)
    add_source = T.concatenate([self.ad_S, d_source], axis=1)
    add_target = T.concatenate([self.ad_T, d_target], axis=1)

    self.Decoder3 = nn.Gaussian_MLP(
        rng=rng, input_source=add_source, input_target=add_target,
        struct=decoder3_struct, batch_size=batch_size,
        params=init_params.DC3_params, name='Decoder3')

    self.DC_zd_S_mu = self.Decoder3.S_mu
    self.DC_zd_S_log_sigma = self.Decoder3.S_log_sigma
    self.DC_zd_S_sigma = T.exp(self.DC_zd_S_log_sigma)
    self.DC_zd_T_mu = self.Decoder3.T_mu
    self.DC_zd_T_log_sigma = self.Decoder3.T_log_sigma
    self.DC_zd_T_sigma = T.exp(self.DC_zd_T_log_sigma)

    self.Decoder3_params = self.Decoder3.params
    self.Decoder3_learning_rate = self.Decoder3.learning_rate
    self.Decoder3_decay = self.Decoder3.decay

    #------------------------------------------------------------------------
    # Domain Classifier Neural Network: presents p_\varphi(d=0 | z_y)
    self.DomainClassifier = nn.NN_Block(
        rng=rng, input_source=self.zy_S, input_target=self.zy_T,
        struct=DoC_struct, params=init_params.DoC_params,
        name='DomainClassifier')

    self.DoC_output_S = self.DomainClassifier.output_source
    self.DoC_output_T = self.DomainClassifier.output_target

    self.DoC_params = self.DomainClassifier.params
    self.DoC_learning_rate = self.DomainClassifier.learning_rate
    self.DoC_decay = self.DomainClassifier.decay

    #------------------------------------------------------------------------
    # Error function set
    # KL(q(zy) || p(zy)) -----------
    self.KL_zy_source = er.KLGaussianGaussian(
        self.EC_zy_S_mu, self.EC_zy_S_log_sigma,
        self.DC_zy_S_mu, self.DC_zy_S_log_sigma).sum()
    self.KL_zy_target = er.KLGaussianGaussian(
        self.EC_zy_T_mu, self.EC_zy_T_log_sigma,
        self.DC_zy_T_mu, self.DC_zy_T_log_sigma).sum()

    # KL(q(zd) || p(zd)) -----------
    self.KL_zd_source = er.KLGaussianGaussian(
        self.EC_zd_S_mu, self.EC_zd_S_log_sigma,
        self.DC_zd_S_mu, self.DC_zd_S_log_sigma).sum()
    self.KL_zd_target = er.KLGaussianGaussian(
        self.EC_zd_T_mu, self.EC_zd_T_log_sigma,
        self.DC_zd_T_mu, self.DC_zd_T_log_sigma).sum()

    # KL(q(ay) || p(ay)) -----------
    self.KL_ay_source = er.KLGaussianStdGaussian(
        self.EC_ay_S_mu, self.EC_ay_S_log_sigma).sum()
    self.KL_ay_target = er.KLGaussianStdGaussian(
        self.EC_ay_T_mu, self.EC_ay_T_log_sigma).sum()

    # KL(q(ad) || p(ad)) -----------
    self.KL_ad_source = er.KLGaussianStdGaussian(
        self.EC_ad_S_mu, self.EC_ad_S_log_sigma).sum()
    self.KL_ad_target = er.KLGaussianStdGaussian(
        self.EC_ad_T_mu, self.EC_ad_T_log_sigma).sum()

    # KL(q(y) || p(y)), only target data -----------
    # the prior of y is set to 1/K, where K is the number of categories
    threshold = 0.0000001
    pi_0 = T.ones([batch_size[1], y_dim], dtype=theano.config.floatX) / y_dim
    self.KL_y_target = T.sum(
        -self.EC_y_T_pi * T.log(T.maximum(self.EC_y_T_pi / pi_0, threshold)),
        axis=1).sum()

    # Likelihood q(y), only source data -----------
    self.LH_y_source = -T.sum(
        -label_source * T.log(T.maximum(self.EC_y_S_pi, threshold)),
        axis=1).sum()
    #self.LH_y_source = T.nnet.nnet.categorical_crossentropy(self.EC_y_S_pi, label_source)

    # Likelihood p(x) (if gaussian) -----------
    self.LH_x_source = er.LogGaussianPDF(
        input_source, self.DC_x_S_mu, self.DC_x_S_log_sigma).sum()
    self.LH_x_target = er.LogGaussianPDF(
        input_target, self.DC_x_T_mu, self.DC_x_T_log_sigma).sum()

    # Domain classification error (smaller is better)
    self.DoC_error_S = T.sum(
        -d_source * T.log(T.maximum(self.DoC_output_S, threshold)),
        axis=1).sum()
    self.DoC_error_T = T.sum(
        -d_target * T.log(T.maximum(self.DoC_output_T, threshold)),
        axis=1).sum()
    self.DoC_error = self.DoC_error_S + self.DoC_error_T

    # Cost function
    LM_tmp = self.KL_zy_source + self.KL_zy_target + self.KL_ay_source + self.KL_ay_target \
        + self.KL_zd_source + self.KL_zd_target + self.KL_ad_source + self.KL_ad_target \
        + self.LH_x_source + self.LH_x_target + self.KL_y_target + self.LH_y_source * alpha
    self.LM_cost = -LM_tmp / (batch_size[0] + batch_size[1])

    DoC_tmp = self.DoC_error
    self.DoC_cost = -DoC_tmp.mean() / (batch_size[0] + batch_size[1]) * beta

    self.cost = self.LM_cost + self.DoC_cost

    # the parameters of the model
    self.LM_params = self.Encoder1_params + self.Encoder2_params + self.Encoder3_params \
        + self.Encoder4_params + self.Encoder5_params \
        + self.Decoder1_params + self.Decoder2_params + self.Decoder3_params
    self.params = self.LM_params + self.DoC_params

    self.LM_learning_rate = self.Encoder1_learning_rate + self.Encoder2_learning_rate \
        + self.Encoder3_learning_rate + self.Encoder4_learning_rate + self.Encoder5_learning_rate \
        + self.Decoder1_learning_rate + self.Decoder2_learning_rate + self.Decoder3_learning_rate
    self.learning_rate = self.LM_learning_rate + self.DoC_learning_rate

    self.LM_decay = self.Encoder1_decay + self.Encoder2_decay + self.Encoder3_decay \
        + self.Encoder4_decay + self.Encoder5_decay \
        + self.Decoder1_decay + self.Decoder2_decay + self.Decoder3_decay
    self.decay = self.LM_decay + self.DoC_decay

    if optimize == 'Adam_update':
        # Adam update function
        self.LM_updates = nn.adam(
            loss=self.cost,
            all_params=self.LM_params,
            all_learning_rate=self.LM_learning_rate)
        self.DoC_updates = nn.adam(
            loss=-self.DoC_cost,
            all_params=self.DoC_params,
            all_learning_rate=self.DoC_learning_rate)
    elif optimize == 'SGD':
        # Standard update function
        LM_gparams = [T.grad(self.cost, param) for param in self.LM_params]
        self.LM_params_updates = [
            (LM_param, LM_param - learning_rate * LM_gparam)
            for LM_param, LM_gparam, learning_rate in zip(
                self.LM_params, LM_gparams, self.LM_learning_rate)
        ]
        self.LM_learning_rate_update = [
            (learning_rate, learning_rate * decay)
            for learning_rate, decay in zip(self.LM_learning_rate, self.LM_decay)
        ]
        self.LM_updates = self.LM_params_updates + self.LM_learning_rate_update

        DoC_gparams = [T.grad(-self.DoC_cost, param) for param in self.DoC_params]
        self.DoC_params_updates = [
            (DoC_param, DoC_param - learning_rate * DoC_gparam)
            for DoC_param, DoC_gparam, learning_rate in zip(
                self.DoC_params, DoC_gparams, self.DoC_learning_rate)
        ]
        self.DoC_learning_rate_update = [
            (learning_rate, learning_rate * decay)
            for learning_rate, decay in zip(self.DoC_learning_rate, self.DoC_decay)
        ]
        self.DoC_updates = self.DoC_params_updates + self.DoC_learning_rate_update

    # keep track of model input
    self.input_source = input_source
    self.input_target = input_target

    # Predict label
    self.y_pred_source = T.argmax(self.EC_y_S_pi, axis=1)
    self.y_pred_target = T.argmax(self.EC_y_T_pi, axis=1)
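# Hedged numpy sketch of the reparameterised sampling step that a
# `Gaussian_MLP` output such as `self.zy_S` presumably performs; the actual
# layer implementation lives in the `nn` module and is not shown here.
import numpy as np

def gaussian_sample(mu, log_sigma, rng=np.random):
    eps = rng.standard_normal(mu.shape)   # eps ~ N(0, I)
    return mu + np.exp(log_sigma) * eps   # z = mu + sigma * eps

assert gaussian_sample(np.zeros((4, 3)), np.zeros((4, 3))).shape == (4, 3)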
def build_model(tparams, options):
    x = T.matrix('x', dtype=config.floatX)
    d = T.matrix('d', dtype=config.floatX)
    y = T.matrix('y', dtype=config.floatX)
    mask = T.vector('mask', dtype=config.floatX)
    logEps = options['logEps']

    emb = T.maximum(T.dot(x, tparams['W_emb']) + tparams['b_emb'], 0)
    if options['demoSize'] > 0:
        emb = T.concatenate((emb, d), axis=1)
    visit = T.maximum(T.dot(emb, tparams['W_hidden']) + tparams['b_hidden'], 0)
    results = T.nnet.softmax(T.dot(visit, tparams['W_output']) + tparams['b_output'])

    # validity masks for visit pairs at offsets 1..5: a pair counts only when
    # every visit in between is present
    mask1 = (mask[:-1] * mask[1:])[:, None]
    mask2 = (mask[:-2] * mask[1:-1] * mask[2:])[:, None]
    mask3 = (mask[:-3] * mask[1:-2] * mask[2:-1] * mask[3:])[:, None]
    mask4 = (mask[:-4] * mask[1:-3] * mask[2:-2] * mask[3:-1] * mask[4:])[:, None]
    mask5 = (mask[:-5] * mask[1:-4] * mask[2:-3] * mask[3:-2] * mask[4:-1] * mask[5:])[:, None]

    if options['numYcodes'] > 0:
        t = y
    else:
        t = x

    forward_results = results[:-1] * mask1
    forward_cross_entropy = -(
        t[1:] * T.log(forward_results + logEps) +
        (1. - t[1:]) * T.log(1. - forward_results + logEps))
    forward_results2 = results[:-2] * mask2
    forward_cross_entropy2 = -(
        t[2:] * T.log(forward_results2 + logEps) +
        (1. - t[2:]) * T.log(1. - forward_results2 + logEps))
    forward_results3 = results[:-3] * mask3
    forward_cross_entropy3 = -(
        t[3:] * T.log(forward_results3 + logEps) +
        (1. - t[3:]) * T.log(1. - forward_results3 + logEps))
    forward_results4 = results[:-4] * mask4
    forward_cross_entropy4 = -(
        t[4:] * T.log(forward_results4 + logEps) +
        (1. - t[4:]) * T.log(1. - forward_results4 + logEps))
    forward_results5 = results[:-5] * mask5
    forward_cross_entropy5 = -(
        t[5:] * T.log(forward_results5 + logEps) +
        (1. - t[5:]) * T.log(1. - forward_results5 + logEps))

    backward_results = results[1:] * mask1
    backward_cross_entropy = -(
        t[:-1] * T.log(backward_results + logEps) +
        (1. - t[:-1]) * T.log(1. - backward_results + logEps))
    backward_results2 = results[2:] * mask2
    backward_cross_entropy2 = -(
        t[:-2] * T.log(backward_results2 + logEps) +
        (1. - t[:-2]) * T.log(1. - backward_results2 + logEps))
    backward_results3 = results[3:] * mask3
    backward_cross_entropy3 = -(
        t[:-3] * T.log(backward_results3 + logEps) +
        (1. - t[:-3]) * T.log(1. - backward_results3 + logEps))
    backward_results4 = results[4:] * mask4
    backward_cross_entropy4 = -(
        t[:-4] * T.log(backward_results4 + logEps) +
        (1. - t[:-4]) * T.log(1. - backward_results4 + logEps))
    backward_results5 = results[5:] * mask5
    backward_cross_entropy5 = -(
        t[:-5] * T.log(backward_results5 + logEps) +
        (1. - t[:-5]) * T.log(1. - backward_results5 + logEps))

    visit_cost1 = (forward_cross_entropy.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy.sum(axis=1).sum(axis=0)) / (mask1.sum() + logEps)
    visit_cost2 = (forward_cross_entropy2.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy2.sum(axis=1).sum(axis=0)) / (mask2.sum() + logEps)
    visit_cost3 = (forward_cross_entropy3.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy3.sum(axis=1).sum(axis=0)) / (mask3.sum() + logEps)
    visit_cost4 = (forward_cross_entropy4.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy4.sum(axis=1).sum(axis=0)) / (mask4.sum() + logEps)
    visit_cost5 = (forward_cross_entropy5.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy5.sum(axis=1).sum(axis=0)) / (mask5.sum() + logEps)

    windowSize = options['windowSize']
    visit_cost = visit_cost1
    if windowSize == 2:
        visit_cost = visit_cost1 + visit_cost2
    elif windowSize == 3:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3
    elif windowSize == 4:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3 + visit_cost4
    elif windowSize == 5:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3 + visit_cost4 + visit_cost5

    iVector = T.vector('iVector', dtype='int32')
    jVector = T.vector('jVector', dtype='int32')
    preVec = T.maximum(tparams['W_emb'], 0)
    norms = (T.exp(T.dot(preVec, preVec.T))).sum(axis=1)
    emb_cost = -T.log((T.exp((preVec[iVector] * preVec[jVector]).sum(axis=1)) / norms[iVector]) + logEps)

    total_cost = visit_cost + T.mean(emb_cost) + options['L2_reg'] * (tparams['W_emb'] ** 2).sum()

    if options['demoSize'] > 0 and options['numYcodes'] > 0:
        return x, d, y, mask, iVector, jVector, total_cost
    elif options['demoSize'] == 0 and options['numYcodes'] > 0:
        return x, y, mask, iVector, jVector, total_cost
    elif options['demoSize'] > 0 and options['numYcodes'] == 0:
        return x, d, mask, iVector, jVector, total_cost
    else:
        return x, mask, iVector, jVector, total_cost
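# Standalone numpy sketch of the window-k visit cost above: predict visit t+k
# from visit t (forward) and visit t-k from visit t (backward), masking out
# pairs that span a sequence boundary. `results` and `t` are (T, C) arrays,
# `mask` is (T,); the helper name is illustrative, not from the code base.
import numpy as np

def visit_cost_k(results, t, mask, k, log_eps=1e-6):
    mk = np.ones(len(mask) - k)
    for i in range(k + 1):                    # mask_k = product of k+1 shifted masks
        mk *= mask[i:len(mask) - k + i]
    mk = mk[:, None]
    ce = lambda p, tgt: -(tgt * np.log(p + log_eps) +
                          (1. - tgt) * np.log(1. - p + log_eps))
    fwd = ce(results[:-k] * mk, t[k:]).sum()
    bwd = ce(results[k:] * mk, t[:-k]).sum()
    return (fwd + bwd) / (mk.sum() + log_eps)

r = np.full((6, 3), 0.33)
tgt = np.eye(3)[[0, 1, 2, 0, 1, 2]]
m = np.array([1., 1., 1., 0., 1., 1.])
print(visit_cost_k(r, tgt, m, k=2))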
def __theano_build__(self):
    E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b, self.c, self.W_att, self.b_att

    x_a = T.ivector('x_a')
    x_b = T.ivector('x_b')
    y = T.lvector('y')

    def forward_direction_step(x_t, s_t_prev):
        # Word embedding layer
        x_e = E[:, x_t]
        # GRU layer 1 (the biases belong inside the gate nonlinearities)
        z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev) + b[0])
        r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev) + b[1])
        c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t) + b[2])
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
        # directly return the hidden state as intermediate output
        return [s_t]

    def backward_direction_step(x_t, s_t_prev):
        # Word embedding layer
        x_e = E[:, x_t]
        # GRU layer 2
        z_t = T.nnet.hard_sigmoid(U[3].dot(x_e) + W[3].dot(s_t_prev) + b[3])
        r_t = T.nnet.hard_sigmoid(U[4].dot(x_e) + W[4].dot(s_t_prev) + b[4])
        c_t = T.tanh(U[5].dot(x_e) + W[5].dot(s_t_prev * r_t) + b[5])
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
        # directly return the hidden state as intermediate output
        return [s_t]

    # sentence a states, forward direction
    a_s_f, updates = theano.scan(forward_direction_step,
                                 sequences=x_a,
                                 truncate_gradient=self.bptt_truncate,
                                 outputs_info=T.zeros(self.hidden_dim))
    # sentence a states, backward direction
    a_s_b, updates = theano.scan(backward_direction_step,
                                 sequences=x_a[::-1],
                                 truncate_gradient=self.bptt_truncate,
                                 outputs_info=T.zeros(self.hidden_dim))
    # sentence b states, forward direction
    b_s_f, updates = theano.scan(forward_direction_step,
                                 sequences=x_b,
                                 truncate_gradient=self.bptt_truncate,
                                 outputs_info=T.zeros(self.hidden_dim))
    # sentence b states, backward direction
    b_s_b, updates = theano.scan(backward_direction_step,
                                 sequences=x_b[::-1],
                                 truncate_gradient=self.bptt_truncate,
                                 outputs_info=T.zeros(self.hidden_dim))

    # combine the forward and backward states of each sentence
    a_s = T.concatenate([a_s_f, a_s_b[::-1]], axis=1)
    b_s = T.concatenate([b_s_f, b_s_b[::-1]], axis=1)

    # soft attention: score each hidden state with
    # a(w_i) = tanh(W_att.dot(w_i) + b_att), then softmax the scores over time
    def soft_attention(h_i):
        return T.tanh(W_att.dot(h_i) + b_att)

    def weight_attention(h_i, a_j):
        return h_i * a_j

    a_att, updates = theano.scan(soft_attention, sequences=a_s)
    b_att, updates = theano.scan(soft_attention, sequences=b_s)

    # softmax over time steps (e.g. a_att: (59, 1), b_att: (58, 1))
    a_att = T.exp(a_att)
    a_att = a_att.flatten()
    a_att = a_att / a_att.sum()
    b_att = T.exp(b_att)
    b_att = b_att.flatten()
    b_att = b_att / b_att.sum()

    a_s_att, updates = theano.scan(weight_attention, sequences=[a_s, a_att])
    b_s_att, updates = theano.scan(weight_attention, sequences=[b_s, b_att])

    # semantic similarity could be used instead, e.g.
    # s_sim = manhattan_distance(a_s[-1], b_s[-1]),
    # but for classification we sum the attention-weighted states into
    # sentence vectors
    sena = a_s_att.sum(axis=0)
    senb = b_s_att.sum(axis=0)
    combined_s = T.concatenate([sena, senb], axis=0)

    # softmax classification
    o = T.nnet.softmax(V.dot(combined_s) + c)[0]
    # in case o contains a 0, which would cause inf and nan
    eps = np.asarray([1.0e-10] * self.label_dim, dtype=theano.config.floatX)
    o = o + eps
    om = o.reshape((1, o.shape[0]))
    prediction = T.argmax(om, axis=1)
    o_error = T.nnet.categorical_crossentropy(om, y)

    # cost
    cost = T.sum(o_error)

    # updates
    updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost)

    # monitored parameters and gradients
    mV = V * T.ones_like(V)
    mc = c * T.ones_like(c)
    mU = U * T.ones_like(U)
    mW = W * T.ones_like(W)
    gV = T.grad(cost, V)
    gc = T.grad(cost, c)
    gU = T.grad(cost, U)
    gW = T.grad(cost, W)
    mgV = gV * T.ones_like(gV)
    mgc = gc * T.ones_like(gc)
    mgU = gU * T.ones_like(gU)
    mgW = gW * T.ones_like(gW)

    # Assign functions
    self.comsen = theano.function([x_a, x_b], [a_att, b_att])
    self.monitor = theano.function([x_a, x_b], [sena, senb, mV, mc, mU, mW])
    self.monitor_grad = theano.function([x_a, x_b, y], [mgV, mgc, mgU, mgW])
    self.predict = theano.function([x_a, x_b], om)
    self.predict_class = theano.function([x_a, x_b], prediction)
    self.ce_error = theano.function([x_a, x_b, y], cost)
    # self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    self.sgd_step = theano.function(
        [x_a, x_b, y], [],
        updates=updates
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    )
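# Numpy sketch of the soft-attention scheme built above: score each hidden
# state with tanh(w_att . h + b_att), softmax the scores over time steps
# (here with the usual max-subtraction for stability, which the Theano code
# omits), and sum the weighted states into one sentence vector.
import numpy as np

def attend(H, w_att, b_att):                  # H: (T, d) hidden states
    scores = np.tanh(H.dot(w_att) + b_att)    # one scalar score per time step
    a = np.exp(scores - scores.max())
    a /= a.sum()                              # attention weights sum to 1
    return (H * a[:, None]).sum(axis=0)       # weighted sentence vector

assert attend(np.random.randn(7, 16), np.random.randn(16), 0.0).shape == (16,)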
def fprop(self, *args):
    # concatenate all inputs along this layer's axis
    self.out = TT.concatenate(args, axis=self.axis)
    return self.out
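# Minimal usage sketch of the concatenation fprop above with two symbolic
# matrices (assuming the default float64 floatX); axis=1 joins along columns.
import numpy as np
import theano
import theano.tensor as TT

a, b = TT.matrix('a'), TT.matrix('b')
f = theano.function([a, b], TT.concatenate([a, b], axis=1))
assert f(np.ones((2, 3)), np.zeros((2, 2))).shape == (2, 5)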
def __theano_build__(self):
    E = self.E
    W = self.W
    U = self.U
    V = self.V
    b = self.b
    c = self.c

    x = T.lvector('x')
    y = T.lvector('y')

    def forward_prop_step(x_t, h_t_prev, c_t_prev):
        # Word embedding layer
        x_e = E[:, x_t]
        i_t = T.nnet.sigmoid(W[0].dot(x_e) + U[0].dot(h_t_prev) + b[0])
        f_t = T.nnet.sigmoid(W[1].dot(x_e) + U[1].dot(h_t_prev) + b[1])
        o_t = T.nnet.sigmoid(W[2].dot(x_e) + U[2].dot(h_t_prev) + b[2])
        u_t = T.tanh(W[3].dot(x_e) + U[3].dot(h_t_prev) + b[3])
        c_t = i_t * u_t + f_t * c_t_prev
        h_t = o_t * T.tanh(c_t)
        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        # o = T.nnet.softmax(V.dot(h_t) + c)[0]
        # o = T.nnet.softmax(V[0].dot(h_t) + c)
        return [h_t, c_t]

    [h_t, c_t], updates = theano.scan(
        fn=forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[dict(initial=T.zeros(self.hidden_dim)),
                      dict(initial=T.zeros(self.hidden_dim))])
    # h_t[t] is the hidden state at time step t; only the final one is used

    def forward_prop_step_b(x_t, h_t_prev_b, c_t_prev_b):
        # the backward pass
        # Word embedding layer
        x_e_b = E[:, x_t]
        i_t_b = T.nnet.sigmoid(W[4].dot(x_e_b) + U[4].dot(h_t_prev_b) + b[4])
        f_t_b = T.nnet.sigmoid(W[5].dot(x_e_b) + U[5].dot(h_t_prev_b) + b[5])
        o_t_b = T.nnet.sigmoid(W[6].dot(x_e_b) + U[6].dot(h_t_prev_b) + b[6])
        u_t_b = T.tanh(W[7].dot(x_e_b) + U[7].dot(h_t_prev_b) + b[7])
        c_t_b = i_t_b * u_t_b + f_t_b * c_t_prev_b
        h_t_b = o_t_b * T.tanh(c_t_b)
        return [h_t_b, c_t_b]

    [h_t_b, c_t_b], updates = theano.scan(
        fn=forward_prop_step_b,
        sequences=x[::-1],
        truncate_gradient=self.bptt_truncate,
        outputs_info=[dict(initial=T.zeros(self.hidden_dim)),
                      dict(initial=T.zeros(self.hidden_dim))])

    final_h = h_t[-1]
    final_h_b = h_t_b[-1]
    final_h_concat = T.concatenate([final_h, final_h_b], axis=0)

    final_o = T.nnet.softmax(V[0].dot(final_h_concat) + c)  # a matrix with one row
    prediction = T.argmax(final_o[0], axis=0)
    print('final_o', final_o.ndim)
    print('y ', y.ndim)
    final_o_error = T.sum(T.nnet.categorical_crossentropy(final_o, y))
    cost = final_o_error

    # gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # functions
    self.predict = theano.function([x], final_o)
    self.predict_class = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], cost)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    self.sgd_step = theano.function(
        [x, y, learning_rate], [],
        updates=[(self.U, self.U - learning_rate * dU),
                 (self.V, self.V - learning_rate * dV),
                 (self.W, self.W - learning_rate * dW),
                 (self.E, self.E - learning_rate * dE),
                 (self.b, self.b - learning_rate * db),
                 (self.c, self.c - learning_rate * dc)])
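# Plain-numpy rendering of one forward_prop_step above (sigmoid gates, tanh
# candidate, elementwise cell update); handy for unit-testing the recurrence
# outside Theano. Weight shapes follow the code: W[i]: (h, e), U[i]: (h, h).
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x_e, h_prev, c_prev, W, U, b):
    i = sigmoid(W[0].dot(x_e) + U[0].dot(h_prev) + b[0])   # input gate
    f = sigmoid(W[1].dot(x_e) + U[1].dot(h_prev) + b[1])   # forget gate
    o = sigmoid(W[2].dot(x_e) + U[2].dot(h_prev) + b[2])   # output gate
    u = np.tanh(W[3].dot(x_e) + U[3].dot(h_prev) + b[3])   # candidate
    c = i * u + f * c_prev
    h = o * np.tanh(c)
    return h, c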
def main(args): #theano.optimizer='fast_compile' #theano.config.exception_verbosity='high' trial = int(args['trial']) pkl_name = 'vrnn_gmm_%d' % trial channel_name = 'mse' data_path = args['data_path'] save_path = args['save_path']#+'/aggVSdisag_distrib/'+datetime.datetime.now().strftime("%y-%m-%d_%H-%M") period = int(args['period']) n_steps = int(args['n_steps']) stride_train = int(args['stride_train']) stride_test = n_steps typeLoad = int(args['typeLoad']) flgMSE = int(args['flgMSE']) monitoring_freq = int(args['monitoring_freq']) epoch = int(args['epoch']) batch_size = int(args['batch_size']) x_dim = int(args['x_dim']) y_dim = int(args['y_dim']) z_dim = int(args['z_dim']) rnn_dim = int(args['rnn_dim']) k = int(args['num_k']) #a mixture of K Gaussian functions lr = float(args['lr']) origLR = lr debug = int(args['debug']) kSchedSamp = int(args['kSchedSamp']) print "trial no. %d" % trial print "batch size %d" % batch_size print "learning rate %f" % lr print "saving pkl file '%s'" % pkl_name print "to the save path '%s'" % save_path q_z_dim = 350 p_z_dim = 400 p_x_dim = 450 x2s_dim = 400 y2s_dim = 200 z2s_dim = 350 target_dim = k# As different appliances are separeted in theta_mu1, theta_mu2, etc... each one is just created from k different Gaussians Xtrain, ytrain, Xval, yval, Xtest, ytest, reader = fetch_ukdale(data_path, windows, appliances,numApps=-1, period=period,n_steps= n_steps, stride_train = stride_train, stride_test = stride_test, flgAggSumScaled = 1, flgFilterZeros = 1, typeLoad = typeLoad) instancesPlot = {0:[10]} #instancesPlot = reader.build_dict_instances_plot(listDates, batch_size, Xval.shape[0]) train_data = UKdale(name='train', prep='normalize', cond=True,# False #path=data_path, inputX=Xtrain, labels=ytrain) X_mean = train_data.X_mean X_std = train_data.X_std valid_data = UKdale(name='valid', prep='normalize', cond=True,# False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xval, labels = yval) test_data = UKdale(name='valid', prep='normalize', cond=True,# False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xtest, labels = ytest) init_W = InitCell('rand') init_U = InitCell('ortho') init_b = InitCell('zeros') init_b_sig = InitCell('const', mean=0.6) x, mask, y , y_mask = train_data.theano_vars() scheduleSamplingMask = T.fvector('schedMask') x.name = 'x_original' if debug: x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=np.float32) temp = np.ones((15, batch_size), dtype=np.float32) temp[:, -2:] = 0. 
mask.tag.test_value = temp """x_1 = FullyConnectedLayer(name='x_1', parent=['x_t'], parent_dim=[x_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) y_1 = FullyConnectedLayer(name='y_1', parent=['y_t'], parent_dim=[y_dim], nout=y2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_1 = FullyConnectedLayer(name='z_1', parent=['z_t'], parent_dim=[z_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) rnn = LSTM(name='rnn', parent=['x_1', 'z_1', 'y_1'], parent_dim=[x2s_dim, z2s_dim, y2s_dim], nout=rnn_dim, unit='tanh', init_W=init_W, init_U=init_U, init_b=init_b) phi_1 = FullyConnectedLayer(name='phi_1', parent=['x_1', 's_tm1','y_1'], parent_dim=[x2s_dim, rnn_dim, y2s_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_mu = FullyConnectedLayer(name='phi_mu', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) phi_sig = FullyConnectedLayer(name='phi_sig', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) prior_1 = FullyConnectedLayer(name='prior_1', parent=['x_1','s_tm1'], parent_dim=[x2s_dim,rnn_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_mu = FullyConnectedLayer(name='prior_mu', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) prior_sig = FullyConnectedLayer(name='prior_sig', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_1 = FullyConnectedLayer(name='theta_1', parent=['z_1', 's_tm1'], parent_dim=[z2s_dim, rnn_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_mu1 = FullyConnectedLayer(name='theta_mu1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu2 = FullyConnectedLayer(name='theta_mu2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu3 = FullyConnectedLayer(name='theta_mu3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu4 = FullyConnectedLayer(name='theta_mu4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu5 = FullyConnectedLayer(name='theta_mu5', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_sig1 = FullyConnectedLayer(name='theta_sig1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig2 = FullyConnectedLayer(name='theta_sig2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig3 = FullyConnectedLayer(name='theta_sig3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig4 = FullyConnectedLayer(name='theta_sig4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig5 = FullyConnectedLayer(name='theta_sig5', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) coeff1 = FullyConnectedLayer(name='coeff1', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff2 = FullyConnectedLayer(name='coeff2', parent=['theta_1'], parent_dim=[p_x_dim], 
nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff3 = FullyConnectedLayer(name='coeff3', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff4 = FullyConnectedLayer(name='coeff4', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff5 = FullyConnectedLayer(name='coeff5', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b)""" fmodel = open('vrnn_gmm_disall_best.pkl', 'rb') mainloop = cPickle.load(fmodel) fmodel.close() #for node in mainloop.model.nodes: # print(node.name) #define layers rnn = mainloop.model.nodes[0] x_1 = mainloop.model.nodes[1] y_1 = mainloop.model.nodes[2] z_1 = mainloop.model.nodes[3] phi_1 = mainloop.model.nodes[4] phi_mu = mainloop.model.nodes[5] phi_sig = mainloop.model.nodes[6] prior_1 = mainloop.model.nodes[7] prior_mu = mainloop.model.nodes[8] prior_sig = mainloop.model.nodes[9] theta_1 = mainloop.model.nodes[10] theta_mu1 = mainloop.model.nodes[11] theta_sig1 = mainloop.model.nodes[12] coeff1 = mainloop.model.nodes[13] nodes = [rnn, x_1, y_1,z_1, #dissag_pred, phi_1, phi_mu, phi_sig, prior_1, prior_mu, prior_sig, theta_1, theta_mu1, theta_sig1, coeff1] params = mainloop.model.params dynamicOutput = [None, None, None, None, None, None, None, None] #dynamicOutput_val = [None, None, None, None, None, None,None, None, None] if (y_dim>1): theta_mu2 = mainloop.model.nodes[14] theta_sig2 = mainloop.model.nodes[15] coeff2 = mainloop.model.nodes[16] nodes = nodes + [theta_mu2, theta_sig2, coeff2] dynamicOutput = dynamicOutput+[None, None, None, None] #mu, sig, coef and pred if (y_dim>2): theta_mu3 = mainloop.model.nodes[17] theta_sig3 = mainloop.model.nodes[18] coeff3 = mainloop.model.nodes[19] nodes = nodes + [theta_mu3, theta_sig3, coeff3] dynamicOutput = dynamicOutput +[None, None, None, None] if (y_dim>3): theta_mu4 = mainloop.model.nodes[20] theta_sig4 = mainloop.model.nodes[21] coeff4 = mainloop.model.nodes[22] nodes = nodes + [theta_mu4, theta_sig4, coeff4] dynamicOutput = dynamicOutput + [None, None, None, None] if (y_dim>4): theta_mu5 = mainloop.model.nodes[23] theta_sig5 = mainloop.model.nodes[24] coeff5 = mainloop.model.nodes[25] nodes = nodes + [theta_mu5, theta_sig5, coeff5] dynamicOutput = dynamicOutput + [None, None, None, None] s_0 = rnn.get_init_state(batch_size) x_1_temp = x_1.fprop([x], params) y_1_temp = y_1.fprop([y], params) output_fn = [s_0] + dynamicOutput output_fn_val = [s_0] + dynamicOutput[2:] print(len(output_fn), len(output_fn_val)) def inner_fn_test(x_t, s_tm1): prior_1_t = prior_1.fprop([x_t,s_tm1], params) prior_mu_t = prior_mu.fprop([prior_1_t], params) prior_sig_t = prior_sig.fprop([prior_1_t], params) z_t = Gaussian_sample(prior_mu_t, prior_sig_t)#in the original code it is gaussian. 
GMM is for the generation z_1_t = z_1.fprop([z_t], params) theta_1_t = theta_1.fprop([z_1_t, s_tm1], params) theta_mu1_t = theta_mu1.fprop([theta_1_t], params) theta_sig1_t = theta_sig1.fprop([theta_1_t], params) coeff1_t = coeff1.fprop([theta_1_t], params) y_pred1 = GMM_sampleY(theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t) tupleMulti = prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1 if (y_dim>1): theta_mu2_t = theta_mu2.fprop([theta_1_t], params) theta_sig2_t = theta_sig2.fprop([theta_1_t], params) coeff2_t = coeff2.fprop([theta_1_t], params) y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t) y_pred1 = T.concatenate([y_pred1, y_pred2],axis=1) tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2) if (y_dim>2): theta_mu3_t = theta_mu3.fprop([theta_1_t], params) theta_sig3_t = theta_sig3.fprop([theta_1_t], params) coeff3_t = coeff3.fprop([theta_1_t], params) y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t) y_pred1 = T.concatenate([y_pred1, y_pred3],axis=1) tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3) if (y_dim>3): theta_mu4_t = theta_mu4.fprop([theta_1_t], params) theta_sig4_t = theta_sig4.fprop([theta_1_t], params) coeff4_t = coeff4.fprop([theta_1_t], params) y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t) y_pred1 = T.concatenate([y_pred1, y_pred4],axis=1) tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4) if (y_dim>4): theta_mu5_t = theta_mu5.fprop([theta_1_t], params) theta_sig5_t = theta_sig5.fprop([theta_1_t], params) coeff5_t = coeff5.fprop([theta_1_t], params) y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t) y_pred1 = T.concatenate([y_pred1, y_pred5],axis=1) tupleMulti = tupleMulti + (theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5) pred_1_t=y_1.fprop([y_pred1], params) #y_pred = [GMM_sampleY(theta_mu_t[i], theta_sig_t[i], coeff_t[i]) for i in range(y_dim)]#T.stack([y_pred1,y_pred2],axis = 0 ) s_t = rnn.fprop([[x_t, z_1_t, pred_1_t], [s_tm1]], params) #y_pred = dissag_pred.fprop([s_t], params) return (s_t,)+tupleMulti #corr_temp, binary_temp (restResults_val, updates_val) = theano.scan(fn=inner_fn_test, sequences=[x_1_temp], outputs_info=output_fn_val ) for k, v in updates_val.iteritems(): k.default_update = v """def inner_fn(x_t, y_t, scheduleSamplingMask, s_tm1): phi_1_t = phi_1.fprop([x_t, s_tm1, y_t], params) phi_mu_t = phi_mu.fprop([phi_1_t], params) phi_sig_t = phi_sig.fprop([phi_1_t], params) prior_1_t = prior_1.fprop([x_t,s_tm1], params) prior_mu_t = prior_mu.fprop([prior_1_t], params) prior_sig_t = prior_sig.fprop([prior_1_t], params) z_t = Gaussian_sample(phi_mu_t, phi_sig_t)#in the original code it is gaussian. 
GMM is for the generation z_1_t = z_1.fprop([z_t], params) theta_1_t = theta_1.fprop([z_1_t, s_tm1], params) theta_mu1_t = theta_mu1.fprop([theta_1_t], params) theta_sig1_t = theta_sig1.fprop([theta_1_t], params) coeff1_t = coeff1.fprop([theta_1_t], params) y_pred1 = GMM_sampleY(theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t) y_pred = y_pred1 tupleMulti = phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1 if (y_dim>1): theta_mu2_t = theta_mu2.fprop([theta_1_t], params) theta_sig2_t = theta_sig2.fprop([theta_1_t], params) coeff2_t = coeff2.fprop([theta_1_t], params) y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t) y_pred = T.concatenate([y_pred, y_pred2],axis=1) tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2) if (y_dim>2): theta_mu3_t = theta_mu3.fprop([theta_1_t], params) theta_sig3_t = theta_sig3.fprop([theta_1_t], params) coeff3_t = coeff3.fprop([theta_1_t], params) y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t) y_pred = T.concatenate([y_pred, y_pred3],axis=1) tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3) if (y_dim>3): theta_mu4_t = theta_mu4.fprop([theta_1_t], params) theta_sig4_t = theta_sig4.fprop([theta_1_t], params) coeff4_t = coeff4.fprop([theta_1_t], params) y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t) y_pred = T.concatenate([y_pred, y_pred4],axis=1) tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4) if (y_dim>4): theta_mu5_t = theta_mu5.fprop([theta_1_t], params) theta_sig5_t = theta_sig5.fprop([theta_1_t], params) coeff5_t = coeff5.fprop([theta_1_t], params) y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t) y_pred = T.concatenate([y_pred, y_pred5],axis=1) tupleMulti = tupleMulti + (theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5) if (scheduleSamplingMask==1): s_t = rnn.fprop([[x_t, z_1_t, y_t], [s_tm1]], params) else: y_t_aux = y_1.fprop([y_pred], params) s_t = rnn.fprop([[x_t, z_1_t, y_t_aux], [s_tm1]], params) return (s_t,)+tupleMulti #corr_temp, binary_temp (restResults, updates) = theano.scan(fn=inner_fn, sequences=[x_1_temp, y_1_temp, scheduleSamplingMask], outputs_info=output_fn ) ''' ((s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp,z_t_temp, z_1_temp, theta_1_temp, theta_mu1_temp, theta_sig1_temp, coeff1_temp, theta_mu2_temp, theta_sig2_temp, coeff2_temp, theta_mu3_temp, theta_sig3_temp, coeff3_temp, theta_mu4_temp, theta_sig4_temp, coeff4_temp, theta_mu5_temp, theta_sig5_temp, coeff5_temp, y_pred1_temp, y_pred2_temp, y_pred3_temp, y_pred4_temp, y_pred5_temp), updates) =\ theano.scan(fn=inner_fn, sequences=[x_1_temp, y_1_temp], outputs_info=[s_0, None, None, None, None, None, None, None, None,None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]) ''' s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp,\ theta_mu1_temp, theta_sig1_temp, coeff1_temp, y_pred1_temp = restResults[:9] restResults = restResults[9:] for k, v in updates.iteritems(): k.default_update = v #s_temp = concatenate([s_0[None, :, :], s_temp[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0 theta_mu1_temp.name = 'theta_mu1' theta_sig1_temp.name = 'theta_sig1' coeff1_temp.name = 'coeff1' y_pred1_temp.name = 'disaggregation1' #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1) mse1 = T.mean((y_pred1_temp - y[:,:,0].reshape((y.shape[0],y.shape[1],1)))**2) mae1 = T.mean( T.abs_(y_pred1_temp - 
y[:,:,0].reshape((y.shape[0],y.shape[1],1))) ) mse1.name = 'mse1' mae1.name = 'mae1' kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) x_shape = x.shape y_shape = y.shape x_in = x.reshape((x_shape[0]*x_shape[1], -1)) y_in = y.reshape((y_shape[0]*y_shape[1], -1)) theta_mu1_in = theta_mu1_temp.reshape((x_shape[0]*x_shape[1], -1)) theta_sig1_in = theta_sig1_temp.reshape((x_shape[0]*x_shape[1], -1)) coeff1_in = coeff1_temp.reshape((x_shape[0]*x_shape[1], -1)) ddoutMSEA = [] ddoutYpreds = [y_pred1_temp] indexSepDynamic = 6 # plus one for totaMSE totaMSE = T.copy(mse1) mse2 = T.zeros((1,)) mae2 = T.zeros((1,)) mse3 = T.zeros((1,)) mae3 = T.zeros((1,)) mse4 = T.zeros((1,)) mae4 = T.zeros((1,)) mse5 = T.zeros((1,)) mae5 = T.zeros((1,)) if (y_dim>1): theta_mu2_temp, theta_sig2_temp, coeff2_temp, y_pred2_temp = restResults[:4] restResults = restResults[4:] theta_mu2_temp.name = 'theta_mu2' theta_sig2_temp.name = 'theta_sig2' coeff2_temp.name = 'coeff2' y_pred2_temp.name = 'disaggregation2' mse2 = T.mean((y_pred2_temp - y[:,:,1].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae2 = T.mean( T.abs_(y_pred2_temp - y[:,:,1].reshape((y.shape[0],y.shape[1],1))) ) mse2.name = 'mse2' mae2.name = 'mae2' theta_mu2_in = theta_mu2_temp.reshape((x_shape[0]*x_shape[1], -1)) theta_sig2_in = theta_sig2_temp.reshape((x_shape[0]*x_shape[1], -1)) coeff2_in = coeff2_temp.reshape((x_shape[0]*x_shape[1], -1)) argsGMM = theta_mu2_in, theta_sig2_in, coeff2_in ddoutMSEA = ddoutMSEA + [mse2, mae2] ddoutYpreds = ddoutYpreds + [y_pred2_temp] #totaMSE+=mse2 indexSepDynamic +=2 if (y_dim>2): theta_mu3_temp, theta_sig3_temp, coeff3_temp, y_pred3_temp = restResults[:4] restResults = restResults[4:] theta_mu3_temp.name = 'theta_mu3' theta_sig3_temp.name = 'theta_sig3' coeff3_temp.name = 'coeff3' y_pred3_temp.name = 'disaggregation3' mse3 = T.mean((y_pred3_temp - y[:,:,2].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae3 = T.mean( T.abs_(y_pred3_temp - y[:,:,2].reshape((y.shape[0],y.shape[1],1))) ) mse3.name = 'mse3' mae3.name = 'mae3' theta_mu3_in = theta_mu3_temp.reshape((x_shape[0]*x_shape[1], -1)) theta_sig3_in = theta_sig3_temp.reshape((x_shape[0]*x_shape[1], -1)) coeff3_in = coeff3_temp.reshape((x_shape[0]*x_shape[1], -1)) argsGMM = argsGMM + (theta_mu3_in, theta_sig3_in, coeff3_in) ddoutMSEA = ddoutMSEA + [mse3, mae3] ddoutYpreds = ddoutYpreds + [y_pred3_temp] #totaMSE+=mse3 indexSepDynamic +=2 if (y_dim>3): theta_mu4_temp, theta_sig4_temp, coeff4_temp, y_pred4_temp = restResults[:4] restResults = restResults[4:] theta_mu4_temp.name = 'theta_mu4' theta_sig4_temp.name = 'theta_sig4' coeff4_temp.name = 'coeff4' y_pred4_temp.name = 'disaggregation4' mse4 = T.mean((y_pred4_temp - y[:,:,3].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae4 = T.mean( T.abs_(y_pred4_temp - y[:,:,3].reshape((y.shape[0],y.shape[1],1))) ) mse4.name = 'mse4' mae4.name = 'mae4' theta_mu4_in = theta_mu4_temp.reshape((x_shape[0]*x_shape[1], -1)) theta_sig4_in = theta_sig4_temp.reshape((x_shape[0]*x_shape[1], -1)) coeff4_in = coeff4_temp.reshape((x_shape[0]*x_shape[1], -1)) argsGMM = argsGMM + (theta_mu4_in, theta_sig4_in, coeff4_in) ddoutMSEA = ddoutMSEA + [mse4, mae4] ddoutYpreds = ddoutYpreds + [y_pred4_temp] #totaMSE+=mse4 indexSepDynamic +=2 if (y_dim>4): theta_mu5_temp, theta_sig5_temp, coeff5_temp, y_pred5_temp = restResults[:4] restResults = restResults[4:] theta_mu5_temp.name = 'theta_mu5' theta_sig5_temp.name = 
'theta_sig5' coeff5_temp.name = 'coeff5' y_pred5_temp.name = 'disaggregation5' mse5 = T.mean((y_pred5_temp - y[:,:,4].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae5 = T.mean( T.abs_(y_pred5_temp - y[:,:,4].reshape((y.shape[0],y.shape[1],1))) ) mse5.name = 'mse5' mae5.name = 'mae5' theta_mu5_in = theta_mu5_temp.reshape((x_shape[0]*x_shape[1], -1)) theta_sig5_in = theta_sig5_temp.reshape((x_shape[0]*x_shape[1], -1)) coeff5_in = coeff5_temp.reshape((x_shape[0]*x_shape[1], -1)) argsGMM = argsGMM + (theta_mu5_in, theta_sig5_in, coeff5_in) ddoutMSEA = ddoutMSEA + [mse5, mae5] ddoutYpreds = ddoutYpreds + [y_pred5_temp] #totaMSE+=mse5 indexSepDynamic +=2 totaMSE = (mse1+mse2+mse3+mse4+mse5)/y_dim totaMSE.name = 'mse' kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) """ x_shape = x.shape y_shape = y.shape x_in = x.reshape((x_shape[0]*x_shape[1], -1)) y_in = y.reshape((y_shape[0]*y_shape[1], -1)) """ recon = GMMdisagMulti(y_dim, y_in, theta_mu1_in, theta_sig1_in, coeff1_in, *argsGMM)# BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in) recon = recon.reshape((x_shape[0], x_shape[1])) recon.name = 'gmm_out' ''' recon5 = GMM(y_in[:,4, None], theta_mu5_in, theta_sig5_in, coeff5_in) recon5 = recon.reshape((x_shape[0], x_shape[1])) ''' recon_term = recon.sum(axis=0).mean() recon_term = recon.sum(axis=0).mean() recon_term.name = 'recon_term' #kl_temp = kl_temp * mask kl_term = kl_temp.sum(axis=0).mean() kl_term.name = 'kl_term' #nll_upper_bound_0 = recon_term + kl_term #nll_upper_bound_0.name = 'nll_upper_bound_0' if (flgMSE==1): nll_upper_bound = recon_term + kl_term + totaMSE else: nll_upper_bound = recon_term + kl_term nll_upper_bound.name = 'nll_upper_bound'""" ######################## TEST (GENERATION) TIME s_temp_val, prior_mu_temp_val, prior_sig_temp_val, \ theta_mu1_temp_val, theta_sig1_temp_val, coeff1_temp_val, y_pred1_temp_val = restResults_val[:7] restResults_val = restResults_val[7:] #s_temp_val = concatenate([s_0[None, :, :], s_temp_val[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0 theta_mu1_temp_val.name = 'theta_mu1_val' theta_sig1_temp_val.name = 'theta_sig1_val' coeff1_temp_val.name = 'coeff1_val' y_pred1_temp_val.name = 'disaggregation1_val' #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1) mse1_val = T.mean((y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae1_val = T.mean( T.abs_(y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1))) ) #NEURALNILM #(sum_output - sum_target) / max(sum_output, sum_target)) totPred = T.sum(y_pred1_temp_val) totReal = T.sum(y[:,:,0]) relErr1_val =( totPred - totReal)/ T.maximum(totPred,totReal) propAssigned1_val = 1 - T.sum(T.abs_(y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x)) #y_unNormalize = (y[:,:,0] * reader.stdTraining[0]) + reader.meanTraining[0] #y_pred1_temp_val = (y_pred1_temp_val * reader.stdTraining[0]) + reader.meanTraining[0] #mse1_valUnNorm = T.mean((y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all #mae1_valUnNorm = T.mean( T.abs_(y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1)))) mse1_val.name = 'mse1_val' mae1_val.name = 'mae1_val' theta_mu1_in_val = theta_mu1_temp_val.reshape((x_shape[0]*x_shape[1], -1)) theta_sig1_in_val = theta_sig1_temp_val.reshape((x_shape[0]*x_shape[1], -1)) coeff1_in_val = 
coeff1_temp_val.reshape((x_shape[0]*x_shape[1], -1)) ddoutMSEA_val = [] ddoutYpreds_val = [y_pred1_temp_val] totaMSE_val = mse1_val totaMAE_val =mae1_val indexSepDynamic_val = 5 prediction_val = y_pred1_temp_val #Initializing values of mse and mae mse2_val = T.zeros((1,)) mae2_val = T.zeros((1,)) mse3_val = T.zeros((1,)) mae3_val = T.zeros((1,)) mse4_val = T.zeros((1,)) mae4_val = T.zeros((1,)) mse5_val = T.zeros((1,)) mae5_val = T.zeros((1,)) relErr2_val = T.zeros((1,)) relErr3_val = T.zeros((1,)) relErr4_val = T.zeros((1,)) relErr5_val = T.zeros((1,)) propAssigned2_val = T.zeros((1,)) propAssigned3_val = T.zeros((1,)) propAssigned4_val = T.zeros((1,)) propAssigned5_val = T.zeros((1,)) if (y_dim>1): theta_mu2_temp_val, theta_sig2_temp_val, coeff2_temp_val, y_pred2_temp_val = restResults_val[:4] restResults_val = restResults_val[4:] theta_mu2_temp_val.name = 'theta_mu2_val' theta_sig2_temp_val.name = 'theta_sig2_val' coeff2_temp_val.name = 'coeff2_val' y_pred2_temp_val.name = 'disaggregation2_val' mse2_val = T.mean((y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae2_val = T.mean( T.abs_(y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1))) ) totPred = T.sum(y_pred2_temp_val) totReal = T.sum(y[:,:,1]) relErr2_val =( totPred - totReal)/ T.maximum(totPred,totReal) propAssigned2_val = 1 - T.sum(T.abs_(y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x)) mse2_val.name = 'mse2_val' mae2_val.name = 'mae2_val' theta_mu2_in_val = theta_mu2_temp_val.reshape((x_shape[0]*x_shape[1], -1)) theta_sig2_in_val = theta_sig2_temp_val.reshape((x_shape[0]*x_shape[1], -1)) coeff2_in_val = coeff2_temp_val.reshape((x_shape[0]*x_shape[1], -1)) argsGMM_val = theta_mu2_in_val, theta_sig2_in_val, coeff2_in_val ddoutMSEA_val = ddoutMSEA_val + [mse2_val, mae2_val] ddoutYpreds_val = ddoutYpreds_val + [y_pred2_temp_val] totaMSE_val+=mse2_val totaMAE_val+=mae2_val indexSepDynamic_val +=2 prediction_val = T.concatenate([prediction_val, y_pred2_temp_val], axis=2) if (y_dim>2): theta_mu3_temp_val, theta_sig3_temp_val, coeff3_temp_val, y_pred3_temp_val = restResults_val[:4] restResults_val = restResults_val[4:] theta_mu3_temp_val.name = 'theta_mu3_val' theta_sig3_temp_val.name = 'theta_sig3_val' coeff3_temp_val.name = 'coeff3_val' y_pred3_temp_val.name = 'disaggregation3_val' mse3_val = T.mean((y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all mae3_val = T.mean( T.abs_(y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1))) ) totPred = T.sum(y_pred3_temp_val) totReal = T.sum(y[:,:,2]) relErr3_val =( totPred - totReal)/ T.maximum(totPred,totReal) propAssigned3_val = 1 - T.sum(T.abs_(y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x)) mse3_val.name = 'mse3_val' mae3_val.name = 'mae3_val' theta_mu3_in_val = theta_mu3_temp_val.reshape((x_shape[0]*x_shape[1], -1)) theta_sig3_in_val = theta_sig3_temp_val.reshape((x_shape[0]*x_shape[1], -1)) coeff3_in_val = coeff3_temp_val.reshape((x_shape[0]*x_shape[1], -1)) argsGMM_val = argsGMM_val + (theta_mu3_in_val, theta_sig3_in_val, coeff3_in_val) ddoutMSEA_val = ddoutMSEA_val + [mse3_val, mae3_val] ddoutYpreds_val = ddoutYpreds_val + [y_pred3_temp_val] totaMSE_val+=mse3_val totaMAE_val+=mae3_val indexSepDynamic_val +=2 prediction_val = T.concatenate([prediction_val, y_pred3_temp_val], axis=2) if (y_dim>3): theta_mu4_temp_val, theta_sig4_temp_val, coeff4_temp_val, y_pred4_temp_val = restResults_val[:4] 
restResults_val = restResults_val[4:]
theta_mu4_temp_val.name = 'theta_mu4_val'
theta_sig4_temp_val.name = 'theta_sig4_val'
coeff4_temp_val.name = 'coeff4_val'
y_pred4_temp_val.name = 'disaggregation4_val'
# axis=None, so the mean runs over every element
mse4_val = T.mean((y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1)))**2)
mae4_val = T.mean(T.abs_(y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1))))
totPred = T.sum(y_pred4_temp_val)
totReal = T.sum(y[:, :, 3])
relErr4_val = (totPred - totReal) / T.maximum(totPred, totReal)
propAssigned4_val = 1 - T.sum(T.abs_(y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))
mse4_val.name = 'mse4_val'
mae4_val.name = 'mae4_val'

theta_mu4_in_val = theta_mu4_temp_val.reshape((x_shape[0] * x_shape[1], -1))
theta_sig4_in_val = theta_sig4_temp_val.reshape((x_shape[0] * x_shape[1], -1))
coeff4_in_val = coeff4_temp_val.reshape((x_shape[0] * x_shape[1], -1))

argsGMM_val = argsGMM_val + (theta_mu4_in_val, theta_sig4_in_val, coeff4_in_val)
ddoutMSEA_val = ddoutMSEA_val + [mse4_val, mae4_val]
ddoutYpreds_val = ddoutYpreds_val + [y_pred4_temp_val]
totaMSE_val += mse4_val
totaMAE_val += mae4_val
indexSepDynamic_val += 2
prediction_val = T.concatenate([prediction_val, y_pred4_temp_val], axis=2)

if (y_dim > 4):
    theta_mu5_temp_val, theta_sig5_temp_val, coeff5_temp_val, y_pred5_temp_val = restResults_val[:4]
    restResults_val = restResults_val[4:]
    theta_mu5_temp_val.name = 'theta_mu5_val'
    theta_sig5_temp_val.name = 'theta_sig5_val'
    coeff5_temp_val.name = 'coeff5_val'
    y_pred5_temp_val.name = 'disaggregation5_val'
    mse5_val = T.mean((y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1)))**2)
    mae5_val = T.mean(T.abs_(y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1))))
    totPred = T.sum(y_pred5_temp_val)
    totReal = T.sum(y[:, :, 4])
    relErr5_val = (totPred - totReal) / T.maximum(totPred, totReal)
    propAssigned5_val = 1 - T.sum(T.abs_(y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))
    mse5_val.name = 'mse5_val'
    mae5_val.name = 'mae5_val'

    theta_mu5_in_val = theta_mu5_temp_val.reshape((x_shape[0] * x_shape[1], -1))
    theta_sig5_in_val = theta_sig5_temp_val.reshape((x_shape[0] * x_shape[1], -1))
    coeff5_in_val = coeff5_temp_val.reshape((x_shape[0] * x_shape[1], -1))

    argsGMM_val = argsGMM_val + (theta_mu5_in_val, theta_sig5_in_val, coeff5_in_val)
    ddoutMSEA_val = ddoutMSEA_val + [mse5_val, mae5_val]
    ddoutYpreds_val = ddoutYpreds_val + [y_pred5_temp_val]
    totaMSE_val += mse5_val
    totaMAE_val += mae5_val
    indexSepDynamic_val += 2
    prediction_val = T.concatenate([prediction_val, y_pred5_temp_val], axis=2)

# joint GMM likelihood over all disaggregated appliances
recon_val = GMMdisagMulti(y_dim, y_in, theta_mu1_in_val, theta_sig1_in_val, coeff1_in_val, *argsGMM_val)  # BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in)
recon_val = recon_val.reshape((x_shape[0], x_shape[1]))
recon_val.name = 'gmm_out'
totaMSE_val = totaMSE_val / y_dim
totaMAE_val = totaMAE_val / y_dim
'''
recon5 = GMM(y_in[:, 4, None], theta_mu5_in, theta_sig5_in, coeff5_in)
recon5 = recon.reshape((x_shape[0], x_shape[1]))
'''
recon_term_val = recon_val.sum(axis=0).mean()
recon_term_val.name = 'recon_term'

######################
'''
model.inputs = [x, mask, y, y_mask, scheduleSamplingMask]
model.params = params
model.nodes = nodes
'''
optimizer = Adam(lr=lr)
header = "epoch,log,kl,nll_upper_bound,mse,mae\n"
extension = [
    GradientClipping(batch_size=batch_size),
    EpochCount(epoch, save_path, header),
    Monitoring(freq=monitoring_freq,
               # ddout=[nll_upper_bound, recon_term, kl_term, totaMSE, mse1, mae1] + ddoutMSEA + ddoutYpreds,
               # indexSep=indexSepDynamic,
               indexDDoutPlot=[13],  # indexes of ddout used for plotting, e.g. (6, y_pred_temp)
               instancesPlot=instancesPlot,  # 0-150
               data=[Iterator(valid_data, batch_size)],
               savedFolder=save_path),
    Picklize(freq=monitoring_freq, path=save_path),
    EarlyStopping(freq=monitoring_freq, path=save_path, channel=channel_name),
    WeightNorm()
]
lr_iterations = {0: lr}

# NOTE: training is disabled here, but the train-log dump at the bottom
# still expects a populated `mainloop`.
"""mainloop = Training(name=pkl_name,
                       data=Iterator(train_data, batch_size),
                       model=model,
                       optimizer=optimizer,
                       cost=nll_upper_bound,
                       outputs=[nll_upper_bound],
                       n_steps=n_steps,
                       extension=extension,
                       lr_iterations=lr_iterations,
                       k_speedOfconvergence=kSchedSamp)
mainloop.run()"""

data = Iterator(test_data, batch_size)
test_fn = theano.function(inputs=[x, y],
                          # givens={x: Xtest},
                          # on_unused_input='ignore',
                          allow_input_downcast=True,
                          outputs=[prediction_val, recon_term_val, totaMSE_val, totaMAE_val,
                                   mse1_val, mse2_val, mse3_val, mse4_val, mse5_val,
                                   mae1_val, mae2_val, mae3_val, mae4_val, mae5_val,
                                   relErr1_val, relErr2_val, relErr3_val, relErr4_val, relErr5_val,
                                   propAssigned1_val, propAssigned2_val, propAssigned3_val,
                                   propAssigned4_val, propAssigned5_val],
                          updates=updates_val)
testOutput = []
testMetrics2 = []
numBatchTest = 0
for batch in data:
    # batch[0] is the aggregate signal, batch[2] the per-appliance targets
    outputGeneration = test_fn(batch[0], batch[2])  # ERROR HERE
    testOutput.append(outputGeneration[1:14])   # recon term, total mse/mae, per-appliance mse/mae
    testMetrics2.append(outputGeneration[14:])  # relErr and propAssigned per appliance
    # if (numBatchTest == 0):
    plt.figure(1)
    plt.plot(np.transpose(outputGeneration[0], [1, 0, 2])[4])
    plt.legend(appliances)
    plt.savefig(save_path + "/vrnn_dis_generated{}_Pred_0-4".format(numBatchTest))
    plt.clf()
    plt.figure(2)
    plt.plot(np.transpose(batch[2], [1, 0, 2])[4])
    plt.legend(appliances)
    plt.savefig(save_path + "/vrnn_dis_generated{}_RealDisag_0-4".format(numBatchTest))
    plt.clf()
    plt.figure(3)
    plt.plot(np.transpose(batch[0], [1, 0, 2])[4])
    plt.savefig(save_path + "/vrnn_dis_generated{}_Realagg_0-4".format(numBatchTest))
    plt.clf()
    numBatchTest += 1

testOutput = np.asarray(testOutput)
testMetrics2 = np.asarray(testMetrics2)
print(testOutput.shape)
print(testMetrics2.shape)

recon_test = testOutput[:, 0].mean()
mse_test = testOutput[:, 1].mean()
mae_test = testOutput[:, 2].mean()
mse1_test = testOutput[:, 3].mean()
mae1_test = testOutput[:, 8].mean()
mse2_test = testOutput[:, 4].mean()
mae2_test = testOutput[:, 9].mean()
mse3_test = testOutput[:, 5].mean()
mae3_test = testOutput[:, 10].mean()
mse4_test = testOutput[:, 6].mean()
mae4_test = testOutput[:, 11].mean()
mse5_test = testOutput[:, 7].mean()
mae5_test = testOutput[:, 12].mean()

relErr1_test = testMetrics2[:, 0].mean()
relErr2_test = testMetrics2[:, 1].mean()
relErr3_test = testMetrics2[:, 2].mean()
relErr4_test = testMetrics2[:, 3].mean()
relErr5_test = testMetrics2[:, 4].mean()
propAssigned1_test = testMetrics2[:, 5].mean()
propAssigned2_test = testMetrics2[:, 6].mean()
propAssigned3_test = testMetrics2[:, 7].mean()
propAssigned4_test = testMetrics2[:, 8].mean()
propAssigned5_test = testMetrics2[:, 9].mean()

fLog = open(save_path + '/output.csv', 'w')
fLog.write(str(lr_iterations) + "\n")
fLog.write(str(appliances) + "\n")
fLog.write(str(windows) + "\n")
fLog.write("logTest,mse1_test,mse2_test,mse3_test,mse4_test,mse5_test,"
           "mae1_test,mae2_test,mae3_test,mae4_test,mae5_test,mseTest,maeTest\n")
fLog.write("{},{},{},{},{},{},{},{},{},{},{},{},{}\n\n".format(
    recon_test, mse1_test, mse2_test, mse3_test, mse4_test, mse5_test,
    mae1_test, mae2_test, mae3_test, mae4_test, mae5_test, mse_test, mae_test))
fLog.write("relErr1,relErr2,relErr3,relErr4,relErr5,"
           "propAssigned1,propAssigned2,propAssigned3,propAssigned4,propAssigned5\n")
fLog.write("{},{},{},{},{},{},{},{},{},{}\n".format(
    relErr1_test, relErr2_test, relErr3_test, relErr4_test, relErr5_test,
    propAssigned1_test, propAssigned2_test, propAssigned3_test,
    propAssigned4_test, propAssigned5_test))
fLog.write("q_z_dim,p_z_dim,p_x_dim,x2s_dim,y2s_dim,z2s_dim\n")
fLog.write("{},{},{},{},{},{}\n".format(q_z_dim, p_z_dim, p_x_dim, x2s_dim, y2s_dim, z2s_dim))
fLog.write("epoch,log,kl,mse1,mse2,mse3,mse4,mse5,mae1,mae2,mae3,mae4,mae5\n")
for i, item in enumerate(mainloop.trainlog.monitor['nll_upper_bound']):
    d, e, f, g, j, k, l, m = 0, 0, 0, 0, 0, 0, 0, 0
    ep = mainloop.trainlog.monitor['epoch'][i]
    a = mainloop.trainlog.monitor['recon_term'][i]
    b = mainloop.trainlog.monitor['kl_term'][i]
    c = mainloop.trainlog.monitor['mse1'][i]
    h = mainloop.trainlog.monitor['mae1'][i]
    if (y_dim > 1):
        d = mainloop.trainlog.monitor['mse2'][i]
        j = mainloop.trainlog.monitor['mae2'][i]
    if (y_dim > 2):
        e = mainloop.trainlog.monitor['mse3'][i]
        k = mainloop.trainlog.monitor['mae3'][i]
    if (y_dim > 3):
        f = mainloop.trainlog.monitor['mse4'][i]
        l = mainloop.trainlog.monitor['mae4'][i]
    if (y_dim > 4):
        g = mainloop.trainlog.monitor['mse5'][i]
        m = mainloop.trainlog.monitor['mae5'][i]
    fLog.write("{:d},{:.2f},{:.2f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format(
        ep, a, b, c, d, e, f, g, h, j, k, l, m))

f = open(save_path + '/outputRealGeneration.pkl', 'wb')
pickle.dump(outputGeneration, f, -1)
f.close()
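For reference, the two auxiliary test metrics above have simple closed forms per appliance. A minimal NumPy sketch, where the array names pred, real, and agg are illustrative, not taken from the script:

import numpy as np

def disag_metrics(pred, real, agg):
    """Aggregate disaggregation metrics, mirroring relErr*_val and
    propAssigned*_val above (pred/real: one appliance; agg: mains)."""
    tot_pred, tot_real = pred.sum(), real.sum()
    # signed relative error of total predicted energy
    rel_err = (tot_pred - tot_real) / max(tot_pred, tot_real)
    # share of total energy assigned to the right appliance
    prop_assigned = 1.0 - np.abs(pred - real).sum() / (2.0 * agg.sum())
    return rel_err, prop_assigned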
def _get_cost2(self, output, truth, rescore=True):
    if not hasattr(self, '_lambda_obj'):
        lambda_obj, lambda_noobj, thresh = T.scalar('lambda_obj'), T.scalar('lambda_noobj'), T.scalar('thresh')
        self._lambda_obj, self._lambda_noobj, self._thresh = lambda_obj, lambda_noobj, thresh
    else:
        lambda_obj, lambda_noobj, thresh = self._lambda_obj, self._lambda_noobj, self._thresh

    cost = 0.

    # create grid for cells
    w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
    x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
    y, x = meshgrid(x, y)

    # reshape truth to match with cells
    truth_cell = truth.dimshuffle(0, 1, 2, 'x', 'x')
    x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

    # calculate overlap between cells and ground truth boxes
    xi, yi = T.maximum(truth_cell[:, :, 0], x - w_cell / 2), T.maximum(truth_cell[:, :, 1], y - h_cell / 2)
    xf = T.minimum(truth_cell[:, :, [0, 2]].sum(axis=2), x + w_cell / 2)
    yf = T.minimum(truth_cell[:, :, [1, 3]].sum(axis=2), y + h_cell / 2)
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)

    # overlap between cell and ground truth box, as a fraction of cell area
    overlap = (w * h) / (w_cell * h_cell)

    # repeat truth boxes
    truth_boxes = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')

    # create grid for anchor boxes
    anchors = T.concatenate((x.dimshuffle(0, 1, 'x', 'x', 2, 3) - w_cell / 2,
                             y.dimshuffle(0, 1, 'x', 'x', 2, 3) - h_cell / 2), axis=3)
    anchors = T.concatenate((anchors, T.ones_like(anchors)), axis=3)
    anchors = T.repeat(anchors, len(self.boxes), axis=2)

    w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr', borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr', borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    anchors = T.set_subtensor(anchors[:, :, :, 2], anchors[:, :, :, 2] * w_acr)
    anchors = T.set_subtensor(anchors[:, :, :, 3], anchors[:, :, :, 3] * h_acr)

    # find iou between anchors and ground truths
    xi, yi = T.maximum(truth_boxes[:, :, :, 0], anchors[:, :, :, 0]), T.maximum(truth_boxes[:, :, :, 1], anchors[:, :, :, 1])
    xf = T.minimum(truth_boxes[:, :, :, [0, 2]].sum(axis=3), anchors[:, :, :, [0, 2]].sum(axis=3))
    yf = T.minimum(truth_boxes[:, :, :, [1, 3]].sum(axis=3), anchors[:, :, :, [1, 3]].sum(axis=3))
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
    isec = w * h
    iou = isec / (T.prod(truth_boxes[:, :, :, [2, 3]], axis=3) + T.prod(anchors[:, :, :, [2, 3]], axis=3) - isec)

    overlap = overlap.dimshuffle(0, 1, 'x', 2, 3)
    best_iou_obj_idx = T.argmax(iou, axis=1).dimshuffle(0, 'x', 1, 2, 3)
    best_iou_box_idx = T.argmax(iou, axis=2).dimshuffle(0, 1, 'x', 2, 3)

    _, obj_idx, box_idx, _, _ = meshgrid(
        T.arange(truth.shape[0]),
        T.arange(truth.shape[1]),
        T.arange(len(self.boxes)),
        T.arange(self.output_shape[0]),
        T.arange(self.output_shape[1])
    )

    # logical matrix assigning each object to the correct anchor box and cell
    best_iou_idx = T.bitwise_and(
        T.bitwise_and(
            T.eq(best_iou_box_idx, box_idx),
            T.eq(best_iou_obj_idx, obj_idx)
        ),
        overlap >= thresh
    )

    constants = []
    if rescore:
        # scale predictions correctly
        pred = output.dimshuffle(0, 'x', 1, 2, 3, 4)
        pred = T.set_subtensor(pred[:, :, :, 0], pred[:, :, :, 0] + x.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 1], pred[:, :, :, 1] + y.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 2], w_acr * T.exp(pred[:, :, :, 2]))
        pred = T.set_subtensor(pred[:, :, :, 3], h_acr * T.exp(pred[:, :, :, 3]))

        xi, yi = T.maximum(pred[:, :, :, 0], truth_boxes[:, :, :, 0]), T.maximum(pred[:, :, :, 1], truth_boxes[:, :, :, 1])
        xf = T.minimum(pred[:, :, :, [0, 2]].sum(axis=3), truth_boxes[:, :, :, [0, 2]].sum(axis=3))
        yf = T.minimum(pred[:, :, :, [1, 3]].sum(axis=3), truth_boxes[:, :, :, [1, 3]].sum(axis=3))
        w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)
        isec = w * h
        iou = isec / (pred[:, :, :, [2, 3]].prod(axis=3) + truth_boxes[:, :, :, [2, 3]].prod(axis=3) - isec)

        # make sure iou is considered constant when taking the gradient
        constants.append(iou)

    # format ground truths correctly
    truth_boxes = T.repeat(
        T.repeat(
            T.repeat(truth_boxes, len(self.boxes), axis=2),
            self.output_shape[0], axis=4
        ),
        self.output_shape[1], axis=5
    )
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 0], truth_boxes[:, :, :, 0] - anchors[:, :, :, 0])
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 1], truth_boxes[:, :, :, 1] - anchors[:, :, :, 1])
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 2], T.log(truth_boxes[:, :, :, 2] / anchors[:, :, :, 2]))
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 3], T.log(truth_boxes[:, :, :, 3] / anchors[:, :, :, 3]))

    # add dimension for objects per image
    pred = T.repeat(output.dimshuffle(0, 'x', 1, 2, 3, 4), truth.shape[1], axis=1)

    # penalize coordinates
    cost += lambda_obj * T.mean(((pred[:, :, :, :4] - truth_boxes[:, :, :, :4])**2).sum(axis=3)[best_iou_idx.nonzero()])

    # penalize class scores
    cost += lambda_obj * T.mean((-truth_boxes[:, :, :, -self.num_classes:] * T.log(pred[:, :, :, -self.num_classes:])).sum(axis=3)[best_iou_idx.nonzero()])

    # penalize objectness score
    if rescore:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - iou)**2)[best_iou_idx.nonzero()])
    else:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - 1.)**2)[best_iou_idx.nonzero()])

    # flip all matched entries and penalize the objectness score
    # of every box that was matched to no object
    not_matched_idx = best_iou_idx.sum(axis=1) > 0
    not_matched_idx = bitwise_not(not_matched_idx)
    cost += lambda_noobj * T.mean((pred[:, 0, :, 4]**2)[not_matched_idx.nonzero()])

    return cost, constants
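The xi/yi/xf/yf pattern used repeatedly in _get_cost2 is the standard intersection-over-union for boxes stored as (x, y, w, h) with a top-left origin. A NumPy sketch under that assumption:

import numpy as np

def iou_xywh(a, b):
    """IoU of two boxes given as (x, y, w, h), top-left origin,
    mirroring the xi/yi/xf/yf pattern above."""
    xi, yi = max(a[0], b[0]), max(a[1], b[1])
    xf = min(a[0] + a[2], b[0] + b[2])  # same role as [0, 2].sum()
    yf = min(a[1] + a[3], b[1] + b[3])
    w, h = max(xf - xi, 0.0), max(yf - yi, 0.0)
    isec = w * h
    return isec / (a[2] * a[3] + b[2] * b[3] - isec)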
def backward(self, y):
    # append the remaining mass so the output coordinates sum to one
    remaining = 1 - tt.sum(y, axis=-1, keepdims=True)
    return tt.concatenate([y, remaining], axis=-1)
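This reads as the inverse of a drop-last simplex mapping. A NumPy sketch of the pair, where the forward direction is an assumption (only backward appears in the source):

import numpy as np

def forward(p):
    # drop the last, redundant coordinate of a point on the simplex
    return p[..., :-1]

def backward(y):
    # reconstruct the dropped coordinate so everything sums to one
    remaining = 1 - y.sum(axis=-1, keepdims=True)
    return np.concatenate([y, remaining], axis=-1)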
def concatenate(tensors, axis=-1):
    # thin wrapper over T.concatenate that defaults to the last axis
    return T.concatenate(tensors, axis=axis)
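Hypothetical usage, showing the changed default axis relative to T.concatenate (whose default is axis=0):

import theano.tensor as T

a, b = T.matrix('a'), T.matrix('b')
c = concatenate([a, b])          # joins along the last axis (columns)
d = concatenate([a, b], axis=0)  # joins along rows, T.concatenate's default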
def _get_cost(self, output, truth, rescore=True):
    if not hasattr(self, '_lambda_obj'):
        lambda_obj, lambda_noobj, lambda_anchor = T.scalar('lambda_obj'), T.scalar('lambda_noobj'), T.scalar('lambda_anchor')
        self._lambda_obj, self._lambda_noobj, self._lambda_anchor = lambda_obj, lambda_noobj, lambda_anchor
    else:
        lambda_obj, lambda_noobj, lambda_anchor = self._lambda_obj, self._lambda_noobj, self._lambda_anchor
    # lambda_obj, lambda_noobj, lambda_anchor = 1., 5., 0.1

    w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
    x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
    y, x = meshgrid(x, y)
    x, y = x.dimshuffle('x', 'x', 'x', 0, 1), y.dimshuffle('x', 'x', 'x', 0, 1)

    # create anchors for later
    w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr').dimshuffle('x', 0, 'x', 'x', 'x') * T.ones_like(x)
    h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr').dimshuffle('x', 0, 'x', 'x', 'x') * T.ones_like(y)
    anchors = T.concatenate((x * T.ones_like(w_acr), y * T.ones_like(h_acr), w_acr, h_acr), axis=2)
    anchors = T.repeat(anchors, truth.shape[0], axis=0)

    cell_coord = T.concatenate((x, y), axis=2)
    gt_coord = (truth[:, :, :2] + truth[:, :, 2:4] / 2).dimshuffle(0, 1, 2, 'x', 'x')
    gt_dist = T.sum((gt_coord - cell_coord)**2, axis=2).reshape((truth.shape[0], truth.shape[1], -1))

    # assign a unique cell to each object per example
    cell_idx = argmin_unique(gt_dist, 1, 2).reshape((-1,))
    row_idx = T.cast(cell_idx // self.output_shape[1], 'int64')
    col_idx = cell_idx - row_idx * self.output_shape[1]
    num_idx = T.repeat(T.arange(truth.shape[0]).reshape((-1, 1)), truth.shape[1], axis=1).reshape((-1,))
    obj_idx = T.repeat(T.arange(truth.shape[1]).reshape((1, -1)), truth.shape[0], axis=0).reshape((-1,))

    # an object further than 1 away from its cell is a garbage example
    valid_example = gt_dist[num_idx, obj_idx, cell_idx] < 1
    num_idx, obj_idx = num_idx[valid_example.nonzero()], obj_idx[valid_example.nonzero()]
    row_idx, col_idx = row_idx[valid_example.nonzero()], col_idx[valid_example.nonzero()]

    truth_flat = truth[num_idx, obj_idx, :].dimshuffle(0, 'x', 1)
    pred_matched = output[num_idx, :, :, row_idx, col_idx]
    x, y = x[:, 0, 0, row_idx, col_idx].dimshuffle(1, 0), y[:, 0, 0, row_idx, col_idx].dimshuffle(1, 0)
    w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr').dimshuffle('x', 0)
    h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr').dimshuffle('x', 0)

    # reformat prediction
    pred_shift = pred_matched
    pred_shift = T.set_subtensor(pred_shift[:, :, 2], w_acr * T.exp(pred_shift[:, :, 2]))
    pred_shift = T.set_subtensor(pred_shift[:, :, 3], h_acr * T.exp(pred_shift[:, :, 3]))
    pred_shift = T.set_subtensor(pred_shift[:, :, 0], pred_shift[:, :, 0] + T.repeat(x, pred_shift.shape[1], axis=1) - pred_shift[:, :, 2] / 2)
    pred_shift = T.set_subtensor(pred_shift[:, :, 1], pred_shift[:, :, 1] + T.repeat(y, pred_shift.shape[1], axis=1) - pred_shift[:, :, 3] / 2)

    # calculate iou between matched predictions and ground truths
    xi = T.maximum(pred_shift[:, :, 0], truth_flat[:, :, 0])
    yi = T.maximum(pred_shift[:, :, 1], truth_flat[:, :, 1])
    xf = T.minimum(pred_shift[:, :, [0, 2]].sum(axis=2), truth_flat[:, :, [0, 2]].sum(axis=2))
    yf = T.minimum(pred_shift[:, :, [1, 3]].sum(axis=2), truth_flat[:, :, [1, 3]].sum(axis=2))
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
    isec = w * h
    union = T.prod(pred_shift[:, :, [2, 3]], axis=2) + T.prod(truth_flat[:, :, [2, 3]], axis=2) - isec
    iou = isec / union

    # calculate iou for anchors
    anchors_matched = anchors[num_idx, :, :, row_idx, col_idx]
    xi = T.maximum(anchors_matched[:, :, 0], truth_flat[:, :, 0])
    yi = T.maximum(anchors_matched[:, :, 1], truth_flat[:, :, 1])
    xf = T.minimum(anchors_matched[:, :, [0, 2]].sum(axis=2), truth_flat[:, :, [0, 2]].sum(axis=2))
    yf = T.minimum(anchors_matched[:, :, [1, 3]].sum(axis=2), truth_flat[:, :, [1, 3]].sum(axis=2))
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
    isec = w * h
    union = T.prod(anchors_matched[:, :, [2, 3]], axis=2) + T.prod(truth_flat[:, :, [2, 3]], axis=2) - isec
    iou_acr = isec / union

    # pick the anchor with the highest iou against each ground truth
    acr_idx = T.argmax(iou_acr, axis=1)

    # reformat truth
    truth_formatted = truth_flat
    truth_formatted = T.repeat(truth_formatted, len(self.boxes), axis=1)
    truth_formatted = T.set_subtensor(truth_formatted[:, :, 0], truth_formatted[:, :, 0] + truth_formatted[:, :, 2] / 2 - T.repeat(x, truth_formatted.shape[1], axis=1))
    truth_formatted = T.set_subtensor(truth_formatted[:, :, 1], truth_formatted[:, :, 1] + truth_formatted[:, :, 3] / 2 - T.repeat(y, truth_formatted.shape[1], axis=1))
    truth_formatted = T.set_subtensor(truth_formatted[:, :, 2], T.log(truth_formatted[:, :, 2] / w_acr))
    truth_formatted = T.set_subtensor(truth_formatted[:, :, 3], T.log(truth_formatted[:, :, 3] / h_acr))
    truth_formatted = truth_formatted[T.arange(truth_formatted.shape[0]), acr_idx, :]

    #
    # calculate cost
    #
    item_idx = T.arange(pred_matched.shape[0])
    anchors = T.set_subtensor(anchors[:, :, :2], 0.)
    cost_noobject = lambda_noobj * (T.mean(output[:, :, 4]**2) - T.sum(pred_matched[item_idx, acr_idx, 4]**2) / output[:, :, 4].size)
    cost_anchor = lambda_anchor * (T.mean(T.sum(output[:, :, :4]**2, axis=2)) - T.sum(T.sum(pred_matched[item_idx, acr_idx, :4]**2, axis=1)) / output[:, :, 0].size)
    cost_coord = lambda_obj * T.mean(T.sum((pred_matched[item_idx, acr_idx, :4] - truth_formatted[:, :4])**2, axis=1))
    cost_class = lambda_obj * T.mean(T.sum(-truth_formatted[:, -self.num_classes:] * T.log(pred_matched[item_idx, acr_idx, -self.num_classes:]), axis=1))
    if rescore:
        cost_obj = lambda_obj * T.mean((pred_matched[item_idx, acr_idx, 4] - iou[item_idx, acr_idx])**2)
    else:
        cost_obj = lambda_obj * T.mean((pred_matched[item_idx, acr_idx, 4] - 1)**2)
    cost = cost_noobject + cost_obj + cost_anchor + cost_coord + cost_class

    return cost, [iou], [row_idx, col_idx, acr_idx, cost_noobject, cost_anchor, cost_coord, cost_class, cost_obj]
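When anchor and truth centers coincide, the anchor-selection step above reduces to an IoU over widths and heights alone. A simplified NumPy sketch of that argmax (a common approximation, not the exact graph above):

import numpy as np

def best_anchor(truth_wh, anchor_whs):
    """Pick the anchor whose shape best matches a ground-truth box:
    with centers aligned, IoU depends only on (w, h)."""
    w = np.minimum(truth_wh[0], anchor_whs[:, 0])
    h = np.minimum(truth_wh[1], anchor_whs[:, 1])
    isec = w * h
    union = truth_wh[0] * truth_wh[1] + anchor_whs[:, 0] * anchor_whs[:, 1] - isec
    return int(np.argmax(isec / union))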
def concat(tensor_list, axis):
    # positional wrapper over T.concatenate
    return T.concatenate(tensor_list, axis=axis)
def detect(self, im, thresh=0.75, overlap=0.5, num_to_label=None, return_iou=False):
    im = format_image(im, dtype=theano.config.floatX)
    old_size = im.shape[:2]
    im = cv2.resize(im, self.input_shape[::-1], interpolation=cv2.INTER_LINEAR)
    im = im.swapaxes(2, 1).swapaxes(1, 0).reshape((1, 3) + self.input_shape)

    if not hasattr(self, '_detect_fn'):
        '''
        Make theano do all the heavy lifting for detection; this should
        speed up the process marginally.
        '''
        output = self.output_test
        if self.use_custom_cost:
            # regroup the flat channel axis into one slot per anchor box
            new_output = None
            for i in range(len(self.boxes)):
                cls_idx = T.arange(i * (5 + self.num_classes), (i + 1) * (5 + self.num_classes))
                if new_output is None:
                    new_output = output[:, cls_idx, :, :].dimshuffle(0, 'x', 1, 2, 3)
                else:
                    new_output = T.concatenate((new_output, output[:, cls_idx, :, :].dimshuffle(0, 'x', 1, 2, 3)), axis=1)
            output = new_output

        thresh_var = T.scalar(name='thresh')

        # define offsets to predictions
        w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
        x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
        y, x = meshgrid(x, y)
        x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

        # define scale
        w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr', borrow=True).dimshuffle('x', 0, 'x', 'x')
        h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr', borrow=True).dimshuffle('x', 0, 'x', 'x')

        # rescale output into (x1, y1, x2, y2) corner format
        output = T.set_subtensor(output[:, :, 2], w_acr * T.exp(output[:, :, 2]))
        output = T.set_subtensor(output[:, :, 3], h_acr * T.exp(output[:, :, 3]))
        output = T.set_subtensor(output[:, :, 0], output[:, :, 0] + x - output[:, :, 2] / 2)
        output = T.set_subtensor(output[:, :, 1], output[:, :, 1] + y - output[:, :, 3] / 2)
        output = T.set_subtensor(output[:, :, 2:4], output[:, :, 2:4] + output[:, :, :2])

        # define confidence in prediction
        conf = output[:, :, 4] * T.max(output[:, :, -self.num_classes:], axis=2)
        cls = T.argmax(output[:, :, -self.num_classes:], axis=2)

        # filter out everything below thresh
        above_thresh_idx = conf > thresh_var
        pred = T.concatenate(
            (
                output[:, :, 0][above_thresh_idx.nonzero()].dimshuffle(0, 'x'),
                output[:, :, 1][above_thresh_idx.nonzero()].dimshuffle(0, 'x'),
                output[:, :, 2][above_thresh_idx.nonzero()].dimshuffle(0, 'x'),
                output[:, :, 3][above_thresh_idx.nonzero()].dimshuffle(0, 'x'),
                conf[above_thresh_idx.nonzero()].dimshuffle(0, 'x'),
                cls[above_thresh_idx.nonzero()].dimshuffle(0, 'x')
            ),
            axis=1
        )
        iou_matrix = utils.iou_matrix(pred)
        self._detect_fn = theano.function([self.input, thresh_var], [pred, iou_matrix])

    output, iou_matrix = self._detect_fn(im, thresh)

    boxes = []
    for i in range(output.shape[0]):
        # coord is already (x1, y1, x2, y2) from the graph-side rescaling
        coord, conf, cls = output[i, :4], output[i, 4], output[i, 5]
        if num_to_label is not None:
            cls = num_to_label[cls]
        box = utils.BoundingBox(*coord.tolist(), confidence=conf, cls=cls)
        boxes.append(box)
    boxes = [b * old_size for b in boxes]

    if return_iou:
        return boxes, iou_matrix
    return boxes
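detect returns the thresholded boxes plus their pairwise IoU matrix, which a caller would typically feed into greedy non-maximum suppression using the overlap argument. A sketch of that caller-side step, assuming utils.iou_matrix yields pairwise IoU and the boxes expose a confidence attribute:

import numpy as np

def nms(boxes, iou_matrix, overlap=0.5):
    """Greedy non-maximum suppression over detect()'s output: keep the
    most confident box, drop later boxes overlapping a kept one."""
    order = sorted(range(len(boxes)), key=lambda i: -boxes[i].confidence)
    keep = []
    for i in order:
        if all(iou_matrix[i, j] <= overlap for j in keep):
            keep.append(i)
    return [boxes[i] for i in keep]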