def test_vector_clf_curve():
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    curve = tmetrics.classification._vector_clf_curve(yt, yp)
    f = theano.function([yt, yp], curve, allow_input_downcast=True)
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.random.random(10).astype('float32')
    fps, tps, thresholds = f(true, predicted)
    s_fps, s_tps, s_thresholds = sklearn.metrics.ranking._binary_clf_curve(true, predicted)
    np.set_printoptions(suppress=True)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'fps'
    print fps
    print 'sklearn fps'
    print s_fps
    print 'tps'
    print tps
    print 'sklearn tps'
    print s_tps
    print 'threshold values'
    print thresholds
    print 'sklearn threshold values'
    print s_thresholds
    assert np.allclose(fps, s_fps)
    assert np.allclose(tps, s_tps)
    assert np.allclose(thresholds, s_thresholds)
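# Reading the curve tuple (it mirrors sklearn's private _binary_clf_curve):
# thresholds holds the distinct predicted scores in decreasing order, and
# fps/tps are the cumulative false/true positive counts obtained when
# everything scoring at or above each threshold is labeled positive.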
def test_cudnn_softmax_grad_opt(self):
    # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is
    # applied when cudnn is required
    y = T.fvector("y")
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 0

    # Verify that the optimization is not applied when cudnn is excluded
    # or not available
    mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
    y = T.fvector("y")
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_wo_cudnn)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad)]) == 0
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 1

    # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization does not
    # crash on a manually constructed graph
    y = T.fvector("y")
    o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
    f = theano.function([y], o, mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 0
def test_0():
    N = 16 * 1000 * 10 * 1
    if 1:
        aval = abs(numpy.random.randn(N).astype('float32')) + .1
        bval = numpy.random.randn(N).astype('float32')
        a = T.fvector()
        b = T.fvector()
    else:
        aval = abs(numpy.random.randn(N)) + .1
        bval = numpy.random.randn(N)
        a = T.dvector()
        b = T.dvector()
    f = theano.function([a, b], T.pow(a, b), mode='LAZY')
    theano_opencl.elemwise.swap_impls = False
    g = theano.function([a, b], T.pow(a, b), mode='LAZY')
    print 'ocl time', timeit.Timer(lambda: f(aval, bval)).repeat(3, 3)
    print 'gcc time', timeit.Timer(lambda: g(aval, bval)).repeat(3, 3)
    print 'numpy time', timeit.Timer(lambda: aval ** bval).repeat(3, 3)
    assert ((f(aval, bval) - aval ** bval) ** 2).sum() < 1.1
    assert ((g(aval, bval) - aval ** bval) ** 2).sum() < 1.1
def __init__(self, name, path, learning_rate=0.001):
    self.r_symbol = T.fvector('r')
    self.gamma_symbol = T.fscalar('gamma')
    self.action_symbol = T.fmatrix('action')
    self.y_symbol = T.fvector('y')
    super(ReinforcementModel, self).__init__(
        name, path, learning_rate=learning_rate)
def setUp(self):
    self.x_true = np.random.uniform(size=5).astype('float32')
    self.x_false = np.random.uniform(size=5).astype('float32')
    x_true_var = T.fvector()
    x_false_var = T.fvector()
    self.test = function(inputs=[x_true_var, x_false_var],
                         outputs=max_margin_loss(x_true_var, x_false_var, 1))
def optimize(self, train_data, lam, fixed_length=3):
    i = T.iscalar('i')
    lr = T.fscalar('lr')
    Xl = T.fvector('Xl')
    Xr = T.fvector('Xr')
    cost = self.ae.cost(Xl, Xr)  # + lam * self.ae.penalty()
    grads = T.grad(cost, self.ae.params)
    update_vars = []
    for var, gvar in zip(self.ae.params, grads):
        if var.get_value().ndim == 1:
            update_vars.append((var, var - 0.1 * lr * gvar))
        # elif var.get_value().ndim > 1:
        #     new_param = var - lr * gvar
        #     len_W = T.sqrt(T.sum(new_param ** 2, axis=0))
        #     desired_W = T.clip(len_W, 0., fixed_length)
        #     ratio = desired_W / (len_W + 1e-7)
        #     new_param = new_param * ratio
        #     update_vars.append((var, new_param))
        else:
            update_vars.append((var, var - lr * gvar))
    opt = theano.function([i, lr], cost, updates=update_vars,
                          givens={Xl: train_data[i, 0],
                                  Xr: train_data[i, 1]})  # , allow_input_downcast=True
    # get_grad = theano.function([], grads[3], givens={X: train_data[0]}, allow_input_downcast=True)
    # get_gradb = theano.function([], grads[-1], givens={X: train_data[0]}, allow_input_downcast=True)
    return opt  # , get_grad, get_gradb
def test_brier_score_loss_from_scikit_learn_example():
    """
    from sklearn docs...

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import brier_score_loss
    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
    >>> brier_score_loss(y_true, y_prob)
    0.037...
    """
    y_true = T.fvector('y_true')
    y_predicted = T.fvector('y_predicted')
    brier_score = tmetrics.brier_score_loss(y_true, y_predicted)
    f = theano.function([y_true, y_predicted], brier_score)
    yt = np.array([0, 1, 1, 0], 'float32')
    yp = np.array([.1, .9, .8, .3], theano.config.floatX)
    refscore = sklearn.metrics.brier_score_loss(yt, yp)
    tol = .01
    score = f(yt, yp)
    assert (refscore - tol) < score < (refscore + tol)
    # also test that the function is numpy/pandas compatible
    assert (refscore - tol) < tmetrics.brier_score_loss(yt, yp) < (refscore + tol)
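# A plain-numpy cross-check of the same quantity: the Brier score is simply the
# mean squared difference between predicted probabilities and binary outcomes,
# so for the example above (0.01 + 0.01 + 0.04 + 0.09) / 4 == 0.0375.
def test_brier_score_loss_numpy_identity():
    yt = np.array([0, 1, 1, 0], 'float32')
    yp = np.array([.1, .9, .8, .3], 'float32')
    assert np.allclose(np.mean((yp - yt) ** 2),
                       sklearn.metrics.brier_score_loss(yt, yp))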
def setUp(self):
    self.x_true = np.random.uniform(low=0, high=1, size=5).astype('float32')
    self.x_false_list = [np.random.uniform(low=0, high=1, size=5).astype('float32')
                         for i in range(10)]
    x_true_var = T.fvector()
    x_false_var_list = [T.fvector() for t in self.x_false_list]
    self.test = function(inputs=[x_true_var] + x_false_var_list,
                         outputs=negative_sampling_loss(x_true_var, x_false_var_list))
def __init__(self, input_layers, *args, **kwargs):
    super(RMSEObjective, self).__init__(input_layers, *args, **kwargs)
    self.input_systole = input_layers["systole:value"]
    self.input_diastole = input_layers["diastole:value"]
    self.target_vars["systole:value"] = T.fvector("systole_target_value")
    self.target_vars["diastole:value"] = T.fvector("diastole_target_value")
def theanoVecVecMul(In1, In2, opt):
    var1 = T.fvector('var1')
    var2 = T.fvector('var2')
    if opt == 'M':
        var3 = T.dot(var1, var2)
    else:
        var3 = T.mul(var1, var2)
    vec_fn = function([var1, var2], var3)
    return vec_fn(In1, In2)
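# Hedged usage sketch (assumes numpy is imported as np; the opt codes are read
# off the branch above: 'M' selects the scalar dot product, anything else the
# element-wise product):
def demo_vec_vec_mul():
    a = np.array([1., 2., 3.], dtype='float32')
    b = np.array([4., 5., 6.], dtype='float32')
    print theanoVecVecMul(a, b, 'M')  # 32.0
    print theanoVecVecMul(a, b, 'E')  # [  4.  10.  18.]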
def __init__(self, num_emb, emb_dim, hidden_dim, output_dim,
             degree=2, learning_rate=0.01, momentum=0.9,
             trainable_embeddings=True,
             labels_on_nonroot_nodes=False):
    assert emb_dim > 1 and hidden_dim > 1
    self.num_emb = num_emb
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.degree = degree
    self.learning_rate = learning_rate
    self.momentum = momentum

    self.params = []
    self.embeddings = theano.shared(self.init_matrix([self.num_emb, self.emb_dim]))
    if trainable_embeddings:
        self.params.append(self.embeddings)

    self.x = T.ivector(name='x')        # word indices
    self.tree = T.imatrix(name='tree')  # shape [None, self.degree]
    if labels_on_nonroot_nodes:
        self.y = T.fmatrix(name='y')    # output shape [None, self.output_dim]
        self.y_exists = T.fvector(name='y_exists')  # shape [None]
    else:
        self.y = T.fvector(name='y')    # output shape [self.output_dim]

    self.num_words = self.x.shape[0]  # total number of nodes (leaves + internal) in tree
    emb_x = self.embeddings[self.x]
    emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x')  # zero-out non-existent embeddings

    self.tree_states = self.compute_tree(emb_x, self.tree)
    self.final_state = self.tree_states[-1]
    if labels_on_nonroot_nodes:
        self.output_fn = self.create_output_fn_multi()
        self.pred_y = self.output_fn(self.tree_states)
        self.loss = self.loss_fn_multi(self.y, self.pred_y, self.y_exists)
    else:
        self.output_fn = self.create_output_fn()
        self.pred_y = self.output_fn(self.final_state)
        self.loss = self.loss_fn(self.y, self.pred_y)

    updates = self.gradient_descent(self.loss)

    train_inputs = [self.x, self.tree, self.y]
    if labels_on_nonroot_nodes:
        train_inputs.append(self.y_exists)
    self._train = theano.function(train_inputs,
                                  [self.loss, self.pred_y],
                                  updates=updates)
    self._evaluate = theano.function([self.x, self.tree], self.final_state)
    self._predict = theano.function([self.x, self.tree], self.pred_y)
def test_roc_auc_score():
    true = np.random.binomial(n=1, p=.5, size=50).astype('float32')
    # true = np.array([0, 0, 1, 1]).astype('float32')
    predicted = np.random.random(size=50).astype('float32')
    # predicted = np.array([0.1, 0.4, 0.35, 0.8]).astype('float32')
    yt = T.fvector('y_true')
    yp = T.fvector('y_predicted')
    roc_auc_score_expr = tmetrics.classification.roc_auc_score(yt, yp)
    refscore = sklearn.metrics.roc_auc_score(true, predicted)
    print 'refscore'
    print refscore
    f = theano.function([yt, yp], roc_auc_score_expr)
    score = f(true, predicted)
    print 'score'
    print score
    try:
        assert np.allclose(refscore, score)
    except AssertionError:
        fps, tps, thresholds = tmetrics.classification._binary_clf_curve(yt, yp)
        fpr, tpr, _thresh = tmetrics.classification.roc_curve(yt, yp)
        f = theano.function([yt, yp],
                            [fps, tps, thresholds, fpr, tpr, _thresh,
                             roc_auc_score_expr])
        result = f(true, predicted)
        print '** tmetrics **'
        print 'fps'
        print result[0]
        print 'tps'
        print result[1]
        print 'thresholds'
        print result[2]
        print 'fpr'
        print result[3]
        print 'tpr'
        print result[4]
        print '_thresh'
        print result[5]
        print 'roc score'
        print result[6]
        print '** refscore **'
        curve = sklearn.metrics.ranking._binary_clf_curve(true, predicted)
        print 'fpr'
        print curve[0]
        print 'tpr'
        print curve[1]
        print 'thresholds'
        print curve[2]
        trapz = np.trapz(curve[1], curve[0])
        print 'trapz'
        print trapz
        print 'auc'
        print sklearn.metrics.ranking.auc(curve[0], curve[1])
        print 'roc_curve'
        print sklearn.metrics.roc_curve(true, predicted)
        raise
def main():
    # load in the data set
    dataset_for_error = '/vega/stats/users/sl3368/Data_LC/NormData/LC_stim_15.mat'
    stimuli = load_class_data_batch(dataset_for_error)
    stim = stimuli[0]
    data = theano.shared(stim, borrow=True)
    print 'Number of rows: '
    print stim.shape[0]

    # shared variable accumulating the error
    init = numpy.float64(0.0)
    mean_error = shared(init)

    # theano expressions computing the mean square error for a one-step lag
    prediction = T.fvector('predict')  # 60-element vector representing time t
    real = T.fvector('real')           # vector representing time t+1
    cost = T.mean((real - prediction) ** 2)

    # function that returns the cost and accumulates it into mean_error
    batch_error = theano.function([prediction, real], cost,
                                  updates=[(mean_error, mean_error + cost)])

    increment = stim.shape[0] / 100

    # iterate over the batch and accumulate the error
    for index in range(stim.shape[0] - 1):
        if index % increment == 0:
            print str(index / increment) + '% done...'
        recent = batch_error(stim[index], stim[index + 1])
    # m_e_avg = mean_error / 9000000

    print 'Total error: '
    print mean_error.get_value()

    print 'Finding padding amount...'
    num_zero = float(0.0)
    # count the all-zero (padding) rows
    for index in range(stim.shape[0]):
        is_zero = True
        for i in range(60):
            if stim[index][i] != 0:
                is_zero = False
        if is_zero:
            num_zero = num_zero + 1
    print 'Percent Zero: ' + str(float(num_zero / (increment * 100)))
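# One-shot numpy cross-check of the accumulated quantity above (a sketch:
# batch_error adds one per-pair mean per step, so after the loop mean_error
# equals the global mean over adjacent-row differences times the number of pairs):
def total_one_step_error(stim):
    return numpy.mean((stim[1:] - stim[:-1]) ** 2) * (stim.shape[0] - 1)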
def test_softmax_grad(self):
    def cmp(n, m, f, f_gpu):
        data = numpy.arange(n * m, dtype="float32").reshape(n, m)
        gdata = numpy.asarray(data)[:, :, None, None]
        out = f(data)
        gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
        utt.assert_allclose(out, gout)

    x = T.matrix("x", "float32")
    x_gpu = T.tensor4("x_gpu", "float32")
    f_z = T.nnet.softmax_op
    f_gpu = dnn.GpuDnnSoftmax("accurate", "channel")

    # Verify the grad operation
    dims = (2, 3, 4, 5)
    gdata = numpy.arange(numpy.product(dims), dtype="float32").reshape(dims)
    T.verify_grad(f_gpu, [gdata], rng=numpy.random, mode=mode_with_gpu)

    # Verify that the CPU and GPU implementations return the same results
    # up to a tolerance.
    self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)
    self._test_softmax(x, x, f_z, f_z, self._cmp)

    # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad optimization is
    # applied when cudnn is required
    y = T.fvector("y")
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 0

    # Verify that the optimization is not applied when cudnn is excluded
    # or not available
    mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
    y = T.fvector("y")
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_wo_cudnn)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 0
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 1

    # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization does not
    # crash on a manually constructed graph
    y = T.fvector("y")
    o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
    f = theano.function([y], o, mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)]) == 0
def get_div_function(self):
    tind = T.ivector('ind')
    if self.NMF_updates == 'beta':
        self.div = theano.function(inputs=[tind],
                                   outputs=costs.beta_div(self.X_buff[tind[1]:tind[2], ],
                                                          self.W[tind[0]].T,
                                                          self.H[tind[3]:tind[4], ],
                                                          self.beta),
                                   name="div",
                                   allow_input_downcast=True)
    if self.NMF_updates == 'groupNMF':
        tcomp = T.ivector('comp')
        tlambda = T.fvector('lambda')
        tSc = T.ivector('Sc')
        tCs = T.ivector('Cs')
        tparams = [tind, tcomp, tlambda, tSc, tCs]
        cost, beta_div, cls_dist, ses_dist = costs.group_div(self.X_buff[tind[1]:tind[2], ],
                                                             self.W,
                                                             self.H[tind[3]:tind[4], ],
                                                             self.beta,
                                                             tparams)
        self.div = theano.function(inputs=[tind, tcomp, tlambda, tSc, tCs],
                                   outputs=[cost, beta_div, cls_dist, ses_dist],
                                   name="div",
                                   allow_input_downcast=True,
                                   on_unused_input='ignore')
    if self.NMF_updates == 'noiseNMF':
        tcomp = T.ivector('comp')
        tlambda = T.fvector('lambda')
        tSc = T.ivector('Sc')
        tparams = [tind, tcomp, tlambda, tSc]
        cost, beta_div, cls_dist, ses_dist = costs.noise_div(self.X_buff[tind[1]:tind[2], ],
                                                             self.W,
                                                             self.Wn,
                                                             self.H[tind[3]:tind[4], ],
                                                             self.beta,
                                                             tparams)
        self.div = theano.function(inputs=[tind, tcomp, tlambda, tSc],
                                   outputs=[cost, beta_div, cls_dist, ses_dist],
                                   name="div",
                                   allow_input_downcast=True,
                                   on_unused_input='ignore')
def test_1D_roc_auc_scores():
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    y = np.array([0, 0, 1, 1]).astype('float32')
    scores = np.array([0.1, 0.4, 0.35, 0.8]).astype('float32')
    ref_fpr, ref_tpr, ref_thresh = sklearn.metrics.roc_curve(y, scores)
    roc_auc_scores = tmetrics.classification.roc_auc_scores(yt, yp)
    fpr, tpr, thresh = tmetrics.classification.roc_curves(yt, yp)
    f = theano.function([yt, yp], [fpr, tpr, thresh, roc_auc_scores])
    score_fpr, score_tpr, score_thresh, score_auc = f(y, scores)
    assert np.allclose(ref_fpr, score_fpr)
    assert np.allclose(ref_tpr, score_tpr)
    assert np.allclose(ref_thresh, score_thresh)
    assert np.allclose(sklearn.metrics.roc_auc_score(y, scores), score_auc)
def test_precision_recall_curves_vector(n_iter=1):
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    p_expr, r_expr, thresh_expr = tmetrics.classification.precision_recall_curves(yt, yp)
    f = theano.function([yt, yp], [p_expr, r_expr, thresh_expr])
    for iterator in xrange(n_iter):
        y = np.random.binomial(n=1, p=.5, size=20).astype('float32')
        scores = np.random.random(20).astype('float32')
        ref_precision, ref_recall, ref_thresh = sklearn.metrics.precision_recall_curve(y, scores)
        precision, recall, thresh = f(y, scores)
        # assert np.allclose(ref_precision, precision)
        # assert np.allclose(ref_recall, recall)
        # assert np.allclose(ref_thresh, thresh)
        try:
            assert np.allclose(sklearn.metrics.auc(ref_recall, ref_precision),
                               sklearn.metrics.auc(recall, precision))
        except AssertionError:
            print 'n_iter: {}'.format(n_iter)
            print 'y'
            print y
            print 'scores'
            print scores
            print 'ref precision'
            print ref_precision
            print ref_precision.shape
            # print np.r_[precision[1:], 1]
            # print np.allclose(ref_precision, np.r_[precision[1:], 1])
            print sklearn.metrics.auc(ref_recall, ref_precision)
            print sklearn.metrics.auc(recall, precision)
            print
            print 'ref recall'
            print ref_recall
            print ref_recall.shape
            print
            print 'ref thresh'
            print ref_thresh
            print ref_thresh.shape
            print
            print 'score precision'
            print precision
            print precision.shape
            print
            print 'score recall'
            print recall
            print recall.shape
            print
            print 'score threshold'
            print thresh
            print thresh.shape
            raise
def test_elemwise4():
    """Test that two vectors can be broadcast to form an outer
    product (by performing a rank-1 matrix update)."""
    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32'), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    f = pfunc([b, c], [],
              updates=[(a, (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))],
              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors
    f(theano._asarray(numpy.random.rand(4), dtype='float32'),
      theano._asarray(numpy.random.rand(3), dtype='float32'))
def test_multinomial_dtypes():
    p = tensor.dmatrix()
    u = tensor.dvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    assert m.dtype == 'float64', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    assert m.dtype == 'float32', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('float64')(p, u)
    assert m.dtype == 'float64', m.dtype
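# Hedged usage sketch (assumes theano and numpy are imported in this test
# module, as the surrounding tests suggest): beyond dtypes,
# MultinomialFromUniform turns each uniform draw u[i] into a one-hot sample
# from row p[i], chosen where the cumulative sum of p[i] first exceeds u[i].
def demo_multinomial_from_uniform():
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    f = theano.function([p, u], m)
    pvals = numpy.array([[0.2, 0.3, 0.5]], dtype='float32')
    # cumsum(pvals) = [0.2, 0.5, 1.0]; u = 0.6 is first exceeded at index 2
    return f(pvals, numpy.array([0.6], dtype='float32'))  # [[0., 0., 1.]]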
def test_hamming_loss():
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = hamming(true, predicted)
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    f = theano.function([yt, yp],
                        tmetrics.classification.hamming_loss(yt, yp),
                        allow_input_downcast=True)
    score = f(true, predicted)
    print 'true'
    print true
    print 'predicted'
    print predicted
    print 'refscore {}'.format(refscore)
    print 'score {}'.format(score)
    assert np.allclose(refscore, score)
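# Note on the reference metric: for 0/1 vectors scipy's hamming() is the
# fraction of mismatched positions, i.e. np.mean(true != predicted), which is
# exactly what the theano hamming_loss expression is being checked against.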
def find_sigma(X_shared, sigma_shared, N, perplexity, sigma_iters, metric,
               verbose=0):
    """Binary search on sigma for a given perplexity."""
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')

    target = np.log(perplexity)

    P = T.maximum(p_Xp_given_X_var(X, sigma, metric), epsilon)
    entropy = -T.sum(P * T.log(P), axis=1)

    # Setting update for binary search interval
    sigmin_shared = theano.shared(np.full(N, np.sqrt(epsilon), dtype=floath))
    sigmax_shared = theano.shared(np.full(N, np.inf, dtype=floath))

    sigmin = T.fvector('sigmin')
    sigmax = T.fvector('sigmax')

    upmin = T.switch(T.lt(entropy, target), sigma, sigmin)
    upmax = T.switch(T.gt(entropy, target), sigma, sigmax)

    givens = {X: X_shared, sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigmin_shared, upmin), (sigmax_shared, upmax)]

    update_intervals = theano.function([], entropy, givens=givens,
                                       updates=updates)

    # Setting update for sigma according to search interval
    upsigma = T.switch(T.isinf(sigmax), sigma * 2, (sigmin + sigmax) / 2.)

    givens = {sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigma_shared, upsigma)]

    update_sigma = theano.function([], sigma, givens=givens, updates=updates)

    for i in range(sigma_iters):
        e = update_intervals()
        update_sigma()
        if verbose:
            print('Iteration: {0}.'.format(i + 1))
            print('Perplexities in [{0:.4f}, {1:.4f}].'.format(np.exp(e.min()),
                                                               np.exp(e.max())))

    if np.any(np.isnan(np.exp(e))):
        raise Exception('Invalid sigmas. The perplexity is probably too low.')
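# Hedged usage sketch (a sketch under the module's own assumptions: floath,
# epsilon, and p_Xp_given_X_var exist as used above; the metric string, e.g.
# 'euclidean', is whatever p_Xp_given_X_var accepts):
def demo_find_sigma():
    X_shared = theano.shared(np.random.randn(500, 20).astype(floath))
    sigma_shared = theano.shared(np.ones(500, dtype=floath))
    find_sigma(X_shared, sigma_shared, N=500, perplexity=30.0,
               sigma_iters=50, metric='euclidean', verbose=1)
    # per-point bandwidths whose conditional entropy matches log(perplexity)
    return sigma_shared.get_value()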
def make_node(self, activations, labels, input_lengths):
    t_activations = T.as_tensor_variable(activations)
    # Ensure activations array is C-contiguous
    t_activations = cpu_contiguous(t_activations)
    t_labels = T.as_tensor_variable(labels)
    t_input_lengths = T.as_tensor_variable(input_lengths)

    if t_activations.type.dtype != 'float32':
        raise TypeError('activations must use the float32 type!')
    if t_activations.ndim != 3:
        raise ValueError('activations must have 3 dimensions.')

    if t_labels.type.dtype != 'int32':
        raise TypeError('labels must use the int32 type!')
    if t_labels.ndim != 2:
        raise ValueError('labels must have 2 dimensions.')

    if t_input_lengths.type.dtype != 'int32':
        raise TypeError('input_lengths must use the int32 type!')
    if t_input_lengths.ndim != 1:
        raise ValueError('input_lengths must have 1 dimension.')

    costs = T.fvector(name="ctc_cost")
    outputs = [costs]
    if self.compute_grad:
        gradients = T.ftensor3(name="ctc_grad")
        outputs += [gradients]

    return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                     outputs=outputs)
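# Shape conventions implied by the checks above (the semantic axis order is an
# assumption based on the usual warp-ctc layout, not stated in this snippet):
# activations: float32 (n_timesteps, batch, n_labels + blank); labels: int32
# (batch, max_label_length); input_lengths: int32 (batch,). The Op then emits
# one float32 cost per batch element and, if compute_grad is set, a gradient
# tensor with the same shape as activations.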
def __init__(self, nh, init_scale=0.2):
    self.W = theano.shared(name='W',
                           value=init_scale * np.random.uniform(-1.0, 1.0, (nh, 1))
                           .astype(theano.config.floatX))
    self.b = theano.shared(name='b',
                           value=np.array(0, dtype=theano.config.floatX))
    self.params = [self.b, self.W]

    h = T.fmatrix('h')
    y = T.fvector('y')
    lr = T.scalar('lr')

    y_pred = T.dot(h, self.W) + self.b
    loss = T.sum(T.square(y_pred[:, 0] - y))
    gradients = T.grad(loss, self.params)
    updates = OrderedDict((p, p - lr * g)
                          for p, g in zip(self.params, gradients))

    # These all assume a minibatch size > 1; "mb" functions below will
    # massage single examples as required
    self.predict = theano.function(inputs=[h], outputs=y_pred)
    self.calc_loss = theano.function(inputs=[h, y], outputs=loss, updates=None)
    self.train = theano.function(inputs=[h, y, lr], outputs=loss, updates=updates)
    self.calc_gradients = theano.function(inputs=[h, y], outputs=gradients,
                                          updates=None)
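# Minimal training sketch (the class name LinearHead is hypothetical -- the
# snippet above only shows __init__; the learning rate is kept small because
# the loss is a sum over the minibatch rather than a mean):
def demo_linear_head():
    model = LinearHead(nh=8)
    H = np.random.randn(64, 8).astype('float32')
    y = (H.sum(axis=1) + 0.5).astype('float32')
    for _ in range(200):
        model.train(H, y, 0.001)
    return model.predict(H)[:, 0]  # should approach y as the loss shrinks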
def test_allow_downcast_floatX(self):
    a = tensor.fscalar('a')
    b = tensor.fvector('b')

    f = pfunc([a, b], (a + b), allow_input_downcast=True)
    g = pfunc([a, b], (a + b), allow_input_downcast=False)
    h = pfunc([a, b], (a + b), allow_input_downcast=None)

    # If the values can be accurately represented, OK
    assert numpy.all(f(0, [0]) == 0)
    assert numpy.all(g(0, [0]) == 0)
    assert numpy.all(h(0, [0]) == 0)

    # For the vector: OK iff allow_input_downcast is True
    assert numpy.allclose(f(0, [0.1]), 0.1)
    self.assertRaises(TypeError, g, 0, [0.1])
    self.assertRaises(TypeError, h, 0, [0.1])

    # For the scalar: OK if allow_input_downcast is True,
    # or None and floatX == 'float32'
    assert numpy.allclose(f(0.1, [0]), 0.1)
    self.assertRaises(TypeError, g, 0.1, [0])
    if config.floatX == 'float32':
        assert numpy.allclose(h(0.1, [0]), 0.1)
    else:
        self.assertRaises(TypeError, h, 0.1, [0])
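# The policy exercised above, in brief: allow_input_downcast=True silently
# casts lossy Python floats down to float32; False always raises TypeError on
# precision loss; None uses the default rule, which permits the downcast for
# scalar inputs only when config.floatX == 'float32', and never for vectors.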
def compile(self):
    # 1D: n_words, 2D: batch * n_cands
    self.x = T.imatrix()
    self.y = T.fvector()

    self.train_inputs = [self.x, self.y]
    self.pred_inputs = [self.x]

    self.activation = self.args.activation
    self.n_d = self.args.hidden_dim
    self.n_e = self.emb_layers[0].n_d
    self.pad_id = self.emb_layers[0].vocab_map[PAD]
    self.dropout = theano.shared(np.float32(self.args.dropout).astype(theano.config.floatX))

    self._set_layers(args=self.args, n_d=self.n_d, n_e=self.n_e)

    ###########
    # Network #
    ###########
    h_in = self._input_layer(x=self.x)
    h = self._mid_layer(h_prev=h_in, x=self.x, pad_id=self.pad_id)
    y_scores = self._output_layer(h=h)
    self.y_pred = T.le(0.5, y_scores)

    #########################
    # Set an objective func #
    #########################
    self.set_params(layers=self.layers)
    self.loss = self.set_loss(self.y, y_scores)
    self.cost = self.set_cost(args=self.args, params=self.params,
                              loss=self.loss)
def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1):
    self.num_hidden = num_hidden
    self.learning_rate = learning_rate
    self.word_vec_width = word_vec_width
    self.batch_size = batch_size

    self.vocab_mat = T.fmatrix('vocab')
    self.word_onehot = T.fmatrix('word_onehot')
    b = T.fvector('b')
    W = T.fmatrix('W')

    # score of the true window
    f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat) + b))))
    s = T.sum(f)

    self.exec_fn = theano.function(
        [self.word_onehot, b, W, self.vocab_mat],
        f, allow_input_downcast=True)

    # score of the corrupted window
    self.word_onehot_c = T.fmatrix('word_onehot_c')
    f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b)))
    s_c = T.sum(f_c)

    # margin ranking objective
    J = T.largest(0, 1 - s + s_c)

    self.grad = theano.grad(J, [b, W, self.vocab_mat])
    self.grad_fn = theano.function(
        [self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
        self.grad, allow_input_downcast=True)
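# J above is the classic margin ranking loss max(0, 1 - s + s_c), as used in
# Collobert & Weston style window training: its gradient is zero once the true
# window's score s beats the corrupted window's score s_c by a margin of 1,
# so only violating pairs contribute parameter updates.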
def test_select_distinct(self):
    # Tests that ChoiceFromUniform always selects distinct elements
    p = tensor.fmatrix()
    u = tensor.fvector()
    n = tensor.iscalar()
    m = multinomial.ChoiceFromUniform(odtype='auto')(p, u, n)

    f = function([p, u, n], m, allow_input_downcast=True)

    n_elements = 1000
    all_indices = range(n_elements)
    np.random.seed(12345)
    expected = [
        np.asarray([[931, 318, 185, 209, 559]]),
        np.asarray([[477, 887, 2, 717, 333, 665, 159, 559, 348, 136]]),
        np.asarray([[546, 28, 79, 665, 295, 779, 433, 531, 411, 716, 244,
                     234, 70, 88, 612, 639, 383, 335, 451, 100, 175, 492,
                     848, 771, 559, 214, 568, 596, 370, 486, 855, 925, 138,
                     300, 528, 507, 730, 199, 882, 357, 58, 195, 705, 900,
                     66, 468, 513, 410, 816, 672]])]
    for i in [5, 10, 50, 100, 500, n_elements]:
        uni = np.random.rand(i).astype(config.floatX)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        res = f(pvals, uni, i)
        for ii in range(len(expected)):
            if expected[ii].shape == res.shape:
                assert (expected[ii] == res).all()
        res = np.squeeze(res)
        assert len(res) == i
        assert np.all(np.in1d(np.unique(res), all_indices)), res
def test_select_proportional_to_weight(self):
    """
    Tests that MultinomialWOReplacementFromUniform selects elements,
    on average, proportionally to their probabilities.
    """
    p = tensor.fmatrix()
    u = tensor.fvector()
    n = tensor.iscalar()
    m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)

    f = function([p, u, n], m, allow_input_downcast=True)

    n_elements = 100
    n_selected = 10
    mean_rtol = 0.0005
    numpy.random.seed(12345)
    pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
    pvals /= pvals.sum(1)
    avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)

    for rep in range(10000):
        uni = numpy.random.rand(n_selected).astype(config.floatX)
        res = f(pvals, uni, n_selected)
        res = numpy.squeeze(res)
        avg_pvals[res] += 1
    avg_pvals /= avg_pvals.sum()
    avg_diff = numpy.mean(abs(avg_pvals - pvals))
    assert avg_diff < mean_rtol, avg_diff
def __init__(self, config=None, defaults=defaults,
             inputs_hook=None, hiddens_hook=None, params_hook=None,
             use_data_layer=None, rand_crop=None, batch_size=None):
    # combine everything by passing to Model's init
    super(AlexNet, self).__init__(**{arg: val for (arg, val) in locals().iteritems()
                                     if arg != 'self'})
    # configs can now be accessed through self dictionary

    if self.inputs_hook or self.hiddens_hook or self.params_hook:
        log.error("Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!")

    self.flag_datalayer = self.use_data_layer

    ####################
    # Theano variables #
    ####################
    # allocate symbolic variables for the data
    # 'rand' is a random array used for random cropping/mirroring of data
    self.x = T.ftensor4('x')
    self.y = T.lvector('y')
    self.rand = T.fvector('rand')

    ##########
    # params #
    ##########
    self.params = []

    # make the network!
    self.build_computation_graph()
def main(args):
    theano.config.optimizer = 'fast_compile'
    theano.config.exception_verbosity = 'high'

    trial = int(args['trial'])
    pkl_name = 'dp_disall-sch_%d' % trial
    channel_name = 'mae'

    data_path = args['data_path']
    save_path = args['save_path']  # + '/aggVSdisag_distrib/' + datetime.datetime.now().strftime("%y-%m-%d_%H-%M")
    pickleModel = args['pickleModel']
    period = int(args['period'])
    n_steps = int(args['n_steps'])
    stride_train = int(args['stride_train'])
    stride_test = int(args['stride_test'])
    loadType = int(args['loadType'])
    flgMSE = int(args['flgMSE'])

    monitoring_freq = int(args['monitoring_freq'])
    epoch = int(args['epoch'])
    batch_size = int(args['batch_size'])
    x_dim = int(args['x_dim'])
    y_dim = int(args['y_dim'])
    z_dim = int(args['z_dim'])
    rnn_dim = int(args['rnn_dim'])
    k = int(args['num_k'])  # a mixture of K Gaussian functions
    lr = float(args['lr'])
    origLR = lr
    debug = int(args['debug'])
    kSchedSamp = int(args['kSchedSamp'])

    print "trial no. %d" % trial
    print "batch size %d" % batch_size
    print "learning rate %f" % lr
    print "saving pkl file '%s'" % pkl_name
    print "to the save path '%s'" % save_path
    print(str(windows))

    q_z_dim = 500
    p_z_dim = 500
    p_x_dim = 500
    x2s_dim = 200
    y2s_dim = 200
    z2s_dim = 200
    # as the appliances are separated into theta_mu1, theta_mu2, etc.,
    # each output is built from its own k Gaussians
    target_dim = k

    Xtrain, ytrain, Xval, yval, Xtest, ytest, reader = fetch_dataport(
        data_path, windows, appliances, numApps=-1, period=period,
        n_steps=n_steps, stride_train=stride_train, stride_test=stride_test,
        trainPer=0.5, valPer=0.25, testPer=0.25, typeLoad=loadType,
        flgAggSumScaled=1, flgFilterZeros=1)
    print("Mean ", reader.meanTrain)
    print("Std", reader.stdTrain)

    instancesPlot = {0: [4]}

    train_data = Dataport(name='train',
                          prep='normalize',
                          cond=True,  # False
                          # path=data_path,
                          inputX=Xtrain,
                          labels=ytrain)

    X_mean = train_data.X_mean
    X_std = train_data.X_std

    valid_data = Dataport(name='valid',
                          prep='normalize',
                          cond=True,  # False
                          # path=data_path,
                          X_mean=X_mean,
                          X_std=X_std,
                          inputX=Xval,
                          labels=yval)

    test_data = Dataport(name='valid',
                         prep='normalize',
                         cond=True,  # False
                         # path=data_path,
                         X_mean=X_mean,
                         X_std=X_std,
                         inputX=Xtest,
                         labels=ytest)

    init_W = InitCell('rand')
    init_U = InitCell('ortho')
    init_b = InitCell('zeros')
    init_b_sig = InitCell('const', mean=0.6)

    x, mask, y, y_mask = train_data.theano_vars()
    scheduleSamplingMask = T.fvector('schedMask')

    x.name = 'x_original'
    if debug:
        x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=np.float32)
        temp = np.ones((15, batch_size), dtype=np.float32)
        temp[:, -2:] = 0.
        mask.tag.test_value = temp

    # from experiment 18-05-31_18-48
    fmodel = open(pickleModel, 'rb')
    mainloop = cPickle.load(fmodel)
    fmodel.close()

    # define layers
    rnn = mainloop.model.nodes[0]
    x_1 = mainloop.model.nodes[1]
    y_1 = mainloop.model.nodes[2]
    z_1 = mainloop.model.nodes[3]
    phi_1 = mainloop.model.nodes[4]
    phi_mu = mainloop.model.nodes[5]
    phi_sig = mainloop.model.nodes[6]
    prior_1 = mainloop.model.nodes[7]
    prior_mu = mainloop.model.nodes[8]
    prior_sig = mainloop.model.nodes[9]
    theta_1 = mainloop.model.nodes[10]
    theta_mu1 = mainloop.model.nodes[11]
    theta_sig1 = mainloop.model.nodes[12]
    coeff1 = mainloop.model.nodes[13]

    nodes = [rnn,
             x_1, y_1, z_1,  # dissag_pred,
             phi_1, phi_mu, phi_sig,
             prior_1, prior_mu, prior_sig,
             theta_1, theta_mu1, theta_sig1, coeff1]
    params = mainloop.model.params

    dynamicOutput = [None, None, None, None, None, None, None, None]
    # dynamicOutput_val = [None, None, None, None, None, None, None, None, None]
    if (y_dim > 1):
        theta_mu2 = mainloop.model.nodes[14]
        theta_sig2 = mainloop.model.nodes[15]
        coeff2 = mainloop.model.nodes[16]
        nodes = nodes + [theta_mu2, theta_sig2, coeff2]
        dynamicOutput = dynamicOutput + [None, None, None, None]  # mu, sig, coeff and pred
    if (y_dim > 2):
        theta_mu3 = mainloop.model.nodes[17]
        theta_sig3 = mainloop.model.nodes[18]
        coeff3 = mainloop.model.nodes[19]
        nodes = nodes + [theta_mu3, theta_sig3, coeff3]
        dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim > 3):
        theta_mu4 = mainloop.model.nodes[20]
        theta_sig4 = mainloop.model.nodes[21]
        coeff4 = mainloop.model.nodes[22]
        nodes = nodes + [theta_mu4, theta_sig4, coeff4]
        dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim > 4):
        theta_mu5 = mainloop.model.nodes[23]
        theta_sig5 = mainloop.model.nodes[24]
        coeff5 = mainloop.model.nodes[25]
        nodes = nodes + [theta_mu5, theta_sig5, coeff5]
        dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim > 5):
        theta_mu6 = mainloop.model.nodes[26]
        theta_sig6 = mainloop.model.nodes[27]
        coeff6 = mainloop.model.nodes[28]
        nodes = nodes + [theta_mu6, theta_sig6, coeff6]
        dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim > 6):
        theta_mu7 = mainloop.model.nodes[29]
        theta_sig7 = mainloop.model.nodes[30]
        coeff7 = mainloop.model.nodes[31]
        nodes = nodes + [theta_mu7, theta_sig7, coeff7]
        dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim > 7):
        theta_mu8 = mainloop.model.nodes[32]
        theta_sig8 = mainloop.model.nodes[33]
        coeff8 = mainloop.model.nodes[34]
        nodes = nodes + [theta_mu8, theta_sig8, coeff8]
        dynamicOutput = dynamicOutput + [None, None, None, None]

    s_0 = rnn.get_init_state(batch_size)

    x_1_temp = x_1.fprop([x], params)
    y_1_temp = y_1.fprop([y], params)

    output_fn = [s_0] + dynamicOutput
    output_fn_val = [s_0] + dynamicOutput[2:]
    print(len(output_fn), len(output_fn_val))

    def inner_fn_test(x_t, s_tm1):
        prior_1_t = prior_1.fprop([x_t, s_tm1], params)
        prior_mu_t = prior_mu.fprop([prior_1_t], params)
        prior_sig_t = prior_sig.fprop([prior_1_t], params)

        # in the original code it is gaussian; GMM is for the generation
        z_t = Gaussian_sample(prior_mu_t, prior_sig_t)
        z_1_t = z_1.fprop([z_t], params)

        theta_1_t = theta_1.fprop([z_1_t, s_tm1], params)
        theta_mu1_t = theta_mu1.fprop([theta_1_t], params)
        theta_sig1_t = theta_sig1.fprop([theta_1_t], params)
        coeff1_t = coeff1.fprop([theta_1_t], params)

        y_pred1 = GMM_sampleY(theta_mu1_t, theta_sig1_t, coeff1_t)  # Gaussian_sample(theta_mu_t, theta_sig_t)

        tupleMulti = prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1

        if (y_dim > 1):
            theta_mu2_t = theta_mu2.fprop([theta_1_t], params)
            theta_sig2_t = theta_sig2.fprop([theta_1_t], params)
            coeff2_t = coeff2.fprop([theta_1_t], params)
            y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t)
            y_pred1 = T.concatenate([y_pred1, y_pred2], axis=1)
            tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2)

        if (y_dim > 2):
            theta_mu3_t = theta_mu3.fprop([theta_1_t], params)
            theta_sig3_t = theta_sig3.fprop([theta_1_t], params)
            coeff3_t = coeff3.fprop([theta_1_t], params)
            y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t)
            y_pred1 = T.concatenate([y_pred1, y_pred3], axis=1)
            tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3)

        if (y_dim > 3):
            theta_mu4_t = theta_mu4.fprop([theta_1_t], params)
            theta_sig4_t = theta_sig4.fprop([theta_1_t], params)
            coeff4_t = coeff4.fprop([theta_1_t], params)
            y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t)
            y_pred1 = T.concatenate([y_pred1, y_pred4], axis=1)
            tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4)

        if (y_dim > 4):
            theta_mu5_t = theta_mu5.fprop([theta_1_t], params)
            theta_sig5_t = theta_sig5.fprop([theta_1_t], params)
            coeff5_t = coeff5.fprop([theta_1_t], params)
            y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t)
            y_pred1 = T.concatenate([y_pred1, y_pred5], axis=1)
            tupleMulti = tupleMulti + (theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5)

        if (y_dim > 5):
            theta_mu6_t = theta_mu6.fprop([theta_1_t], params)
            theta_sig6_t = theta_sig6.fprop([theta_1_t], params)
            coeff6_t = coeff6.fprop([theta_1_t], params)
            y_pred6 = GMM_sampleY(theta_mu6_t, theta_sig6_t, coeff6_t)
            y_pred1 = T.concatenate([y_pred1, y_pred6], axis=1)
            tupleMulti = tupleMulti + (theta_mu6_t, theta_sig6_t, coeff6_t, y_pred6)

        if (y_dim > 6):
            theta_mu7_t = theta_mu7.fprop([theta_1_t], params)
            theta_sig7_t = theta_sig7.fprop([theta_1_t], params)
            coeff7_t = coeff7.fprop([theta_1_t], params)
            y_pred7 = GMM_sampleY(theta_mu7_t, theta_sig7_t, coeff7_t)
            y_pred1 = T.concatenate([y_pred1, y_pred7], axis=1)
            tupleMulti = tupleMulti + (theta_mu7_t, theta_sig7_t, coeff7_t, y_pred7)

        if (y_dim > 7):
            theta_mu8_t = theta_mu8.fprop([theta_1_t], params)
            theta_sig8_t = theta_sig8.fprop([theta_1_t], params)
            coeff8_t = coeff8.fprop([theta_1_t], params)
            y_pred8 = GMM_sampleY(theta_mu8_t, theta_sig8_t, coeff8_t)
            y_pred1 = T.concatenate([y_pred1, y_pred8], axis=1)
            tupleMulti = tupleMulti + (theta_mu8_t, theta_sig8_t, coeff8_t, y_pred8)

        pred_1_t = y_1.fprop([y_pred1], params)
        # y_pred = [GMM_sampleY(theta_mu_t[i], theta_sig_t[i], coeff_t[i]) for i in range(y_dim)]
        # T.stack([y_pred1, y_pred2], axis=0)
        s_t = rnn.fprop([[x_t, z_1_t, pred_1_t], [s_tm1]], params)
        # y_pred = dissag_pred.fprop([s_t], params)
        return (s_t,) + tupleMulti  # corr_temp, binary_temp

    (otherResults_val, updates_val) = theano.scan(fn=inner_fn_test,
                                                  sequences=[x_1_temp],
                                                  outputs_info=output_fn_val)

    for k, v in updates_val.iteritems():
        k.default_update = v

    x_shape = x.shape
    y_shape = y.shape
    x_in = x.reshape((x_shape[0] * x_shape[1], -1))
    y_in = y.reshape((y_shape[0] * y_shape[1], -1))

    ######################## TEST (GENERATION) TIME ########################
    s_temp_val, prior_mu_temp_val, prior_sig_temp_val, \
        theta_mu1_temp_val, theta_sig1_temp_val, coeff1_temp_val, \
        y_pred1_temp_val = otherResults_val[:7]
    restResults_val = otherResults_val[7:]

    # s_temp_val = concatenate([s_0[None, :, :], s_temp_val[:-1]], axis=0)
    # (this would prepend an extra time step holding s_0)
    theta_mu1_temp_val.name = 'theta_mu1_val'
    theta_sig1_temp_val.name = 'theta_sig1_val'
    coeff1_temp_val.name = 'coeff1_val'
    y_pred1_temp_val.name = 'disaggregation1_val'

    y_pred1_temp_val = T.clip(y_pred1_temp_val, 0.0, np.inf)
    prediction_val = y_pred1_temp_val  # [:, :, flgAgg].reshape((y.shape[0], y.shape[1], 1))

    mse1_val = T.mean((y_pred1_temp_val - y[:, :, 0].reshape((y.shape[0], y.shape[1], 1))) ** 2)
    mae1_val = T.mean(T.abs_(y_pred1_temp_val - y[:, :, 0].reshape((y.shape[0], y.shape[1], 1))))

    totPred = T.sum(y_pred1_temp_val)
    totReal = T.sum(y[:, :, 0])
    relErr1_val = (totPred - totReal) / T.maximum(totPred, totReal)
    propAssigned1_val = 1 - T.sum(T.abs_(y_pred1_temp_val - y[:, :, 0].reshape(
        (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

    # y_unNormalize = (y[:, :, 0] * reader.stdTrain[0]) + reader.meanTrain[0]
    # y_pred1_temp_val = (y_pred1_temp_val * reader.stdTrain[0]) + reader.meanTrain[0]
    # mse1_valUnNorm = T.mean((y_pred1_temp_val - y_unNormalize.reshape((y.shape[0], y.shape[1], 1))) ** 2)  # axis=None averages over everything
    # mae1_valUnNorm = T.mean(T.abs_(y_pred1_temp_val - y_unNormalize.reshape((y.shape[0], y.shape[1], 1))))
    mse1_val.name = 'mse1_val'
    mae1_val.name = 'mae1_val'

    theta_mu1_in_val = theta_mu1_temp_val.reshape((x_shape[0] * x_shape[1], -1))
    theta_sig1_in_val = theta_sig1_temp_val.reshape((x_shape[0] * x_shape[1], -1))
    coeff1_in_val = coeff1_temp_val.reshape((x_shape[0] * x_shape[1], -1))

    totaMSE_val = mse1_val
    totaMAE_val = mae1_val
    indexSepDynamic_val = 5

    # initializing values of mse and mae for the optional appliances
    mse2_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae2_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse3_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae3_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse4_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae4_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse5_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae5_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse6_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae6_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse7_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae7_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mse8_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))
    mae8_val = T.mean(T.zeros((y.shape[0], y.shape[1], 1)))

    relErr2_val = T.zeros((1,))
    relErr3_val = T.zeros((1,))
    relErr4_val = T.zeros((1,))
    relErr5_val = T.zeros((1,))
    relErr6_val = T.zeros((1,))
    relErr7_val = T.zeros((1,))
    relErr8_val = T.zeros((1,))
    propAssigned2_val = T.zeros((1,))
    propAssigned3_val = T.zeros((1,))
    propAssigned4_val = T.zeros((1,))
    propAssigned5_val = T.zeros((1,))
    propAssigned6_val = T.zeros((1,))
    propAssigned7_val = T.zeros((1,))
    propAssigned8_val = T.zeros((1,))

    if (y_dim > 1):
        theta_mu2_temp_val, theta_sig2_temp_val, coeff2_temp_val, y_pred2_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu2_temp_val.name = 'theta_mu2_val'
        theta_sig2_temp_val.name = 'theta_sig2_val'
        coeff2_temp_val.name = 'coeff2_val'
        y_pred2_temp_val.name = 'disaggregation2_val'
        y_pred2_temp_val = T.clip(y_pred2_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred2_temp_val],
                                       axis=2)  # before it gets unnormalized
        mse2_val = T.mean((y_pred2_temp_val - y[:, :, 1].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae2_val = T.mean(T.abs_(y_pred2_temp_val - y[:, :, 1].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred2_temp_val)
        totReal = T.sum(y[:, :, 1])
        relErr2_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned2_val = 1 - T.sum(T.abs_(y_pred2_temp_val - y[:, :, 1].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[1]/meanTrain[1], as for appliance 1
        mse2_val.name = 'mse2_val'
        mae2_val.name = 'mae2_val'

        theta_mu2_in_val = theta_mu2_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig2_in_val = theta_sig2_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff2_in_val = coeff2_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = theta_mu2_in_val, theta_sig2_in_val, coeff2_in_val

        totaMSE_val += mse2_val
        totaMAE_val += mae2_val
        indexSepDynamic_val += 2

    if (y_dim > 2):
        theta_mu3_temp_val, theta_sig3_temp_val, coeff3_temp_val, y_pred3_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu3_temp_val.name = 'theta_mu3_val'
        theta_sig3_temp_val.name = 'theta_sig3_val'
        coeff3_temp_val.name = 'coeff3_val'
        y_pred3_temp_val.name = 'disaggregation3_val'
        y_pred3_temp_val = T.clip(y_pred3_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred3_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse3_val = T.mean((y_pred3_temp_val - y[:, :, 2].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae3_val = T.mean(T.abs_(y_pred3_temp_val - y[:, :, 2].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred3_temp_val)
        totReal = T.sum(y[:, :, 2])
        relErr3_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned3_val = 1 - T.sum(T.abs_(y_pred3_temp_val - y[:, :, 2].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[2]/meanTrain[2], as for appliance 1
        mse3_val.name = 'mse3_val'
        mae3_val.name = 'mae3_val'

        theta_mu3_in_val = theta_mu3_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig3_in_val = theta_sig3_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff3_in_val = coeff3_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu3_in_val, theta_sig3_in_val, coeff3_in_val)

        totaMSE_val += mse3_val
        totaMAE_val += mae3_val
        indexSepDynamic_val += 2

    if (y_dim > 3):
        theta_mu4_temp_val, theta_sig4_temp_val, coeff4_temp_val, y_pred4_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu4_temp_val.name = 'theta_mu4_val'
        theta_sig4_temp_val.name = 'theta_sig4_val'
        coeff4_temp_val.name = 'coeff4_val'
        y_pred4_temp_val.name = 'disaggregation4_val'
        y_pred4_temp_val = T.clip(y_pred4_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred4_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse4_val = T.mean((y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae4_val = T.mean(T.abs_(y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred4_temp_val)
        totReal = T.sum(y[:, :, 3])
        relErr4_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned4_val = 1 - T.sum(T.abs_(y_pred4_temp_val - y[:, :, 3].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[3]/meanTrain[3], as for appliance 1
        mse4_val.name = 'mse4_val'
        mae4_val.name = 'mae4_val'

        theta_mu4_in_val = theta_mu4_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig4_in_val = theta_sig4_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff4_in_val = coeff4_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu4_in_val, theta_sig4_in_val, coeff4_in_val)

        totaMSE_val += mse4_val
        totaMAE_val += mae4_val
        indexSepDynamic_val += 2

    if (y_dim > 4):
        theta_mu5_temp_val, theta_sig5_temp_val, coeff5_temp_val, y_pred5_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu5_temp_val.name = 'theta_mu5_val'
        theta_sig5_temp_val.name = 'theta_sig5_val'
        coeff5_temp_val.name = 'coeff5_val'
        y_pred5_temp_val.name = 'disaggregation5_val'
        y_pred5_temp_val = T.clip(y_pred5_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred5_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse5_val = T.mean((y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae5_val = T.mean(T.abs_(y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred5_temp_val)
        totReal = T.sum(y[:, :, 4])
        relErr5_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned5_val = 1 - T.sum(T.abs_(y_pred5_temp_val - y[:, :, 4].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[4]/meanTrain[4], as for appliance 1
        mse5_val.name = 'mse5_val'
        mae5_val.name = 'mae5_val'

        theta_mu5_in_val = theta_mu5_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig5_in_val = theta_sig5_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff5_in_val = coeff5_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu5_in_val, theta_sig5_in_val, coeff5_in_val)

        totaMSE_val += mse5_val
        totaMAE_val += mae5_val
        indexSepDynamic_val += 2

    if (y_dim > 5):
        theta_mu6_temp_val, theta_sig6_temp_val, coeff6_temp_val, y_pred6_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu6_temp_val.name = 'theta_mu6_val'
        theta_sig6_temp_val.name = 'theta_sig6_val'
        coeff6_temp_val.name = 'coeff6_val'
        y_pred6_temp_val.name = 'disaggregation6_val'
        y_pred6_temp_val = T.clip(y_pred6_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred6_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse6_val = T.mean((y_pred6_temp_val - y[:, :, 5].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae6_val = T.mean(T.abs_(y_pred6_temp_val - y[:, :, 5].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred6_temp_val)
        totReal = T.sum(y[:, :, 5])
        relErr6_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned6_val = 1 - T.sum(T.abs_(y_pred6_temp_val - y[:, :, 5].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[5]/meanTrain[5], as for appliance 1
        mse6_val.name = 'mse6_val'
        mae6_val.name = 'mae6_val'

        theta_mu6_in_val = theta_mu6_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig6_in_val = theta_sig6_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff6_in_val = coeff6_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu6_in_val, theta_sig6_in_val, coeff6_in_val)

        totaMSE_val += mse6_val
        totaMAE_val += mae6_val
        indexSepDynamic_val += 2

    if (y_dim > 6):
        theta_mu7_temp_val, theta_sig7_temp_val, coeff7_temp_val, y_pred7_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu7_temp_val.name = 'theta_mu7_val'
        theta_sig7_temp_val.name = 'theta_sig7_val'
        coeff7_temp_val.name = 'coeff7_val'
        y_pred7_temp_val.name = 'disaggregation7_val'
        y_pred7_temp_val = T.clip(y_pred7_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred7_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse7_val = T.mean((y_pred7_temp_val - y[:, :, 6].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae7_val = T.mean(T.abs_(y_pred7_temp_val - y[:, :, 6].reshape((y.shape[0], y.shape[1], 1))))

        totPred = T.sum(y_pred7_temp_val)
        totReal = T.sum(y[:, :, 6])
        relErr7_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned7_val = 1 - T.sum(T.abs_(y_pred7_temp_val - y[:, :, 6].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[6]/meanTrain[6], as for appliance 1
        mse7_val.name = 'mse7_val'
        mae7_val.name = 'mae7_val'

        theta_mu7_in_val = theta_mu7_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig7_in_val = theta_sig7_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff7_in_val = coeff7_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu7_in_val, theta_sig7_in_val, coeff7_in_val)

        totaMSE_val += mse7_val
        totaMAE_val += mae7_val
        indexSepDynamic_val += 2

    if (y_dim > 7):
        theta_mu8_temp_val, theta_sig8_temp_val, coeff8_temp_val, y_pred8_temp_val = restResults_val[:4]
        restResults_val = restResults_val[4:]
        theta_mu8_temp_val.name = 'theta_mu8_val'
        theta_sig8_temp_val.name = 'theta_sig8_val'
        coeff8_temp_val.name = 'coeff8_val'
        y_pred8_temp_val.name = 'disaggregation8_val'
        y_pred8_temp_val = T.clip(y_pred8_temp_val, 0.0, np.inf)
        prediction_val = T.concatenate([prediction_val, y_pred8_temp_val],
                                       axis=2)  # before it gets unnormalized

        mse8_val = T.mean((y_pred8_temp_val - y[:, :, 7].reshape((y.shape[0], y.shape[1], 1))) ** 2)
        mae8_val = T.mean(T.abs_(y_pred8_temp_val - y[:, :, 7].reshape((y.shape[0], y.shape[1], 1))))
        totPred = T.sum(y_pred8_temp_val)
        totReal = T.sum(y[:, :, 7])
        relErr8_val = (totPred - totReal) / T.maximum(totPred, totReal)
        propAssigned8_val = 1 - T.sum(T.abs_(y_pred8_temp_val - y[:, :, 7].reshape(
            (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x))

        # (disabled) unnormalized mse/mae via reader.stdTrain[7]/meanTrain[7], as for appliance 1
        mse8_val.name = 'mse8_val'
        mae8_val.name = 'mae8_val'

        theta_mu8_in_val = theta_mu8_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        theta_sig8_in_val = theta_sig8_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        coeff8_in_val = coeff8_temp_val.reshape((x_shape[0] * x_shape[1], -1))
        argsGMM_val = argsGMM_val + (theta_mu8_in_val, theta_sig8_in_val, coeff8_in_val)

        totaMSE_val += mse8_val
        totaMAE_val += mae8_val
        indexSepDynamic_val += 2

    recon_val = GMMdisagMulti(y_dim, y_in, theta_mu1_in_val, theta_sig1_in_val,
                              coeff1_in_val, *argsGMM_val)
    # BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in)
    recon_val = recon_val.reshape((x_shape[0], x_shape[1]))
    recon_val.name = 'gmm_out'

    totaMSE_val = totaMSE_val / y_dim
    totaMAE_val = totaMAE_val / y_dim

    recon_term_val = recon_val.sum(axis=0).mean()
    recon_term_val.name = 'recon_term'

    ######################

    optimizer = Adam(lr=lr)
    header = "epoch,log,kl,nll_upper_bound,mse,mae\n"
    lr_iterations = {0: lr}

    data = Iterator(test_data, batch_size)

    test_fn = theano.function(
        inputs=[x, y],
        # givens={x: Xtest},
        # on_unused_input='ignore',
        # z = ( , 200, 1)
        allow_input_downcast=True,
        outputs=[prediction_val, recon_term_val, totaMSE_val, totaMAE_val,
                 mse1_val, mse2_val, mse3_val, mse4_val,
                 mse5_val, mse6_val, mse7_val, mse8_val,
                 mae1_val, mae2_val, mae3_val, mae4_val,
                 mae5_val, mae6_val, mae7_val, mae8_val,
                 # the 16 unnormalized mae/mse outputs used to follow here
                 relErr1_val, relErr2_val, relErr3_val, relErr4_val,
                 relErr5_val, relErr6_val, relErr7_val, relErr8_val,
                 propAssigned1_val, propAssigned2_val, propAssigned3_val,
                 propAssigned4_val, propAssigned5_val, propAssigned6_val,
                 propAssigned7_val, propAssigned8_val],
        updates=updates_val)

    testOutput = []
    testMetrics2 = []
    perEnergyAssig = []
    bestInstsancesPred = []
    bestInstsancesDisa = []
    bestInstsancesAggr = []

    numBatchTest = 0
    for batch in data:
        outputGeneration = test_fn(batch[0], batch[2])
        testOutput.append(outputGeneration[1:20])  # previously 36 items, including unnormalized metrics
        testMetrics2.append(outputGeneration[20:])

        ########## best mae instance of the batch
        predTest = np.transpose(outputGeneration[0], [1, 0, 2]).clip(min=0)
        realTest = np.transpose(batch[2], [1, 0, 2])
        batchMSE = np.mean(np.absolute(predTest - realTest), axis=(1, 2))
        idxMin = np.argmin(batchMSE)
        # print(np.asarray(idxMin).reshape(1, -1)[0, :])
        # print(batchMSE[idxMin])

        for idx in np.asarray(idxMin).reshape(1, -1)[0, :]:
            plt.figure(1)
            plt.plot(predTest[idx])
            plt.legend(appliances)
            plt.savefig(save_path + "/vrnn_disall_test-b{}_Pred_0-{}".format(numBatchTest, idx),
                        format='eps')
            plt.clf()

            plt.figure(2)
            plt.plot(realTest[idx])
            plt.legend(appliances)
            plt.savefig(save_path + "/vrnn_disall_test-b{}_RealDisag_0-{}".format(numBatchTest, idx),
                        format='eps')
            plt.clf()

            plt.figure(3)
            plt.plot(np.transpose(batch[0], [1, 0, 2])[idx])
            plt.savefig(save_path + "/vrnn_disall_test-b{}_Realagg_0-{}".format(numBatchTest, idx),
                        format='eps')
            plt.clf()
"/vrnn_disall_test-b{}_Realagg_0-{}".format(numBatchTest, idx), format='eps') plt.clf() bestInstsancesPred.append(predTest[idx]) bestInstsancesDisa.append(realTest[idx]) bestInstsancesAggr.append(np.transpose(batch[0], [1, 0, 2])[idx]) numBatchTest += 1 sumNumPred = np.sum(predTest, axis=(0, 1)) sumNumReal = np.sum(batch[2], axis=(0, 1)) perEnergy = np.sum(batch[0], axis=(0, 1)) perEnergyAssig.append((sumNumReal / perEnergy, sumNumPred / perEnergy)) scipy.io.savemat(save_path + '/testInstances.mat', mdict={ 'pred': bestInstsancesPred, 'disag': bestInstsancesDisa, 'agg': bestInstsancesAggr }) testOutput = np.asarray(testOutput) testMetrics2 = np.asarray(testMetrics2) print(testOutput.shape) print(testMetrics2.shape) testOutput[:, 19:] = 1000 * testOutput[:, 19:] # kwtts a watts recon_test = testOutput[:, 0].mean() mse_test = testOutput[:, 1].mean() mae_test = testOutput[:, 2].mean() mse1_test = testOutput[:, 3].mean() mae1_test = testOutput[:, 11].mean() mse2_test = testOutput[:, 4].mean() mae2_test = testOutput[:, 12].mean() mse3_test = testOutput[:, 5].mean() mae3_test = testOutput[:, 13].mean() mse4_test = testOutput[:, 6].mean() mae4_test = testOutput[:, 14].mean() mse5_test = testOutput[:, 7].mean() mae5_test = testOutput[:, 15].mean() mse6_test = testOutput[:, 8].mean() mae6_test = testOutput[:, 16].mean() mse7_test = testOutput[:, 9].mean() mae7_test = testOutput[:, 17].mean() mse8_test = testOutput[:, 10].mean() mae8_test = testOutput[:, 18].mean() print(testOutput[:, 3:11].mean(), testOutput[:, 11:19].mean()) relErr1_test = testMetrics2[:, 0].mean() relErr2_test = testMetrics2[:, 1].mean() relErr3_test = testMetrics2[:, 2].mean() relErr4_test = testMetrics2[:, 3].mean() relErr5_test = testMetrics2[:, 4].mean() relErr6_test = testMetrics2[:, 5].mean() relErr7_test = testMetrics2[:, 6].mean() relErr8_test = testMetrics2[:, 7].mean() propAssigned1_test = testMetrics2[:, 8].mean() propAssigned2_test = testMetrics2[:, 9].mean() propAssigned3_test = testMetrics2[:, 10].mean() propAssigned4_test = testMetrics2[:, 11].mean() propAssigned5_test = testMetrics2[:, 12].mean() propAssigned6_test = testMetrics2[:, 13].mean() propAssigned7_test = testMetrics2[:, 14].mean() propAssigned8_test = testMetrics2[:, 15].mean() fLog = open(save_path + '/output.csv', 'w') fLog.write(str(lr_iterations) + "\n") fLog.write(str(appliances) + "\n") fLog.write(str(windows) + "\n\n") fLog.write( "logTest,mse1_test,mse2_test,mse3_test,mse4_test,mse5_test, mse6_test,mse7_test,mse8_test,mae1_test,mae2_test,mae3_test,mae4_test,mae5_test, mae6_test,mae7_test,mae8_test,mseTest,maeTest\n" ) #fLog.write("Unnorm,{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},0.0,0.0\n\n".format(mse1_valUnNorm,mse2_valUnNorm,mse3_valUnNorm,mse4_valUnNorm,mse5_valUnNorm, mse6_valUnNorm,mse7_valUnNorm,mse8_valUnNorm,mae1_valUnNorm,mae2_valUnNorm,mae3_valUnNorm,mae4_valUnNorm,mae5_valUnNorm, mae6_valUnNorm,mae7_valUnNorm,mae8_valUnNorm)) fLog.write( "{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n\n" .format(recon_test, mse1_test, mse2_test, mse3_test, mse4_test, mse5_test, mse6_test, mse7_test, mse8_test, mae1_test, mae2_test, mae3_test, mae4_test, mae5_test, mae6_test, mae7_test, mae8_test, mse_test, mae_test)) fLog.write( "relErr1,relErr2,relErr3,relErr4,relErr5,relErr6,relErr7,relErr8,propAssigned1,propAssigned2,propAssigned3,propAssigned4,propAssigned5\n" ) 
fLog.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( relErr1_test, relErr2_test, relErr3_test, relErr4_test, relErr5_test, relErr6_test, relErr7_test, relErr8_test, propAssigned1_test, propAssigned2_test, propAssigned3_test, propAssigned4_test, propAssigned5_test, propAssigned6_test, propAssigned7_test, propAssigned8_test)) fLog.write( "batch,perReal1,perReal2,perReal3,perReal4,perReal5,perReal6,perReal7,perReal8,perPredict1,perPredict2,perPredict3,perPredict4,perPredict5,perPredict6,perPredict7,perPredict8\n" ) for batch, item in enumerate(perEnergyAssig): fLog.write( "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( batch, item[0][0], item[0][1], item[0][2], item[0][3], item[0][4], item[0][5], item[0][6], item[0][7], item[1][0], item[1][1], item[1][2], item[1][3], item[1][4], item[1][5], item[1][6], item[1][7])) fLog.write(pickleModel) f = open(save_path + '/outputRealGeneration.pkl', 'wb') pickle.dump(outputGeneration, f, -1) f.close()
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'): model_options = locals().copy() print "model options", model_options seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl') test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len) test_sents_l=np.asarray(test_sents_l, dtype='int32') test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX) test_sents_r=np.asarray(test_sents_r, dtype='int32') test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX) test_word1=np.asarray(test_word1, dtype='int32') test_word2=np.asarray(test_word2, dtype='int32') test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX) test_labels_store=np.asarray(test_labels, dtype='int32') test_extra=np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size=len(test_sents_l) print ' test size: ', test_size vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings]) #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1) word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para=[conv_W, conv_b,conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = word1_mask, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1) term_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_input_r, input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_input_r, mask_matrix = word1_mask, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1) def_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_input_l, input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = sents_mask_l, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = 
T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1) def_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_input_r, input_tensor3=embed_input_l, input_tensor3_r = embed_input_r, mask_matrix = sents_mask_l, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1) if p_mode == 'conc': p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4*4*hidden_size[1] else: p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0) p_len =4*hidden_size[1] # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) # HL_input_size=p_len+1+1 # # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh) "form input to LR classifier" LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) LR_input_size=p_len+1+1 # LR_input = HL_layer_1.output # LR_input_size = hidden_size[1] U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para+LR_para #[init_embeddings] # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params) load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params) ''' 0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424 0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538 ''' test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look at this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_test_batches=test_size/batch_size n_test_remain = test_size%batch_size if n_test_remain!=0: test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] else: test_batch_start=list(np.arange(n_test_batches)*batch_size) # max_acc_dev=0.0 max_ap_test=0.0 max_ap_topk_test=0.0 max_f1=0.0 pred_labels =[] probs = [] gold_labels =[] error_sum=0.0 for idd, test_batch_id in enumerate(test_batch_start): # for each test batch pred_i, prob_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_word1[test_batch_id:test_batch_id+batch_size], test_word2[test_batch_id:test_batch_id+batch_size], test_word1_mask[test_batch_id:test_batch_id+batch_size], test_word2_mask[test_batch_id:test_batch_id+batch_size], test_extra[test_batch_id:test_batch_id+batch_size]) # error_sum+=error_i pred_labels+=list(pred_i) probs+=list(prob_i) print len(test_sents_l), len(probs) if n_test_remain !=0: probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:] print len(test_sents_l), len(probs) assert len(test_sents_l) == len(probs) assert sum(group_size_list) == len(probs) #max prob in group max_probs = [] prior_size = 0 for i in range(len(group_size_list)): sub_probs = probs[prior_size:prior_size+group_size_list[i]] prior_size += group_size_list[i] max_probs.append(max(sub_probs)) print len(group_size_list),len(max_probs),len(test_labels) assert len(test_labels) == len(max_probs) # test_acc=1.0-error_sum/(len(test_batch_start)) test_ap = apk(test_labels, max_probs, k=len(test_labels)) test_ap_top100 = apk(test_labels, max_probs, k=100) # if test_ap > max_ap_test: # max_ap_test=test_ap # store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params) # if test_ap_top100 > max_ap_topk_test: # max_ap_topk_test=test_ap_top100 print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
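# The evaluation above keeps one max probability per candidate group and
# scores the ranking with apk. A hedged sketch of what an apk(labels, scores, k)
# helper conventionally computes (average precision over the top-k ranked
# candidates); this is an assumption about the helper, not its actual code.
import numpy as np

def apk_sketch(labels, scores, k):
    order = np.argsort(scores)[::-1][:k]  # rank candidates by score, keep top k
    hits, precision_sum = 0, 0.0
    for rank, idx in enumerate(order, start=1):
        if labels[idx] == 1:              # relevant item found at this rank
            hits += 1
            precision_sum += hits / float(rank)
    return precision_sum / max(hits, 1)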
def __init__(self, K, conv_layer_sizes, hidden_layer_sizes, gamma): self.K = K lr = np.float32(2.5e-4) mu = np.float32(0) decay = np.float32(0.99) eps = np.float32(1e-10) # inputs and targets X = T.ftensor4('X') G = T.fvector('G') actions = T.ivector('actions') # create the graph self.conv_layers = [] num_input_filters = 4 # number of filters / color channels for num_output_filters, filtersz, stride in conv_layer_sizes: layer = ConvLayer(num_input_filters, num_output_filters, filtersz, stride) self.conv_layers.append(layer) num_input_filters = num_output_filters ##### debug ##### # Z = X / 255.0 # j = 0 # for layer in self.conv_layers: # Z = layer.forward(Z) # out = Z # op = theano.function(inputs=[X], outputs=out, allow_input_downcast=True) # test = op(np.random.randn(1, 4, IM_SIZE, IM_SIZE)) # print("output size after conv %d: %s" % (j, test.shape)) # j += 1 # get conv output size Z = X / 255.0 for layer in self.conv_layers: Z = layer.forward(Z) conv_out = Z.flatten(ndim=2) conv_out_op = theano.function(inputs=[X], outputs=conv_out, allow_input_downcast=True) test = conv_out_op(np.random.randn(1, 4, IM_SIZE, IM_SIZE)) flattened_output_size = test.shape[1] # build fully connected layers self.layers = [] M1 = flattened_output_size for M2 in hidden_layer_sizes: layer = HiddenLayer(M1, M2) self.layers.append(layer) M1 = M2 # final layer layer = HiddenLayer(M1, K, lambda x: x) self.layers.append(layer) # collect params for copy self.params = [] for layer in (self.conv_layers + self.layers): self.params += layer.params # calculate final output and cost Z = conv_out for layer in self.layers: Z = layer.forward(Z) Y_hat = Z selected_action_values = Y_hat[T.arange(actions.shape[0]), actions] cost = T.mean((G - selected_action_values)**2) # create train function # we need to ensure cache is updated before parameter update # by creating a list of new_caches # and using them in the parameter update grads = T.grad(cost, self.params) caches = [theano.shared(np.ones_like(p.get_value())) for p in self.params] new_caches = [decay*c + (np.float32(1) - decay)*g*g for c, g in zip(caches, grads)] c_update = [(c, new_c) for c, new_c in zip(caches, new_caches)] g_update = [ (p, p - lr*g / T.sqrt(new_c + eps)) for p, new_c, g in zip(self.params, new_caches, grads) ] updates = c_update + g_update # compile functions self.train_op = theano.function( inputs=[X, G, actions], updates=updates, allow_input_downcast=True ) self.predict_op = theano.function( inputs=[X], outputs=Y_hat, allow_input_downcast=True )
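# A minimal usage sketch for the class above, showing the Q-learning target
# that train_op's squared-error cost expects in G. The separate `target_network`
# copy and the replay-minibatch arrays are assumptions, not part of the
# original snippet.
import numpy as np

def train_step(network, target_network, states, actions, rewards, next_states, dones, gamma=0.99):
    next_q = target_network.predict_op(next_states)               # (N, K) action values
    G = rewards + gamma * np.max(next_q, axis=1) * (1.0 - dones)  # bootstrap unless terminal
    network.train_op(states, G.astype(np.float32), actions)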
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=1, activation_method="Sigmoid"): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.activation = T.nnet.sigmoid assert self.n_layers > 0 if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data # the data is presented as rasterized images self.x = T.matrix('x') # the targets are presented as a 1D vector of float values (regression) self.y = T.fvector('y') # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to changing the # weights of the MLP as well). During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=self.activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # it's arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shares weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a regression layer on top of the MLP self.logLayer = LinearRegression(input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs, l2=0, l1=0) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
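# A hedged sketch of the companion method such a DBN usually provides: one
# CD-k pretraining function per RBM. It assumes the RBM class exposes the
# standard get_cost_updates(lr, persistent, k) interface from the Theano
# tutorial; the actual class used here may differ.
def pretraining_functions(self, train_set_x, batch_size, k=1):
    index = T.lscalar('index')          # minibatch index
    learning_rate = T.scalar('lr')
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size
    fns = []
    for rbm in self.rbm_layers:
        # one-step contrastive divergence cost and updates for this RBM
        cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k)
        fns.append(theano.function(
            inputs=[index, theano.In(learning_rate, value=0.1)],
            outputs=cost,
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]}))
    return fns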
def RelationStackMaker(chips, params, graph=False, weighted=False, batched=False): if batched: emb_input = T.itensor3('emb_input') entities_tv = [T.fmatrix('enidx_'+str(i)).astype(theano.config.floatX) for i in range(params['num_entity'])] if graph: if weighted: masks = T.ftensor4('child_mask') else: masks = T.ftensor3('child_mask') else: masks = T.fmatrix('batch_mask') else: emb_input = T.imatrix('emb_input') entities_tv = [T.fvector('enidx_'+str(i)).astype(theano.config.floatX) for i in range(params['num_entity'])] if graph: if weighted: masks = T.ftensor3('child_mask') else: masks = T.fmatrix('child_mask') else: masks = None #print masks, type(masks), masks.ndim current_chip = Start(params['voc_size'], emb_input) print ('\n', 'Building Stack now', '\n', 'Start: ', params['voc_size'], 'out_tv dim:', current_chip.output_tv.ndim) instantiated_chips = stackLayers(chips, current_chip, params, entity_size=params['num_entity']) regularizable_params = computeLayers(instantiated_chips, current_chip, params, entities_input=entities_tv, mask=masks) ### Debug use: Get the attention co-efficiency and visualize. ### for c in instantiated_chips: if c[1].endswith('Entity_Att'): assert hasattr(c[0], 'att_wt_arry') assert hasattr(c[0], 'entity_tvs') attention_weights = c[0].att_wt_arry entity_tvs = c[0].entity_tvs current_chip = instantiated_chips[-1][0] if current_chip.output_tv.ndim == 2: pred_y = current_chip.output_tv #T.argmax(current_chip.output_tv, axis=1) else: pred_y = current_chip.output_tv #T.argmax(current_chip.output_tv) #, axis=1) gold_y = (current_chip.gold_y if hasattr(current_chip, 'gold_y') else None) # Show all parameters that would be needed in this system params_needed = calculate_params_needed(instantiated_chips) print ("Parameters Needed", params_needed) for k in params_needed: assert k in params, k print (k, params[k]) assert hasattr(current_chip, 'score') cost = current_chip.score #/ params['nsentences'] cost_arr = [cost] for layer in instantiated_chips[:-1]: if hasattr(layer[0], 'score'): print (layer[1]) cost += params['cost_coef'] * layer[0].score cost_arr.append(params['cost_coef'] * layer[0].score) grads = T.grad(cost, wrt=regularizable_params) #[params[k] for k in params if (hasattr(params[k], 'is_regularizable') and params[k].is_regularizable)]) print ('Regularizable parameters:') for k, v in params.items(): if hasattr(v, 'is_regularizable'): print (k, v, v.is_regularizable) if graph or batched: #return (emb_input, masks, entities_tv, attention_weights, entity_tvs, gold_y, pred_y, cost, grads, regularizable_params) return (emb_input, masks, entities_tv, gold_y, pred_y, cost, grads, regularizable_params) else: return (emb_input, entities_tv, gold_y, pred_y, cost, grads, regularizable_params)
def main(args): theano.config.optimizer = 'fast_compile' #theano.config.exception_verbosity='high' trial = int(args['trial']) pkl_name = 'dp_disall-sch_%d' % trial channel_name = 'mae' data_path = args['data_path'] save_path = args[ 'save_path'] #+'/aggVSdisag_distrib/'+datetime.datetime.now().strftime("%y-%m-%d_%H-%M") period = int(args['period']) n_steps = int(args['n_steps']) stride_train = int(args['stride_train']) stride_test = int(args['stride_test']) loadType = int(args['loadType']) flgMSE = int(args['flgMSE']) monitoring_freq = int(args['monitoring_freq']) epoch = int(args['epoch']) batch_size = int(args['batch_size']) x_dim = int(args['x_dim']) y_dim = int(args['y_dim']) z_dim = int(args['z_dim']) rnn_dim = int(args['rnn_dim']) k = int(args['num_k']) #a mixture of K Gaussian functions lr = float(args['lr']) origLR = lr debug = int(args['debug']) kSchedSamp = int(args['kSchedSamp']) typeActivFunc = args['typeActivFunc'] print "trial no. %d" % trial print "batch size %d" % batch_size print "learning rate %f" % lr print "saving pkl file '%s'" % pkl_name print "to the save path '%s'" % save_path print(str(windows)) q_z_dim = 500 p_z_dim = 500 p_x_dim = 500 x2s_dim = 200 y2s_dim = 200 z2s_dim = 200 lr_iterations = {0: lr} target_dim = k # as the different appliances are separated into theta_mu1, theta_mu2, etc., each one is created from k different Gaussians model = Model() Xtrain, ytrain, Xval, yval, Xtest, ytest, reader = fetch_redd( data_path, windows, appliances, numApps=-1, period=period, n_steps=n_steps, stride_train=stride_train, stride_test=stride_test, trainPer=0.5, valPer=0.25, testPer=0.25, typeLoad=loadType, flgAggSumScaled=1, flgFilterZeros=1) print(Xtrain.shape, Xval.shape, Xtest.shape, ytrain.shape, yval.shape, ytest.shape) print("Mean ", reader.meanTraining) print("Std", reader.stdTraining) instancesPlot = {0: [4]} train_data = Redd( name='train', prep='normalize', cond=True, # False #path=data_path, inputX=Xtrain, labels=ytrain) X_mean = train_data.X_mean X_std = train_data.X_std valid_data = Redd( name='valid', prep='normalize', cond=True, # False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xval, labels=yval) test_data = Redd( name='test', prep='normalize', cond=True, # False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xtest, labels=ytest) init_W = InitCell('rand') init_U = InitCell('ortho') init_b = InitCell('zeros') init_b_sig = InitCell('const', mean=0.6) x, mask, y, y_mask = train_data.theano_vars() scheduleSamplingMask = T.fvector('schedMask') x.name = 'x_original' if debug: x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=np.float32) temp = np.ones((15, batch_size), dtype=np.float32) temp[:, -2:] = 0. 
mask.tag.test_value = temp x_1 = FullyConnectedLayer(name='x_1', parent=['x_t'], parent_dim=[x_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) y_1 = FullyConnectedLayer(name='y_1', parent=['y_t'], parent_dim=[y_dim], nout=y2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_1 = FullyConnectedLayer(name='z_1', parent=['z_t'], parent_dim=[z_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) rnn = LSTM(name='rnn', parent=['x_1', 'z_1', 'y_1'], parent_dim=[x2s_dim, z2s_dim, y2s_dim], nout=rnn_dim, unit='tanh', init_W=init_W, init_U=init_U, init_b=init_b) phi_1 = FullyConnectedLayer(name='phi_1', parent=['x_1', 's_tm1', 'y_1'], parent_dim=[x2s_dim, rnn_dim, y2s_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_mu = FullyConnectedLayer(name='phi_mu', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) phi_sig = FullyConnectedLayer(name='phi_sig', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) prior_1 = FullyConnectedLayer(name='prior_1', parent=['x_1', 's_tm1'], parent_dim=[x2s_dim, rnn_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_mu = FullyConnectedLayer(name='prior_mu', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) prior_sig = FullyConnectedLayer(name='prior_sig', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_1 = FullyConnectedLayer(name='theta_1', parent=['z_1', 's_tm1'], parent_dim=[z2s_dim, rnn_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_mu1 = FullyConnectedLayer(name='theta_mu1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit=typeActivFunc, init_W=init_W, init_b=init_b) if (y_dim > 1): theta_mu2 = FullyConnectedLayer(name='theta_mu2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit=typeActivFunc, init_W=init_W, init_b=init_b) if (y_dim > 2): theta_mu3 = FullyConnectedLayer(name='theta_mu3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit=typeActivFunc, init_W=init_W, init_b=init_b) if (y_dim > 3): theta_mu4 = FullyConnectedLayer(name='theta_mu4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit=typeActivFunc, init_W=init_W, init_b=init_b) theta_sig1 = FullyConnectedLayer(name='theta_sig1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) if (y_dim > 1): theta_sig2 = FullyConnectedLayer(name='theta_sig2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) if (y_dim > 2): theta_sig3 = FullyConnectedLayer(name='theta_sig3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) if (y_dim > 3): theta_sig4 = FullyConnectedLayer(name='theta_sig4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) coeff1 = FullyConnectedLayer(name='coeff1', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) if (y_dim > 1): coeff2 = FullyConnectedLayer(name='coeff2', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) if (y_dim > 2): coeff3 = FullyConnectedLayer(name='coeff3', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, 
init_b=init_b) if (y_dim > 3): coeff4 = FullyConnectedLayer(name='coeff4', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) corr = FullyConnectedLayer(name='corr', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='tanh', init_W=init_W, init_b=init_b) binary = FullyConnectedLayer(name='binary', parent=['theta_1'], parent_dim=[p_x_dim], nout=1, unit='sigmoid', init_W=init_W, init_b=init_b) nodes = [ rnn, x_1, y_1, z_1, #dissag_pred, phi_1, phi_mu, phi_sig, prior_1, prior_mu, prior_sig, theta_1, theta_mu1, theta_sig1, coeff1 ] dynamicOutput = [None, None, None, None, None, None, None, None] if (y_dim > 1): nodes = nodes + [theta_mu2, theta_sig2, coeff2] dynamicOutput = dynamicOutput + [None, None, None, None ] #mu, sig, coef and pred if (y_dim > 2): nodes = nodes + [theta_mu3, theta_sig3, coeff3] dynamicOutput = dynamicOutput + [None, None, None, None] if (y_dim > 3): nodes = nodes + [theta_mu4, theta_sig4, coeff4] dynamicOutput = dynamicOutput + [None, None, None, None] params = OrderedDict() for node in nodes: if node.initialize() is not None: params.update(node.initialize()) params = init_tparams(params) s_0 = rnn.get_init_state(batch_size) x_1_temp = x_1.fprop([x], params) y_1_temp = y_1.fprop([y], params) output_fn = [s_0] + dynamicOutput output_fn_val = [s_0] + dynamicOutput[2:] print(len(output_fn), len(output_fn_val)) def inner_fn(x_t, y_t, scheduleSamplingMask, s_tm1): phi_1_t = phi_1.fprop([x_t, s_tm1, y_t], params) phi_mu_t = phi_mu.fprop([phi_1_t], params) phi_sig_t = phi_sig.fprop([phi_1_t], params) prior_1_t = prior_1.fprop([x_t, s_tm1], params) prior_mu_t = prior_mu.fprop([prior_1_t], params) prior_sig_t = prior_sig.fprop([prior_1_t], params) z_t = Gaussian_sample( phi_mu_t, phi_sig_t ) #in the original code it is gaussian. 
GMM is for the generation z_1_t = z_1.fprop([z_t], params) theta_1_t = theta_1.fprop([z_1_t, s_tm1], params) theta_mu1_t = theta_mu1.fprop([theta_1_t], params) theta_sig1_t = theta_sig1.fprop([theta_1_t], params) coeff1_t = coeff1.fprop([theta_1_t], params) ## prediction 1 y_pred = GMM_sampleY( theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t) tupleMulti = phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred if (y_dim > 1): theta_mu2_t = theta_mu2.fprop([theta_1_t], params) theta_sig2_t = theta_sig2.fprop([theta_1_t], params) coeff2_t = coeff2.fprop([theta_1_t], params) y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t) y_pred = T.concatenate([y_pred, y_pred2], axis=1) tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2) if (y_dim > 2): theta_mu3_t = theta_mu3.fprop([theta_1_t], params) theta_sig3_t = theta_sig3.fprop([theta_1_t], params) coeff3_t = coeff3.fprop([theta_1_t], params) y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t) y_pred = T.concatenate([y_pred, y_pred3], axis=1) tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3) if (y_dim > 3): theta_mu4_t = theta_mu4.fprop([theta_1_t], params) theta_sig4_t = theta_sig4.fprop([theta_1_t], params) coeff4_t = coeff4.fprop([theta_1_t], params) y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t) y_pred = T.concatenate([y_pred, y_pred4], axis=1) tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4) #s_t = rnn.fprop([[x_t, z_1_t, y_t], [s_tm1]], params) if (scheduleSamplingMask == 1): s_t = rnn.fprop([[x_t, z_1_t, y_t], [s_tm1]], params) else: y_t_aux = y_1.fprop([y_pred], params) s_t = rnn.fprop([[x_t, z_1_t, y_t_aux], [s_tm1]], params) return (s_t, ) + tupleMulti #corr_temp, binary_temp (otherResults, updates) = theano.scan( fn=inner_fn, sequences=[x_1_temp, y_1_temp, scheduleSamplingMask], outputs_info=output_fn) #[s_0, (None)] s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp,\ theta_mu1_temp, theta_sig1_temp, coeff1_temp, y_pred1_temp = otherResults[:9] restResults = otherResults[9:] for k, v in updates.iteritems(): k.default_update = v #s_temp = concatenate([s_0[None, :, :], s_temp[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0 theta_mu1_temp.name = 'theta_mu1' theta_sig1_temp.name = 'theta_sig1' coeff1_temp.name = 'coeff1' y_pred1_temp.name = 'disaggregation1' #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1) mse1 = T.mean((y_pred1_temp - y[:, :, 0].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae1 = T.mean( T.abs_(y_pred1_temp - y[:, :, 0].reshape((y.shape[0], y.shape[1], 1)))) mse1.name = 'mse1' mae1.name = 'mae1' kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) x_shape = x.shape y_shape = y.shape x_in = x.reshape((x_shape[0] * x_shape[1], -1)) y_in = y.reshape((y_shape[0] * y_shape[1], -1)) theta_mu1_in = theta_mu1_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig1_in = theta_sig1_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff1_in = coeff1_temp.reshape((x_shape[0] * x_shape[1], -1)) ddoutMSEA = [] ddoutYpreds = [y_pred1_temp] indexSepDynamic = 7 #plus two totalmse, totalmae totaMAE = T.copy(mae1) totaMSE = T.copy(mse1) mse2 = T.zeros((1, )) mae2 = T.zeros((1, )) mse3 = T.zeros((1, )) mae3 = T.zeros((1, )) mse4 = T.zeros((1, )) mae4 = T.zeros((1, )) if (y_dim > 1): theta_mu2_temp, theta_sig2_temp, coeff2_temp, y_pred2_temp = restResults[: 4] restResults = 
restResults[4:] theta_mu2_temp.name = 'theta_mu2' theta_sig2_temp.name = 'theta_sig2' coeff2_temp.name = 'coeff2' y_pred2_temp.name = 'disaggregation2' mse2 = T.mean((y_pred2_temp - y[:, :, 1].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae2 = T.mean( T.abs_(y_pred2_temp - y[:, :, 1].reshape((y.shape[0], y.shape[1], 1)))) mse2.name = 'mse2' mae2.name = 'mae2' theta_mu2_in = theta_mu2_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig2_in = theta_sig2_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff2_in = coeff2_temp.reshape((x_shape[0] * x_shape[1], -1)) argsGMM = theta_mu2_in, theta_sig2_in, coeff2_in ddoutMSEA = ddoutMSEA + [mse2, mae2] ddoutYpreds = ddoutYpreds + [y_pred2_temp] #totaMSE+=mse2 indexSepDynamic += 2 if (y_dim > 2): theta_mu3_temp, theta_sig3_temp, coeff3_temp, y_pred3_temp = restResults[:4] restResults = restResults[4:] theta_mu3_temp.name = 'theta_mu3' theta_sig3_temp.name = 'theta_sig3' coeff3_temp.name = 'coeff3' y_pred3_temp.name = 'disaggregation3' mse3 = T.mean((y_pred3_temp - y[:, :, 2].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae3 = T.mean( T.abs_(y_pred3_temp - y[:, :, 2].reshape((y.shape[0], y.shape[1], 1)))) mse3.name = 'mse3' mae3.name = 'mae3' theta_mu3_in = theta_mu3_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig3_in = theta_sig3_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff3_in = coeff3_temp.reshape((x_shape[0] * x_shape[1], -1)) argsGMM = argsGMM + (theta_mu3_in, theta_sig3_in, coeff3_in) ddoutMSEA = ddoutMSEA + [mse3, mae3] ddoutYpreds = ddoutYpreds + [y_pred3_temp] #totaMSE+=mse3 indexSepDynamic += 2 if (y_dim > 3): theta_mu4_temp, theta_sig4_temp, coeff4_temp, y_pred4_temp = restResults[:4] restResults = restResults[4:] theta_mu4_temp.name = 'theta_mu4' theta_sig4_temp.name = 'theta_sig4' coeff4_temp.name = 'coeff4' y_pred4_temp.name = 'disaggregation4' mse4 = T.mean((y_pred4_temp - y[:, :, 3].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae4 = T.mean( T.abs_(y_pred4_temp - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1)))) mse4.name = 'mse4' mae4.name = 'mae4' theta_mu4_in = theta_mu4_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig4_in = theta_sig4_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff4_in = coeff4_temp.reshape((x_shape[0] * x_shape[1], -1)) argsGMM = argsGMM + (theta_mu4_in, theta_sig4_in, coeff4_in) ddoutMSEA = ddoutMSEA + [mse4, mae4] ddoutYpreds = ddoutYpreds + [y_pred4_temp] #totaMSE+=mse4 indexSepDynamic += 2 totaMSE = (mse1 + mse2 + mse3 + mse4) / y_dim totaMSE.name = 'mse' totaMAE = (mae1 + mae2 + mae3 + mae4) / y_dim totaMAE.name = 'mae' recon = GMMdisagMulti( y_dim, y_in, theta_mu1_in, theta_sig1_in, coeff1_in, *argsGMM ) # BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in) recon = recon.reshape((x_shape[0], x_shape[1])) recon.name = 'gmm_out' recon_term = recon.sum(axis=0).mean() recon_term.name = 'recon_term' #kl_temp = kl_temp * mask kl_term = kl_temp.sum(axis=0).mean() kl_term.name = 'kl_term' #nll_upper_bound_0 = recon_term + kl_term #nll_upper_bound_0.name = 'nll_upper_bound_0' if (flgMSE == 1): nll_upper_bound = recon_term + kl_term + totaMSE else: nll_upper_bound = recon_term + kl_term nll_upper_bound.name = 'nll_upper_bound' ###################### model.inputs = [x, mask, y, y_mask, scheduleSamplingMask] model.params = params model.nodes = nodes optimizer = Adam(lr=lr) header = "epoch,log,kl,nll_upper_bound,mse,mae\n" extension = [ 
GradientClipping(batch_size=batch_size), EpochCount(epoch, save_path, header), Monitoring( freq=monitoring_freq, ddout=[ nll_upper_bound, recon_term, kl_term, totaMSE, totaMAE, mse1, mae1 ] + ddoutMSEA + ddoutYpreds, indexSep=indexSepDynamic, indexDDoutPlot=[13], # adding indexes of ddout for the plotting #, (6,y_pred_temp) instancesPlot=instancesPlot, #0-150 data=[Iterator(valid_data, batch_size)], savedFolder=save_path), Picklize(freq=monitoring_freq, path=save_path), EarlyStopping(freq=monitoring_freq, path=save_path, channel=channel_name), WeightNorm() ] mainloop = Training( name=pkl_name, data=Iterator(train_data, batch_size), model=model, optimizer=optimizer, cost=nll_upper_bound, outputs=[recon_term, kl_term, nll_upper_bound, totaMSE, totaMAE], n_steps=n_steps, extension=extension, lr_iterations=lr_iterations, k_speedOfconvergence=kSchedSamp) mainloop.run() ''' data=Iterator(test_data, batch_size) test_fn = theano.function(inputs=[x, y],#[x, y], #givens={x:Xtest}, #on_unused_input='ignore', #z=( ,200,1) allow_input_downcast=True, outputs=[prediction_val, recon_term_val, totaMSE_val, totaMAE_val, mse1_val,mse2_val,mse3_val,mse4_val, mae1_val,mae2_val,mae3_val,mae4_val, #unnormalized mae and mse 16 items# relErr1_val,relErr2_val,relErr3_val,relErr4_val, propAssigned1_val, propAssigned2_val,propAssigned3_val,propAssigned4_val], updates=updates_val ) testOutput = [] testMetrics2 = [] numBatchTest = 0 for batch in data: outputGeneration = test_fn(batch[0], batch[2]) testOutput.append(outputGeneration[1:12]) #before 36 including unnormalized metrics testMetrics2.append(outputGeneration[12:]) #{0:[4,20], 2:[5,10]} #if (numBatchTest==0): plt.figure(1) plt.plot(np.transpose(outputGeneration[0],[1,0,2])[4]) plt.savefig(save_path+"/vrnn_dis_generated{}_Pred_0-4".format(numBatchTest)) plt.clf() plt.figure(2) plt.plot(np.transpose(batch[2],[1,0,2])[4]) plt.savefig(save_path+"/vrnn_dis_generated{}_RealDisag_0-4".format(numBatchTest)) plt.clf() plt.figure(3) plt.plot(np.transpose(batch[0],[1,0,2])[4]) plt.savefig(save_path+"/vrnn_dis_generated{}_Realagg_0-4".format(numBatchTest)) plt.clf() numBatchTest+=1 testOutput = np.asarray(testOutput) testMetrics2 = np.asarray(testMetrics2) print(testOutput.shape) print(testMetrics2.shape) testOutput[:,19:] = 1000 * testOutput[:,19:] # kwtts a watts recon_test = testOutput[:, 0].mean() mse_test = testOutput[:, 1].mean() mae_test = testOutput[:, 2].mean() mse1_test = testOutput[:, 3].mean() mae1_test = testOutput[:, 7].mean() mse2_test = testOutput[:, 4].mean() mae2_test = testOutput[:, 8].mean() mse3_test = testOutput[:, 5].mean() mae3_test = testOutput[:, 9].mean() mse4_test = testOutput[:, 6].mean() mae4_test = testOutput[:, 10].mean() print(testOutput[:,3:11].mean(),testOutput[:,11:19].mean()) relErr1_test = testMetrics2[:,0].mean() relErr2_test = testMetrics2[:,1].mean() relErr3_test = testMetrics2[:,2].mean() relErr4_test = testMetrics2[:,3].mean() propAssigned1_test = testMetrics2[:, 8].mean() propAssigned2_test = testMetrics2[:, 9].mean() propAssigned3_test = testMetrics2[:, 10].mean() propAssigned4_test = testMetrics2[:, 11].mean() ''' fLog = open(save_path + '/output.csv', 'w') fLog.write(str(lr_iterations) + "\n") fLog.write(str(appliances) + "\n") fLog.write(str(windows) + "\n\n") fLog.write("q_z_dim,p_z_dim,p_x_dim,x2s_dim,y2s_dim,z2s_dim\n") fLog.write("{},{},{},{},{},{}\n".format(q_z_dim, p_z_dim, p_x_dim, x2s_dim, y2s_dim, z2s_dim)) fLog.write("epoch,log,kl,mse1,mse2,mse3,mse4,mae1,mae2,mae3,mae4\n") for i, item in 
enumerate(mainloop.trainlog.monitor['nll_upper_bound']): e, f, g, h, j, k, l, n, p, q, r, s, t, u = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ep = mainloop.trainlog.monitor['epoch'][i] a = mainloop.trainlog.monitor['recon_term'][i] b = mainloop.trainlog.monitor['kl_term'][i] d = mainloop.trainlog.monitor['mse1'][i] m = mainloop.trainlog.monitor['mae1'][i] if (y_dim > 1): e = mainloop.trainlog.monitor['mse2'][i] n = mainloop.trainlog.monitor['mae2'][i] if (y_dim > 2): f = mainloop.trainlog.monitor['mse3'][i] p = mainloop.trainlog.monitor['mae3'][i] if (y_dim > 3): g = mainloop.trainlog.monitor['mse4'][i] q = mainloop.trainlog.monitor['mae4'][i] fLog.write( "{:d},{:.2f},{:.2f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n" .format(ep, a, b, d, e, f, g, m, n, p, q))
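# For reference, a hedged sketch of the KL term used in the cost above: the
# closed-form KL divergence between two diagonal Gaussians,
# KL(N(mu_q, sig_q^2) || N(mu_p, sig_p^2)), summed over the feature axis.
# The project's KLGaussianGaussian helper may differ in reduction details.
def kl_gaussian_gaussian_sketch(mu_q, sig_q, mu_p, sig_p):
    return T.sum(T.log(sig_p) - T.log(sig_q)
                 + (sig_q**2 + (mu_q - mu_p)**2) / (2.0 * sig_p**2)
                 - 0.5, axis=-1)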
def __init__( self, model, dataset, train, percept_preprocessor, action_map, base_dir, model_pickle_path, save_rate=100, epsilon=1, epsilon_anneal_frames=1000000, epsilon_end=0.1, discount_factor=0.8, k=4, ): # Validate and store parameters assert(model) self.model = model assert(dataset) self.dataset = dataset assert(train) self.train = train assert(percept_preprocessor) self.percept_preprocessor = percept_preprocessor assert(action_map and type(action_map) == dict) self.action_map = action_map assert(os.path.exists(base_dir)) self.base_dir = base_dir assert(os.path.exists(os.path.dirname(model_pickle_path))) self.model_pickle_path = model_pickle_path assert(save_rate > 0) self.save_rate = save_rate assert(discount_factor > 0) if (discount_factor >= 1): log.warning("Discount factor >= 1, learning may diverge.") self.discount_factor = discount_factor assert(epsilon >= 0 and epsilon <= 1) self.epsilon = epsilon assert(epsilon_anneal_frames >= 0) self.epsilon_anneal_frames = epsilon_anneal_frames assert(epsilon_end >= 0) self.epsilon_end = epsilon_end self.epsilon_annealing_rate = 0 if self.epsilon_anneal_frames > 0: self.epsilon_annealing_rate = float(self.epsilon - self.epsilon_end) self.epsilon_annealing_rate /= float(self.epsilon_anneal_frames) log.info('Epsilon annealing rate: %0.10f' % self.epsilon_annealing_rate) assert(k > 0) self.k = k self.train.dataset = self # How many actual actions does RL-Glue/ALE support? Can we query the available actions # for a given game and make this part more efficient? Using 20 for now. self.action_log = {i: 0 for i in range(20)} # Init helper member variables self.action_count = 0 self.reward = 0 # Accumulator for reward values # Init frame memory self.frame_memory = col.deque(maxlen=self.k) # Compile action function log.info('BASIC AGENT: Compiling action function...') phi_eq = T.tensor4() q_eq = self.model.fprop(phi_eq) action_eq = T.argmax(q_eq, axis=1) self.action_func = function([phi_eq], action_eq) log.info('Done.') # Compile max q function log.info('BASIC AGENT: Compiling max q function...') max_action_eq = T.max(q_eq, axis=1) self.max_q_func = function([phi_eq], max_action_eq) log.info('Done.') # Compile y (Q-learning target) function log.info('BASIC AGENT: Compiling y function...') r = T.fvector('r') gamma = T.fscalar('gamma') y = r + gamma*max_action_eq self.y_func = function([r, gamma, phi_eq], y) log.info('Done.')
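# A hedged sketch of the epsilon-greedy step the compiled functions above
# support, with the linear annealing implied by epsilon_annealing_rate. It
# assumes the action_map keys are the legal action indices; not taken
# verbatim from the original agent.
import numpy as np

def choose_action(self, phi):
    # phi: (1, k, height, width) stack of the last k preprocessed frames
    if np.random.rand() < self.epsilon:
        action = np.random.choice(list(self.action_map.keys()))  # explore
    else:
        action = int(self.action_func(phi)[0])                   # exploit: argmax_a Q(phi, a)
    # linear annealing toward epsilon_end
    self.epsilon = max(self.epsilon_end, self.epsilon - self.epsilon_annealing_rate)
    return action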
def train_mlprnn(weight_path=sys.argv[1], file_name1=sys.argv[2], L1_reg=0.0, L2_reg=0.0000, path_name='/exports/work/inf_hcrc_cstr_udialogue/siva/data/'): voc_list = Vocabulary(path_name + 'train') voc_list.vocab_create() vocab = voc_list.vocab vocab_size = voc_list.vocab_size dataprovider_train = DataProvider(path_name + 'train', vocab, vocab_size) dataprovider_valid = DataProvider(path_name + 'valid', vocab, vocab_size) dataprovider_test = DataProvider(path_name + 'test', vocab, vocab_size) print '..building the model' #symbolic variables for input, target vector and batch index index = T.lscalar('index') x1 = T.fvector('x1') x2 = T.fvector('x2') x3 = T.fvector('x3') ht1 = T.fvector('ht1') y = T.ivector('y') learning_rate = T.fscalar('learning_rate') #theano shared variables for train, valid and test train_set_x1 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) train_set_x2 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) train_set_x3 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) train_set_y = theano.shared(numpy.empty((1), dtype='int32'), allow_downcast=True) valid_set_x1 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) valid_set_x2 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) valid_set_x3 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) valid_set_y = theano.shared(numpy.empty((1), dtype='int32'), allow_downcast=True) test_set_x1 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) test_set_x2 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) test_set_x3 = theano.shared(numpy.empty((1), dtype='float32'), allow_downcast=True) test_set_y = theano.shared(numpy.empty((1), dtype='int32'), allow_downcast=True) rng = numpy.random.RandomState() classifier = MLP_RNN(rng=rng, input1=x1, input2=x2, input3=x3, initial_hidden=ht1, n_in=vocab_size, fea_dim=int(sys.argv[3]), context_size=2, n_hidden=int(sys.argv[4]), n_out=vocab_size) hidden_state = theano.shared( numpy.empty((int(sys.argv[4]), ), dtype='float32')) cost = classifier.cost(y) #constructor for learning rate class learnrate_schedular = LearningRateNewBob(start_rate = 0.05, scale_by=.5, max_epochs=9999,\ min_derror_ramp_start=.01, min_derror_stop=.01, init_error=100.) 
log_likelihood = classifier.sum(y) likelihood = classifier.likelihood(y) #test_model test_model = theano.function(inputs = [], outputs = [log_likelihood, likelihood], \ givens = {x1: test_set_x1, x2: test_set_x2, x3: test_set_x3, ht1: hidden_state, y: test_set_y}) #validation_model validate_model = theano.function(inputs = [], outputs = [log_likelihood], \ givens = {x1: valid_set_x1, x2: valid_set_x2, x3: valid_set_x3, ht1: hidden_state, y: valid_set_y}) gradient_param = [] #calculates the gradient of cost with respect to parameters for param in classifier.params: gradient_param.append(T.cast(T.grad(cost, param), 'float32')) updates = [] #updates the parameters for param, gradient in zip(classifier.params, gradient_param): updates.append((param, param - learning_rate * gradient)) #training_model train_model = theano.function(inputs = [learning_rate], outputs = [cost, classifier.RNNhiddenlayer.output], updates = updates, \ givens = {x1: train_set_x1, x2: train_set_x2, x3: train_set_x3, ht1: hidden_state, y: train_set_y}) f = h5py.File(weight_path + file_name1, "r") for i in xrange(0, classifier.no_of_layers, 2): path_modified = '/' + 'MLP' + str(2) + '/layer' + str(i / 2) if i == 4: classifier.MLPparams[i].set_value(numpy.asarray(f[path_modified + "/W"].value, dtype='float32'), borrow=True) else: classifier.MLPparams[i].set_value(numpy.asarray(f[path_modified + "/W"].value, dtype='float32'), borrow=True) classifier.MLPparams[i + 1].set_value(numpy.asarray( f[path_modified + "/b"].value, dtype='float32'), borrow=True) f.close() print '.....training' best_valid_loss = numpy.inf start_time = time.time() while (learnrate_schedular.get_rate() != 0): print 'learning_rate:', learnrate_schedular.get_rate() print 'epoch_number:', learnrate_schedular.epoch frames_showed, progress = 0, 0 start_epoch_time = time.time() dataprovider_train.reset() for feats_lab_tuple in dataprovider_train: features, labels = feats_lab_tuple if labels is None or features is None: continue frames_showed += features.shape[0] for temp, i in zip(features, xrange(len(labels))): temp_features1 = numpy.zeros(vocab_size, dtype='float32') temp_features2 = numpy.zeros(vocab_size, dtype='float32') temp_features3 = numpy.zeros(vocab_size, dtype='float32') temp_features1[temp[0]] = 1 temp_features2[temp[1]] = 1 temp_features3[temp[1]] = 1 # note: indexes temp[1] like temp_features2; temp[2] may have been intended train_set_x1.set_value(numpy.asarray(temp_features1, dtype='float32'), borrow=True) train_set_x2.set_value(numpy.asarray(temp_features2, dtype='float32'), borrow=True) train_set_x3.set_value(numpy.asarray(temp_features3, dtype='float32'), borrow=True) train_set_y.set_value(numpy.asarray([labels[i]], dtype='int32'), borrow=True) out = train_model( numpy.array(learnrate_schedular.get_rate(), dtype='float32')) hidden_state.set_value(numpy.asarray(out[1], dtype='float32'), borrow=True) progress += 1 if progress % 10000 == 0: end_time_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, frames_showed,(end_time_progress-start_epoch_time)) train_set_x1.set_value(numpy.empty((1), dtype='float32')) train_set_x2.set_value(numpy.empty((1), dtype='float32')) train_set_x3.set_value(numpy.empty((1), dtype='float32')) train_set_y.set_value(numpy.empty((1), dtype='int32')) end_time_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, frames_showed,(end_time_progress-start_epoch_time)) print 'Validating...' 
valid_losses = [] log_likelihood = [] valid_frames_showed, progress = 0, 0 start_valid_time = time.time() # it is also stop of training time dataprovider_valid.reset() for feats_lab_tuple in dataprovider_valid: features, labels = feats_lab_tuple if labels is None or features is None: continue valid_frames_showed += features.shape[0] for temp, i in zip(features, xrange(len(labels))): temp_features1 = numpy.zeros(vocab_size, dtype='float32') temp_features2 = numpy.zeros(vocab_size, dtype='float32') temp_features3 = numpy.zeros(vocab_size, dtype='float32') temp_features1[temp[0]] = 1 temp_features2[temp[1]] = 1 temp_features3[temp[1]] = 1 valid_set_x1.set_value(numpy.asarray(temp_features1, dtype='float32'), borrow=True) valid_set_x2.set_value(numpy.asarray(temp_features2, dtype='float32'), borrow=True) valid_set_x3.set_value(numpy.asarray(temp_features3, dtype='float32'), borrow=True) valid_set_y.set_value(numpy.asarray([labels[i]], dtype='int32'), borrow=True) out = validate_model() #error_rate = out[0] likelihoods = out[0] #valid_losses.append(error_rate) log_likelihood.append(likelihoods) valid_set_x1.set_value(numpy.empty((1), 'float32')) valid_set_y.set_value(numpy.empty((1), 'int32')) progress += 1 if progress % 1000 == 0: end_time_valid_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, valid_frames_showed, end_time_valid_progress - start_valid_time) end_time_valid_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, valid_frames_showed, end_time_valid_progress - start_valid_time) #this_validation_loss = numpy.mean(valid_losses) entropy = (-numpy.sum(log_likelihood) / valid_frames_showed) print entropy, numpy.sum(log_likelihood) if entropy < best_valid_loss: learning_rate = learnrate_schedular.get_next_rate(entropy) best_valid_loss = entropy else: learnrate_schedular.rate = 0.0 end_time = time.time() print 'The fine tuning ran for %.2fm' % ((end_time - start_time) / 60.) print 'Testing...' log_likelihood = [] likelihoods = [] test_frames_showed, progress = 0, 0 start_test_time = time.time() # it is also stop of training time dataprovider_test.reset() for feats_lab_tuple in dataprovider_test: features, labels = feats_lab_tuple if labels is None or features is None: continue test_frames_showed += features.shape[0] for temp, i in zip(features, xrange(len(labels))): temp_features1 = numpy.zeros(vocab_size, dtype='float32') temp_features2 = numpy.zeros(vocab_size, dtype='float32') temp_features3 = numpy.zeros(vocab_size, dtype='float32') temp_features1[temp[0]] = 1 temp_features2[temp[1]] = 1 temp_features3[temp[1]] = 1 test_set_x1.set_value(numpy.asarray(temp_features1, dtype='float32'), borrow=True) test_set_x2.set_value(numpy.asarray(temp_features2, dtype='float32'), borrow=True) test_set_x3.set_value(numpy.asarray(temp_features3, dtype='float32'), borrow=True) test_set_y.set_value(numpy.asarray([labels[i]], dtype='int32'), borrow=True) out = test_model() log_likelihood.append(out[0]) likelihoods.append(out[1]) progress += 1 if progress % 1000 == 0: end_time_test_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, test_frames_showed, end_time_test_progress - start_test_time) end_time_test_progress = time.time() print 'PROGRESS: Processed %i bunches (%i frames), TIME: %f in seconds'\ %(progress, test_frames_showed, end_time_test_progress - start_test_time) print numpy.sum(log_likelihood)
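# Side note on the stopping criterion above: `entropy` is the mean negative
# log-likelihood per frame, so the conventional language-model perplexity
# follows directly. A one-line sketch, assuming natural-log likelihoods as
# summed in the loops above:
def perplexity_sketch(log_likelihood, frames_showed):
    return numpy.exp(-numpy.sum(log_likelihood) / frames_showed)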
def __theano_build__(self): E, W, U, V = self.E, self.W, self.U, self.V x = T.fvector('x') y = T.fvector('y') # initial hidden vector initial_hidden_vector = np.zeros(self.hidden_dim) def calculate(x, h_t_prev, E, W, U, V): x_t = T.dot(E,x) z_t = T.nnet.sigmoid(T.dot(W[0], x_t) + U[0].dot(h_t_prev)) r_t = T.nnet.sigmoid(T.dot(W[1], x_t) + U[1].dot(h_t_prev)) _h_t = T.tanh(T.dot(W[2], x_t) + U[2].dot(h_t_prev * r_t)) h_t = (T.ones_like(z_t) - z_t) * h_t_prev + z_t * _h_t # softmax returns a matrix with one row only; take the row we want o_t = T.nnet.softmax(V.dot(h_t))[0][0] # return the updated hidden state so it carries over to the next step return [o_t, h_t] [o, h] , updates = theano.scan( calculate, # outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))], outputs_info=[None, initial_hidden_vector], non_sequences = [E, W, U, V], sequences=x, ) prediction = T.argmax(o, axis=0) prediction_error = T.sum(T.nnet.categorical_crossentropy(o, y)) # Total cost (Regularization can be done here) cost = prediction_error # gradients dE = T.grad(cost, E) dW = T.grad(cost, W) dU = T.grad(cost, U) dV = T.grad(cost, V) # assign functions self.predict = theano.function([x], o) self.prediction_class = theano.function([x], prediction) self.c_error = theano.function([x,y], cost) self.bptt = theano.function([x, y], [dW, dU, dV]) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates mE = decay * self.mE + (1 - decay) * dE ** 2 mU = decay * self.mU + (1 - decay) * dU ** 2 mW = decay * self.mW + (1 - decay) * dW ** 2 mV = decay * self.mV + (1 - decay) * dV ** 2 self.sgd_step = theano.function( [x, y, learning_rate, theano.In(decay, value=0.9)], [], updates = [ (E, E - learning_rate * dE / T.sqrt(mE + 1e-6)), (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)), (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)), (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)), # persist the rmsprop caches so they keep accumulating across steps (self.mE, mE), (self.mU, mU), (self.mW, mW), (self.mV, mV) ] )
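# A minimal usage sketch for the functions compiled above; X_seqs/Y_seqs are
# illustrative lists of one-hot-encoded sequences, not data from the source.
def train_epoch(model, X_seqs, Y_seqs, learning_rate=0.005, decay=0.9):
    total_cost = 0.0
    for x_seq, y_seq in zip(X_seqs, Y_seqs):
        total_cost += model.c_error(x_seq, y_seq)           # forward-pass cost
        model.sgd_step(x_seq, y_seq, learning_rate, decay)  # rmsprop update
    return total_cost / len(X_seqs)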
#Pi model variables: if model.network_type=="pi": input_b_var = T.tensor3('inputs_b') mask_train=T.vector('mask_train') unsup_weight_var = T.scalar('unsup_weight') elif model.network_type=="tempens": #tempens model variables: z_target_var = T.matrix('z_targets') mask_train = T.vector('mask_train') unsup_weight_var = T.scalar('unsup_weight') learning_rate_var = T.scalar('learning_rate') adam_beta1_var = T.scalar('adam_beta1') #negative loss negative_loss_alpha=T.fvector("negative_loss_alpha") negative_loss_lamda=T.fscalar("negative_loss_lamda") #Keywords-attention input_root=T.fmatrix("input_root") input_e1=T.fmatrix("input_e1") input_e2=T.fmatrix("input_e2") """ 2. Build the GRU network with ADAM """ gru_network,l_in,l_mask,l_gru_forward,l_split_cnn=model.bulit_gru(input_var,mask_var,input_root,input_e1,input_e2) #mask_train_input: "1" means the sample is used; "0" means it is masked out.
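# For the "tempens" branch above, a hedged sketch of how z_targets is
# conventionally maintained outside the graph (temporal ensembling, Laine &
# Aila 2017): an exponential moving average of past epoch predictions with a
# startup bias correction. Z, epoch_preds, and alpha are illustrative names,
# not variables from this script.
import numpy as np

def update_ensemble_targets(Z, epoch_preds, epoch, alpha=0.6):
    Z = alpha * Z + (1.0 - alpha) * epoch_preds   # accumulate the EMA of predictions
    z_targets = Z / (1.0 - alpha ** (epoch + 1))  # correct the zero-initialization bias
    return Z, z_targets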
def policy_network(state): input_state = InputLayer(input_var=state, shape=(None, n_input)) dense_1 = DenseLayer(input_state, num_units=n_input, nonlinearity=tanh) dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh) probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax) return probs X_state = T.fmatrix() X_action = T.bvector() X_reward = T.fvector() X_action_hot = to_one_hot(X_action, n_output) prob_values = policy_network(X_state) policy_ = get_output(prob_values) policy = theano.function(inputs=[X_state], outputs=policy_, allow_input_downcast=True) loss = categorical_crossentropy(policy_, X_action_hot) * X_reward loss = loss.mean() params = get_all_params(prob_values)
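# A hedged sketch of the discounted-return vector usually fed to X_reward in
# this REINFORCE-style loss; gamma and the variance-reducing standardization
# are conventional choices, not taken from the source.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    G = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # G_t = r_t + gamma * G_{t+1}
        G[t] = running
    return (G - G.mean()) / (G.std() + 1e-8)    # standardize to cut gradient variance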
def test_cudnn_softmax_grad(self): if not cuda.dnn.dnn_available(): raise SkipTest(cuda.dnn.dnn_available.msg) def cmp(n, m, f, f_gpu): data = numpy.arange(n * m, dtype='float32').reshape(n, m) gdata = numpy.asarray(data)[:, :, None, None] out = f(data) gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0] assert numpy.allclose(out, gout), numpy.absolute(out - gout) x = T.matrix('x', 'float32') x_gpu = T.tensor4('x_gpu', 'float32') f_z = T.nnet.softmax f_gpu = theano.sandbox.cuda.dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel') # Verify the grad operation dims = (2, 3, 4, 5) gdata = numpy.arange(numpy.product(dims), dtype='float32').reshape(dims) T.verify_grad(f_gpu, [gdata], rng=numpy.random, mode=mode_with_gpu) def check_types(graph, graph_gpu): self._check_types(graph, graph_gpu, -1, type(f_z), theano.sandbox.cuda.dnn.GpuDnnSoftmax) def check_types_opt(graph, graph_gpu): assert isinstance(graph.maker.fgraph.toposort()[-1].op, type(f_z)) assert len([ n for n in graph_gpu.maker.fgraph.toposort() if isinstance(n.op, theano.sandbox.cuda.dnn.GpuDnnSoftmax) ]) == 1 # Verify that the CPU and GPU implementations return the same results # up to a tolerance. self._test_softmax(x, x_gpu, f_z, f_gpu, cmp, mode_with_gpu, check_types) mode_w_cudnn = mode_with_gpu.including("cudnn") self._test_softmax(x, x, f_z, f_z, self._cmp, mode_w_cudnn, check_types_opt) # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is # applied when cudnn is required y = T.fvector('y') f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y), mode=mode_with_gpu) sorted_f = f.maker.fgraph.toposort() assert (len([ i for i in sorted_f if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad) ]) == 1) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 0) # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization is not # applied when cudnn is excluded or not available mode_wo_cudnn = mode_with_gpu.excluding("cudnn") y = T.fvector('y') f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y), mode=mode_wo_cudnn) sorted_f = f.maker.fgraph.toposort() assert (len([ i for i in sorted_f if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad) ]) == 0) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 1) # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not # crash with manual graph y = T.fvector('y') o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2) f = theano.function([y], o, mode=mode_with_gpu) sorted_f = f.maker.fgraph.toposort() assert (len([ i for i in sorted_f if isinstance(i.op, theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad) ]) == 1) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 0)
def MultitaskRelationStackMaker(Shared, Classifiers, params, num_tasks, graph=False, weighted=False, batched=False): if batched: emb_inputs = [T.itensor3('emb_input_'+str(i)) for i in range(num_tasks)] entities_tv = [[T.fmatrix('enidx_'+str(j)+'_t_'+str(i)) for j in range(params['num_entity_d'+str(i)])] for i in range(num_tasks)] if graph: if weighted: masks = [T.ftensor4('child_mask_d'+str(i)) for i in range(num_tasks)] else: masks = [T.ftensor3('child_mask_d'+str(i)) for i in range(num_tasks)] else: masks = [T.fmatrix('batch_mask_d'+str(i)) for i in range(num_tasks)] else: emb_inputs = [T.imatrix('emb_input_'+str(i)) for i in range(num_tasks)] entities_tv = [[T.fvector('enidx_'+str(j)+'_t_'+str(i)) for j in range(params['num_entity_d'+str(i)])] for i in range(num_tasks)] if graph: if weighted: masks = [T.ftensor3('child_mask_d'+str(i)) for i in range(num_tasks)] else: masks = [T.fmatrix('child_mask_d'+str(i)) for i in range(num_tasks)] else: masks = None current_chip = Start(params['voc_size'], None) instantiated_chips = stackLayers(Shared, current_chip, params) print ('Building Classifiers for tasks, input dim:', current_chip.out_dim) pred_ys = [] gold_ys = [] costs_arr = [] grads_arr = [] regularizable_param_arr = [] global_regularizable_params = [] for i, clsfier in enumerate(Classifiers): #feature_size = len(params['features2idx_dicts'][i]) #params['feature_size_'+str(i)] current_chip = instantiated_chips[-1][0] decoder_chips = stackLayers(clsfier, current_chip, params, entity_size=params['num_entity_d'+str(i)]) ## Note: this implementation only uses the LSTM hidden layer temp_chips = instantiated_chips + decoder_chips init_chip = Start(params['voc_size'], emb_inputs[i]) if batched: regularizable_params = computeLayers(temp_chips, init_chip, params, entities_input=entities_tv[i], mask=masks[i]) else: regularizable_params = computeLayers(temp_chips, init_chip, params, entities_input=entities_tv[i]) global_regularizable_params.extend(regularizable_params) regularizable_param_arr.append(regularizable_params) #task_chips.append(temp_chips) current_chip = temp_chips[-1][0] if current_chip.output_tv.ndim == 2: pred_ys.append(current_chip.output_tv) #T.argmax(current_chip.output_tv, axis=1)) else: pred_ys.append(current_chip.output_tv) #T.argmax(current_chip.output_tv, axis=0)) gold_ys.append(current_chip.gold_y) assert hasattr(current_chip, 'score') cost = current_chip.score costs_arr.append(cost) #/params['nsentences'] grads_arr.append( T.grad(cost, wrt=regularizable_params) ) # Show all parameters that would be needed in this system params_needed = ['voc_size', 'feature_size_'+str(i)] params_needed += calculate_params_needed(temp_chips) #cost = sum(costs_arr) #global_regularizable_params = list(set(global_regularizable_params)) #grads = T.grad(cost, # wrt=global_regularizable_params) print ('The joint model regularizable parameters:') for k, v in params.items(): if hasattr(v, 'is_regularizable'): print (k, v, v.is_regularizable) #return (emb_inputs, entities_tv, gold_ys, pred_ys, costs_arr, cost, grads_arr, grads, regularizable_param_arr, global_regularizable_params) if batched or graph: return (emb_inputs, entities_tv, masks, gold_ys, pred_ys, costs_arr, grads_arr, regularizable_param_arr) else: return (emb_inputs, entities_tv, gold_ys, pred_ys, costs_arr, grads_arr, regularizable_param_arr)
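# --- Added illustration (not from the original code): a hedged sketch of how
# --- the tensors returned by MultitaskRelationStackMaker (batched/graph case)
# --- might be compiled into one plain-SGD training function per task.
# --- `Shared`, `Classifiers`, `params`, `num_tasks`, and `lr` are assumed to
# --- exist; the pairing of grads with params follows the builder above.
import theano

lr = 0.05
(emb_inputs, entities_tv, masks, gold_ys, pred_ys,
 costs_arr, grads_arr, reg_param_arr) = MultitaskRelationStackMaker(
     Shared, Classifiers, params, num_tasks, graph=True, batched=True)
train_fns = []
for i in range(num_tasks):
    sgd_updates = [(p, p - lr * g)
                   for p, g in zip(reg_param_arr[i], grads_arr[i])]
    train_fns.append(theano.function(
        [emb_inputs[i]] + entities_tv[i] + [masks[i], gold_ys[i]],
        costs_arr[i], updates=sgd_updates, allow_input_downcast=True))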
def fit(self):
    index = T.lscalar('index')
    # create shared data-sets in case of mini-batch training
    train_X = self.shared_dataset(self.X_dat)
    train_y = self.shared_dataset(self.y_dat)
    test_X = self.shared_dataset(self.X_test)
    if self.batch_size is not None:
        n_train_batches = train_X.get_value(borrow=True).shape[0] / self.batch_size
        n_test_batches = test_X.get_value(borrow=True).shape[0] / self.batch_size
    X = T.matrix()
    if self.linear_regression:
        Y = T.fvector()
    else:
        Y = T.matrix()
    if self.linear_regression:
        # initialize weights for the parameters (linear regression)
        self.w = self.initialize_weights((self.X_dat.shape[1]),
                                         self.X_dat.shape[1], 1,
                                         self.weights_initialization)
        if self.add_bias:
            # initialize bias to zero (linear regression -- a single value)
            self.b = theano.shared(np.asarray(0, dtype=theano.config.floatX))
            py_x = T.dot(X, self.w) + self.b  # predictions for linear regression
        else:
            py_x = T.dot(X, self.w)
    else:
        # initialize weights for the parameters (logistic regression)
        self.w = self.initialize_weights((self.X_dat.shape[1], self.y_dat.shape[1]),
                                         self.X_dat.shape[1], self.y_dat.shape[1],
                                         self.weights_initialization)
        if self.add_bias:
            # initialize bias to zeros (logistic regression -- a numpy array)
            self.b = theano.shared(np.zeros((self.y_dat.shape[1],),
                                            dtype=theano.config.floatX))
            py_x = T.nnet.softmax(T.dot(X, self.w) + self.b)  # probability predictions
        else:
            py_x = T.nnet.softmax(T.dot(X, self.w))
    # objective function
    cost = T.mean(self.objectives(py_x, Y, self.objective, self.X_dat.shape[0]))
    # L1, L2 regularization [when both are used this is 'elastic-net']
    if self.L1 > 0.0 or self.L2 > 0.0:
        if self.add_bias:
            reg_param_L1 = abs(T.sum(self.w) + T.sum(self.b))           # L1 regularization
            reg_param_L2 = T.sum(T.sqr(self.w)) + T.sum(T.sqr(self.b))  # L2 regularization
        else:
            reg_param_L1 = abs(T.sum(self.w))    # L1 regularization
            reg_param_L2 = T.sum(T.sqr(self.w))  # L2 regularization
        cost = cost + self.L1 * reg_param_L1 + self.L2 * reg_param_L2
    if self.add_bias:
        Params = [self.w, self.b]
    else:
        Params = [self.w]
    if self.batch_size is None:
        # Compile [call the external Optimizers_update class]
        train = theano.function(
            inputs=[index], outputs=cost,
            updates=Optimizers_update(cost, Params, self.learning_rate,
                                      self.optimizer).run_optimizer(),
            givens={X: train_X[0:index], Y: train_y[0:index]},
            allow_input_downcast=True)
        predict_valid = theano.function(inputs=[index], outputs=py_x,
                                        givens={X: test_X[0:index]},
                                        allow_input_downcast=True)
    else:
        train = theano.function(
            inputs=[index], outputs=cost,
            updates=Optimizers_update(cost, Params, self.learning_rate,
                                      self.optimizer).run_optimizer(),
            givens={X: train_X[index * self.batch_size:(index + 1) * self.batch_size],
                    Y: train_y[index * self.batch_size:(index + 1) * self.batch_size]},
            allow_input_downcast=True)
        # prediction function for the validation set
        predict_valid = theano.function(
            inputs=[index], outputs=py_x,
            givens={X: test_X[index * self.batch_size:(index + 1) * self.batch_size]},
            allow_input_downcast=True)
    self.predict = theano.function(inputs=[X], outputs=py_x)  # predictions function
    early_stopping = []  # early-stopping history of validation scores
    consecutive_increases_OR_decreases = 0
    for i in range(self.iters):
        if self.batch_size is None:
            cost_train = train(self.X_dat.shape[0])
            if self.custom_eval is None:
                cost_valid = self.evaluate_early_stopping(
                    self.Y_test, self.predict(self.X_test), self.linear_regression)
            else:
                cost_valid = self.custom_eval[0](self.Y_test, self.predict(self.X_test))
        else:
            for batch_index_train in range(n_train_batches):
                cost_train = train(batch_index_train)
            if self.custom_eval is None:
                cost_valid = np.mean([
                    self.evaluate_early_stopping(
                        self.Y_test[batch_index_test * self.batch_size:
                                    (batch_index_test + 1) * self.batch_size],
                        predict_valid(batch_index_test), self.linear_regression)
                    for batch_index_test in range(n_test_batches)])
            else:
                cost_valid = np.mean([
                    self.custom_eval[0](
                        self.Y_test[batch_index_test * self.batch_size:
                                    (batch_index_test + 1) * self.batch_size],
                        predict_valid(batch_index_test))
                    for batch_index_test in range(n_test_batches)])
        try:
            if self.custom_eval is None:
                print 'iter', str(i + 1), ' train_loss ', str(np.round(cost_train, 3)), \
                    ' test_loss ', str(np.round(cost_valid, 3))
            else:
                print 'iter', str(i + 1), ' train_loss ', str(np.round(cost_train, 3)), \
                    ' test_' + self.custom_eval[1], ' ', str(np.round(cost_valid, 3))
        except ValueError:
            pass
        early_stopping.append(cost_valid)
        if not self.maximize:
            change_sign = (len(early_stopping) >= 2
                           and early_stopping[-1] > early_stopping[-2])
            direction = 'increases'
        else:
            change_sign = (len(early_stopping) >= 2
                           and early_stopping[-1] < early_stopping[-2])
            direction = 'decreases'
        if change_sign:
            consecutive_increases_OR_decreases += 1
        else:
            consecutive_increases_OR_decreases = 0
        if consecutive_increases_OR_decreases >= self.early_stopping_rounds:
            print 'regression stopped after', str(consecutive_increases_OR_decreases), \
                'consecutive', direction, 'of the validation loss and', \
                str(i + 1), 'epochs'
            break
        if np.isinf(cost_valid) or np.isnan(cost_valid):
            print 'Inf or nan values present after', str(i), 'epochs'
            break
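# --- Added illustration (not from the original code): the early-stopping rule
# --- used in fit() above, isolated as a pure-python helper for clarity.
def should_stop(valid_history, rounds, maximize=False):
    """Stop once the validation score worsens `rounds` times in a row."""
    consecutive = 0
    for prev, cur in zip(valid_history, valid_history[1:]):
        worse = (cur > prev) if not maximize else (cur < prev)
        consecutive = consecutive + 1 if worse else 0
        if consecutive >= rounds:
            return True
    return False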
def model_eval(get_scores): entityPairs = T.fmatrix() entities = T.fmatrix() relations = T.fmatrix() testData_DM = T.imatrix() testData_MF = T.imatrix() entity_oov_embedding = T.fvector() entityPair_oov_embedding = T.fvector() normalize_eval = T.iscalar() normalize = T.iscalar() ''' for a given (e1, ?): we can partition the filtered candidate e2s into: 1) e2s such that (e1,e2) is trained -> allowedEP_MF ''' allowedEP_MF = theano.typed_list.TypedListType(T.ivector)() set1_e2 = theano.typed_list.TypedListType(T.ivector)() set2_e2 = theano.typed_list.TypedListType(T.ivector)() set3_e2 = T.ivector() oov_flag_e1_DM = T.ivector() oov_flag_e2_DM = T.ivector() oov_flags_MF = T.ivector() nnet_W1 = T.fmatrix() nnet_W2 = T.fmatrix() nnet_W3 = T.fmatrix() nnet_b1 = T.fvector() nnet_b2 = T.fvector() nnet_b3 = T.fvector() aux_features = T.fmatrix() layers = [(nnet_W1, nnet_b1), (nnet_W2, nnet_b2), (nnet_W3, nnet_b3)] normalize_DM_W1 = T.fmatrix() normalize_DM_b1 = T.fvector() normalize_MF_W1 = T.fmatrix() normalize_MF_b1 = T.fvector() layers_normalize_DM = [(normalize_DM_W1, normalize_DM_b1)] layers_normalize_MF = [(normalize_MF_W1, normalize_MF_b1)] def MF_fn(testPoint_DM, testPoint_MF, i, oov_flag_e1, oov_flag_e2, oov_flag, entityPairs, entities, relations, entityPair_oov_embedding, entity_oov_embedding, allowed_entityPair, set1_e2, set2_e2, set3_e2, normalize_eval, normalize): # score of allowed e2s scores_MF = T.tanh( T.dot(entityPairs[allowed_entityPair[i]], relations[testPoint_MF[0]])) #scores_MF = T.dot(entityPairs[allowed_entityPair[i]], relations[testPoint_MF[0]]) # score for oov (e1,e2)s score_oov_MF = T.tanh( T.dot(entityPair_oov_embedding, relations[testPoint_MF[0]])) #score_oov_MF = T.dot(entityPair_oov_embedding,relations[testPoint_MF[0]]) score_nonOOV_MF = T.tanh( T.dot(entityPairs[testPoint_MF[1]], relations[testPoint_MF[0]])) #score_nonOOV_MF = T.dot(entityPairs[testPoint_MF[1]], relations[testPoint_MF[0]]) # based on whether (e1,e2) is OOV pick the score for the current testPoint score_testPoint_MF = T.switch(oov_flag, score_oov_MF, score_nonOOV_MF) e1_fact_embedding = T.switch(oov_flag_e1, entity_oov_embedding, entities[testPoint_DM[0]]) e2_fact_embedding = T.switch(oov_flag_e2, entity_oov_embedding, entities[testPoint_DM[2]]) # score of allowed e2s -> (e1,e2) seen -> e2 seen scores_DM = T.tanh( T.dot(e1_fact_embedding * entities[set1_e2[i]], relations[testPoint_DM[1]])) #scores_DM = T.dot(e1_fact_embedding*entities[set1_e2[i]], relations[testPoint_DM[1]]) # score for the test point score_testPoint_DM = T.tanh( T.dot(relations[testPoint_DM[1]], e1_fact_embedding * e2_fact_embedding)) #score_testPoint_DM = T.dot(relations[testPoint_DM[1]], e1_fact_embedding*e2_fact_embedding) score_oov_DM = T.tanh( T.dot(relations[testPoint_DM[1]], e1_fact_embedding * entity_oov_embedding)) #score_oov_DM = T.dot(relations[testPoint_DM[1]], e1_fact_embedding*entity_oov_embedding) # score for e2s such that (e1,e2) non seen but e2 non OOV. 
scores_DM_set2 = T.tanh( T.dot(e1_fact_embedding * entities[set2_e2[i]], relations[testPoint_DM[1]])) #scores_DM_set2 = T.dot(e1_fact_embedding*entities[set2_e2[i]], relations[testPoint_DM[1]]) #Normalize scores using pretrained weights scores_MF = T.switch( normalize, get_normalized_scores(layers_normalize_MF, scores_MF), scores_MF) score_testPoint_MF = T.switch( normalize, get_normalized_scores(layers_normalize_MF, score_testPoint_MF), score_testPoint_MF) score_oov_MF = T.switch( normalize, get_normalized_scores(layers_normalize_MF, score_oov_MF), score_oov_MF) scores_DM = T.switch( normalize, get_normalized_scores(layers_normalize_DM, scores_DM), scores_DM) score_testPoint_DM = T.switch( normalize, get_normalized_scores(layers_normalize_DM, score_testPoint_DM), score_testPoint_DM) score_oov_DM = T.switch( normalize, get_normalized_scores(layers_normalize_DM, score_oov_DM), score_oov_DM) scores_DM_set2 = T.switch( normalize, get_normalized_scores(layers_normalize_DM, scores_DM_set2), scores_DM_set2) #DM and MF score normalization mean_DM, std_DM = get_data_stats( T.concatenate([scores_DM, scores_DM_set2, T.stack([score_oov_DM])])) scores_DM = T.switch(normalize_eval, normalize_data(scores_DM, mean_DM, std_DM), scores_DM) mean_MF, std_MF = get_data_stats( T.concatenate([scores_MF, T.stack([score_oov_MF])])) scores_MF = T.switch(normalize_eval, normalize_data(scores_MF, mean_MF, std_MF), scores_MF) score_oov_DM = T.switch(normalize_eval, normalize_data(score_oov_DM, mean_DM, std_DM), score_oov_DM) score_oov_MF = T.switch(normalize_eval, normalize_data(score_oov_MF, mean_MF, std_MF), score_oov_MF) score_testPoint_MF = T.switch(normalize_eval, (score_testPoint_MF - mean_MF) / std_MF, score_testPoint_MF) score_testPoint_DM = T.switch(normalize_eval, (score_testPoint_DM - mean_DM) / std_DM, score_testPoint_DM) # score_testPoint, scores_set1, scores_set2, scores_set3, f1 = get_scores( layers, aux_features[i], [scores_MF, scores_DM], [T.stack(score_oov_MF), scores_DM_set2], [score_oov_MF, score_oov_DM], [score_testPoint_MF, score_testPoint_DM]) rank = 1 + T.sum(scores_set1 > score_testPoint) + T.sum( scores_set2 > score_testPoint) oov_comparison = score_testPoint < scores_set3 rank = T.switch(oov_comparison, rank + set3_e2[i], rank) rank = T.switch(oov_flag_e2, rank + (set3_e2[i] / 2.0), rank) same = T.sum(T.eq(scores_set1, score_testPoint)) + T.sum( T.eq(scores_set2, score_testPoint)) rank += same / 2.0 same = same / (scores_set1.shape[0] + scores_set2.shape[0] * 1.0) ''' dataStats = T.concatenate([get_data_stats(T.concatenate([scores_set1,scores_set2])), get_data_stats(scores_MF), get_data_stats(T.concatenate([scores_DM,scores_DM_set2]))]) oov_scores = T.stack([score_oov_MF, score_oov_DM]) return rank, f1, score_testPoint_DM, score_testPoint_MF, dataStats, oov_scores ''' return rank, f1, score_testPoint_DM, score_testPoint_MF, same * 100.0 ranks, ignore = theano.scan(MF_fn, non_sequences=[ entityPairs, entities, relations, entityPair_oov_embedding, entity_oov_embedding, allowedEP_MF, set1_e2, set2_e2, set3_e2, normalize_eval, normalize ], sequences=[ testData_DM, testData_MF, theano.tensor.arange(testData_DM.shape[0]), oov_flag_e1_DM, oov_flag_e2_DM, oov_flags_MF ]) f = theano.function([ normalize_eval, normalize, entityPairs, entities, relations, entityPair_oov_embedding, entity_oov_embedding, testData_DM, testData_MF, allowedEP_MF, set1_e2, set2_e2, oov_flag_e1_DM, oov_flag_e2_DM, oov_flags_MF, set3_e2, aux_features, nnet_W1, nnet_b1, nnet_W2, nnet_b2, nnet_W3, nnet_b3, normalize_DM_W1, 
normalize_DM_b1, normalize_MF_W1, normalize_MF_b1 ], ranks, allow_input_downcast=True) return f
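# --- Added illustration (not from the original code): a hedged usage sketch.
# --- The compiled function returns, per test triple, (rank, f1, DM score,
# --- MF score, tie fraction); standard link-prediction metrics follow from
# --- the rank column. `results` stands for a hypothetical call to f(...).
import numpy

ranks = numpy.asarray(results[0], dtype='float64')
mrr = numpy.mean(1.0 / ranks)
hits_at_10 = numpy.mean(ranks <= 10)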
def main(args): theano.optimizer = 'fast_compile' #theano.config.exception_verbosity='high' trial = int(args['trial']) pkl_name = 'vrnn_gmm_%d' % trial channel_name = 'nll_upper_bound' data_path = args['data_path'] save_path = args[ 'save_path'] #+'/aggVSdisag_distrib/'+datetime.datetime.now().strftime("%y-%m-%d_%H-%M") period = int(args['period']) n_steps = int(args['n_steps']) stride_train = int(args['stride_train']) stride_test = n_steps typeLoad = int(args['typeLoad']) flgMSE = int(args['flgMSE']) monitoring_freq = int(args['monitoring_freq']) epoch = int(args['epoch']) batch_size = int(args['batch_size']) x_dim = int(args['x_dim']) y_dim = int(args['y_dim']) z_dim = int(args['z_dim']) rnn_dim = int(args['rnn_dim']) k = int(args['num_k']) #a mixture of K Gaussian functions lr = float(args['lr']) origLR = lr debug = int(args['debug']) print "trial no. %d" % trial print "batch size %d" % batch_size print "learning rate %f" % lr print "saving pkl file '%s'" % pkl_name print "to the save path '%s'" % save_path q_z_dim = 350 p_z_dim = 400 p_x_dim = 450 x2s_dim = 400 y2s_dim = 200 z2s_dim = 350 target_dim = k # As different appliances are separeted in theta_mu1, theta_mu2, etc... each one is just created from k different Gaussians model = Model() Xtrain, ytrain, Xval, yval, Xtest, ytest, reader = fetch_ukdale( data_path, windows, appliances, numApps=-1, period=period, n_steps=n_steps, stride_train=stride_train, stride_test=stride_test, flgAggSumScaled=1, flgFilterZeros=1, typeLoad=typeLoad, trainPer=0.5, valPer=0.25, testPer=0.25) instancesPlot = {0: [5]} #instancesPlot = reader.build_dict_instances_plot(listDates, batch_size, Xval.shape[0]) train_data = UKdale( name='train', prep='normalize', cond=True, # False #path=data_path, inputX=Xtrain, labels=ytrain) X_mean = train_data.X_mean X_std = train_data.X_std valid_data = UKdale( name='valid', prep='normalize', cond=True, # False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xval, labels=yval) test_data = UKdale( name='valid', prep='normalize', cond=True, # False #path=data_path, X_mean=X_mean, X_std=X_std, inputX=Xtest, labels=ytest) init_W = InitCell('rand') init_U = InitCell('ortho') init_b = InitCell('zeros') init_b_sig = InitCell('const', mean=0.6) x, mask, y, y_mask = train_data.theano_vars() scheduleSamplingMask = T.fvector('schedMask') x.name = 'x_original' if debug: x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=np.float32) temp = np.ones((15, batch_size), dtype=np.float32) temp[:, -2:] = 0. 
mask.tag.test_value = temp x_1 = FullyConnectedLayer(name='x_1', parent=['x_t'], parent_dim=[x_dim], nout=x2s_dim, unit='relu', init_W=init_W, init_b=init_b) y_1 = FullyConnectedLayer(name='y_1', parent=['y_t'], parent_dim=[y_dim], nout=y2s_dim, unit='relu', init_W=init_W, init_b=init_b) z_1 = FullyConnectedLayer(name='z_1', parent=['z_t'], parent_dim=[z_dim], nout=z2s_dim, unit='relu', init_W=init_W, init_b=init_b) rnn = LSTM(name='rnn', parent=['x_1', 'z_1', 'y_1'], parent_dim=[x2s_dim, z2s_dim, y2s_dim], nout=rnn_dim, unit='tanh', init_W=init_W, init_U=init_U, init_b=init_b) phi_1 = FullyConnectedLayer(name='phi_1', parent=['x_1', 's_tm1', 'y_1'], parent_dim=[x2s_dim, rnn_dim, y2s_dim], nout=q_z_dim, unit='relu', init_W=init_W, init_b=init_b) phi_mu = FullyConnectedLayer(name='phi_mu', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) phi_sig = FullyConnectedLayer(name='phi_sig', parent=['phi_1'], parent_dim=[q_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) prior_1 = FullyConnectedLayer(name='prior_1', parent=['x_1', 's_tm1'], parent_dim=[x2s_dim, rnn_dim], nout=p_z_dim, unit='relu', init_W=init_W, init_b=init_b) prior_mu = FullyConnectedLayer(name='prior_mu', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='linear', init_W=init_W, init_b=init_b) prior_sig = FullyConnectedLayer(name='prior_sig', parent=['prior_1'], parent_dim=[p_z_dim], nout=z_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_1 = FullyConnectedLayer(name='theta_1', parent=['z_1', 's_tm1'], parent_dim=[z2s_dim, rnn_dim], nout=p_x_dim, unit='relu', init_W=init_W, init_b=init_b) theta_mu1 = FullyConnectedLayer(name='theta_mu1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu2 = FullyConnectedLayer(name='theta_mu2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu3 = FullyConnectedLayer(name='theta_mu3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu4 = FullyConnectedLayer(name='theta_mu4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_mu5 = FullyConnectedLayer(name='theta_mu5', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='linear', init_W=init_W, init_b=init_b) theta_sig1 = FullyConnectedLayer(name='theta_sig1', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig2 = FullyConnectedLayer(name='theta_sig2', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig3 = FullyConnectedLayer(name='theta_sig3', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig4 = FullyConnectedLayer(name='theta_sig4', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) theta_sig5 = FullyConnectedLayer(name='theta_sig5', parent=['theta_1'], parent_dim=[p_x_dim], nout=target_dim, unit='softplus', cons=1e-4, init_W=init_W, init_b=init_b_sig) coeff1 = FullyConnectedLayer(name='coeff1', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff2 = FullyConnectedLayer(name='coeff2', parent=['theta_1'], parent_dim=[p_x_dim], 
nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff3 = FullyConnectedLayer(name='coeff3', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff4 = FullyConnectedLayer(name='coeff4', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) coeff5 = FullyConnectedLayer(name='coeff5', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='softmax', init_W=init_W, init_b=init_b) corr = FullyConnectedLayer(name='corr', parent=['theta_1'], parent_dim=[p_x_dim], nout=k, unit='tanh', init_W=init_W, init_b=init_b) binary = FullyConnectedLayer(name='binary', parent=['theta_1'], parent_dim=[p_x_dim], nout=1, unit='sigmoid', init_W=init_W, init_b=init_b) nodes = [ rnn, x_1, y_1, z_1, #dissag_pred, phi_1, phi_mu, phi_sig, prior_1, prior_mu, prior_sig, theta_1, theta_mu1, theta_sig1, coeff1, theta_mu2, theta_sig2, coeff2, theta_mu3, theta_sig3, coeff3, theta_mu4, theta_sig4, coeff4, theta_mu5, theta_sig5, coeff5 ] params = OrderedDict() for node in nodes: if node.initialize() is not None: params.update(node.initialize()) params = init_tparams(params) s_0 = rnn.get_init_state(batch_size) x_1_temp = x_1.fprop([x], params) y_1_temp = y_1.fprop([y], params) def inner_fn_test(x_t, s_tm1): prior_1_t = prior_1.fprop([x_t, s_tm1], params) prior_mu_t = prior_mu.fprop([prior_1_t], params) prior_sig_t = prior_sig.fprop([prior_1_t], params) z_t = Gaussian_sample( prior_mu_t, prior_sig_t ) #in the original code it is gaussian. GMM is for the generation z_1_t = z_1.fprop([z_t], params) theta_1_t = theta_1.fprop([z_1_t, s_tm1], params) theta_mu1_t = theta_mu1.fprop([theta_1_t], params) theta_sig1_t = theta_sig1.fprop([theta_1_t], params) coeff1_t = coeff1.fprop([theta_1_t], params) y_pred1 = GMM_sampleY( theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t) theta_mu2_t = theta_mu2.fprop([theta_1_t], params) theta_sig2_t = theta_sig2.fprop([theta_1_t], params) coeff2_t = coeff2.fprop([theta_1_t], params) y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t) y_pred1 = T.concatenate([y_pred1, y_pred2], axis=1) theta_mu3_t = theta_mu3.fprop([theta_1_t], params) theta_sig3_t = theta_sig3.fprop([theta_1_t], params) coeff3_t = coeff3.fprop([theta_1_t], params) y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t) y_pred1 = T.concatenate([y_pred1, y_pred3], axis=1) theta_mu4_t = theta_mu4.fprop([theta_1_t], params) theta_sig4_t = theta_sig4.fprop([theta_1_t], params) coeff4_t = coeff4.fprop([theta_1_t], params) y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t) y_pred1 = T.concatenate([y_pred1, y_pred4], axis=1) theta_mu5_t = theta_mu5.fprop([theta_1_t], params) theta_sig5_t = theta_sig5.fprop([theta_1_t], params) coeff5_t = coeff5.fprop([theta_1_t], params) y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t) y_pred1 = T.concatenate([y_pred1, y_pred5], axis=1) pred_1_t = y_1.fprop([y_pred1], params) #y_pred = [GMM_sampleY(theta_mu_t[i], theta_sig_t[i], coeff_t[i]) for i in range(y_dim)]#T.stack([y_pred1,y_pred2],axis = 0 ) s_t = rnn.fprop([[x_t, z_1_t, pred_1_t], [s_tm1]], params) #y_pred = dissag_pred.fprop([s_t], params) return s_t, prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1, theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2, theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3, theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4, theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5 #corr_temp, binary_temp ((s_temp_val, prior_mu_temp_val, prior_sig_temp_val, theta_mu1_temp_val, 
theta_sig1_temp_val, coeff1_temp_val, y_pred1_temp_val, theta_mu2_temp_val, theta_sig2_temp_val, coeff2_temp_val, y_pred2_temp_val, theta_mu3_temp_val, theta_sig3_temp_val, coeff3_temp_val, y_pred3_temp_val, theta_mu4_temp_val, theta_sig4_temp_val, coeff4_temp_val, y_pred4_temp_val, theta_mu5_temp_val, theta_sig5_temp_val, coeff5_temp_val, y_pred5_temp_val), updates_val) = theano.scan(fn=inner_fn_test, sequences=[x_1_temp], outputs_info=[ s_0, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None ]) for k, v in updates_val.iteritems(): k.default_update = v def inner_fn(x_t, y_t, s_tm1): phi_1_t = phi_1.fprop([x_t, s_tm1, y_t], params) phi_mu_t = phi_mu.fprop([phi_1_t], params) phi_sig_t = phi_sig.fprop([phi_1_t], params) prior_1_t = prior_1.fprop([x_t, s_tm1], params) prior_mu_t = prior_mu.fprop([prior_1_t], params) prior_sig_t = prior_sig.fprop([prior_1_t], params) z_t = Gaussian_sample( phi_mu_t, phi_sig_t ) #in the original code it is gaussian. GMM is for the generation z_1_t = z_1.fprop([z_t], params) theta_1_t = theta_1.fprop([z_1_t, s_tm1], params) theta_mu1_t = theta_mu1.fprop([theta_1_t], params) theta_sig1_t = theta_sig1.fprop([theta_1_t], params) coeff1_t = coeff1.fprop([theta_1_t], params) y_pred1 = GMM_sampleY( theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t) theta_mu2_t = theta_mu2.fprop([theta_1_t], params) theta_sig2_t = theta_sig2.fprop([theta_1_t], params) coeff2_t = coeff2.fprop([theta_1_t], params) y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t) theta_mu3_t = theta_mu3.fprop([theta_1_t], params) theta_sig3_t = theta_sig3.fprop([theta_1_t], params) coeff3_t = coeff3.fprop([theta_1_t], params) y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t) theta_mu4_t = theta_mu4.fprop([theta_1_t], params) theta_sig4_t = theta_sig4.fprop([theta_1_t], params) coeff4_t = coeff4.fprop([theta_1_t], params) y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t) theta_mu5_t = theta_mu5.fprop([theta_1_t], params) theta_sig5_t = theta_sig5.fprop([theta_1_t], params) coeff5_t = coeff5.fprop([theta_1_t], params) y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t) s_t = rnn.fprop([[x_t, z_1_t, y_t], [s_tm1]], params) return s_t, phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1, theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2, theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3, theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4, theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5 #corr_temp, binary_temp ((s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp, theta_mu1_temp, theta_sig1_temp, coeff1_temp, y_pred1_temp, theta_mu2_temp, theta_sig2_temp, coeff2_temp, y_pred2_temp, theta_mu3_temp, theta_sig3_temp, coeff3_temp, y_pred3_temp, theta_mu4_temp, theta_sig4_temp, coeff4_temp, y_pred4_temp, theta_mu5_temp, theta_sig5_temp, coeff5_temp, y_pred5_temp), updates) = theano.scan(fn=inner_fn, sequences=[x_1_temp, y_1_temp], outputs_info=[ s_0, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None ]) for k, v in updates.iteritems(): k.default_update = v theta_mu1_temp.name = 'theta_mu1' theta_sig1_temp.name = 'theta_sig1' coeff1_temp.name = 'coeff1' y_pred1_temp.name = 'disaggregation1' #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1) mse1 = T.mean((y_pred1_temp - y[:, :, 0].reshape( (y.shape[0], y.shape[1], 1)))**2) mae1 = T.mean( 
T.abs_(y_pred1_temp - y[:, :, 0].reshape((y.shape[0], y.shape[1], 1)))) mse1.name = 'mse1' mae1.name = 'mae1' kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) x_shape = x.shape y_shape = y.shape theta_mu2_temp.name = 'theta_mu2' theta_sig2_temp.name = 'theta_sig2' coeff2_temp.name = 'coeff2' y_pred2_temp.name = 'disaggregation2' mse2 = T.mean((y_pred2_temp - y[:, :, 1].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae2 = T.mean( T.abs_(y_pred2_temp - y[:, :, 1].reshape((y.shape[0], y.shape[1], 1)))) mse2.name = 'mse2' mae2.name = 'mae2' theta_mu3_temp.name = 'theta_mu3' theta_sig3_temp.name = 'theta_sig3' coeff3_temp.name = 'coeff3' y_pred3_temp.name = 'disaggregation3' mse3 = T.mean((y_pred3_temp - y[:, :, 2].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae3 = T.mean( T.abs_(y_pred3_temp - y[:, :, 2].reshape((y.shape[0], y.shape[1], 1)))) mse3.name = 'mse3' mae3.name = 'mae3' theta_mu4_temp.name = 'theta_mu4' theta_sig4_temp.name = 'theta_sig4' coeff4_temp.name = 'coeff4' y_pred4_temp.name = 'disaggregation4' mse4 = T.mean((y_pred4_temp - y[:, :, 3].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae4 = T.mean( T.abs_(y_pred4_temp - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1)))) mse4.name = 'mse4' mae4.name = 'mae4' theta_mu5_temp.name = 'theta_mu5' theta_sig5_temp.name = 'theta_sig5' coeff5_temp.name = 'coeff5' y_pred5_temp.name = 'disaggregation5' mse5 = T.mean((y_pred5_temp - y[:, :, 4].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae5 = T.mean( T.abs_(y_pred5_temp - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1)))) mse5.name = 'mse5' mae5.name = 'mae5' kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp) theta_mu1_in = theta_mu1_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig1_in = theta_sig1_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff1_in = coeff1_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_mu2_in = theta_mu2_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig2_in = theta_sig2_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff2_in = coeff2_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_mu3_in = theta_mu3_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig3_in = theta_sig3_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff3_in = coeff3_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_mu4_in = theta_mu4_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig4_in = theta_sig4_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff4_in = coeff4_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_mu5_in = theta_mu5_temp.reshape((x_shape[0] * x_shape[1], -1)) theta_sig5_in = theta_sig5_temp.reshape((x_shape[0] * x_shape[1], -1)) coeff5_in = coeff5_temp.reshape((x_shape[0] * x_shape[1], -1)) x_shape = x.shape y_shape = y.shape #x_in = x.reshape((x_shape[0]*x_shape[1], -1)) y_in = y.reshape((y_shape[0] * y_shape[1], -1)) recon = GMMdisagMulti(y_dim, y_in, theta_mu1_in, theta_sig1_in, coeff1_in, theta_mu2_in, theta_sig2_in, coeff2_in, theta_mu3_in, theta_sig3_in, coeff3_in, theta_mu4_in, theta_sig4_in, coeff4_in, theta_mu5_in, theta_sig5_in, coeff5_in) #recon = GMMdisagMulti(y_dim, y_in, theta_mu1_in, theta_sig1_in, coeff1_in, theta_mu2_in, theta_sig2_in, coeff2_in,theta_mu3_in, theta_sig3_in, coeff3_in,theta_mu4_in, theta_sig4_in, coeff4_in,theta_mu5_in, theta_sig5_in, coeff5_in) recon = recon.reshape((x_shape[0], x_shape[1])) recon.name = 'gmm_out' ''' recon5 = 
GMM(y_in[:,4, None], theta_mu5_in, theta_sig5_in, coeff5_in) recon5 = recon.reshape((x_shape[0], x_shape[1])) ''' recon_term = recon.sum(axis=0).mean() recon_term = recon.sum(axis=0).mean() recon_term.name = 'recon_term' kl_term = kl_temp.sum(axis=0).mean() kl_term.name = 'kl_term' nll_upper_bound = recon_term + kl_term nll_upper_bound.name = 'nll_upper_bound' ######################## TEST (GENERATION) TIME #s_temp_val = concatenate([s_0[None, :, :], s_temp_val[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0 theta_mu1_temp_val.name = 'theta_mu1_val' theta_sig1_temp_val.name = 'theta_sig1_val' coeff1_temp_val.name = 'coeff1_val' y_pred1_temp_val.name = 'disaggregation1_val' #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1) mse1_val = T.mean((y_pred1_temp_val - y[:, :, 0].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae1_val = T.mean( T.abs_(y_pred1_temp_val - y[:, :, 0].reshape((y.shape[0], y.shape[1], 1)))) #NEURALNILM #(sum_output - sum_target) / max(sum_output, sum_target)) totPred = T.sum(y_pred1_temp_val) totReal = T.sum(y[:, :, 0]) relErr1_val = (totPred - totReal) / T.maximum(totPred, totReal) propAssigned1_val = 1 - T.sum( T.abs_(y_pred1_temp_val - y[:, :, 0].reshape( (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x)) #y_unNormalize = (y[:,:,0] * reader.stdTraining[0]) + reader.meanTraining[0] #y_pred1_temp_val = (y_pred1_temp_val * reader.stdTraining[0]) + reader.meanTraining[0] #mse1_valUnNorm = T.mean((y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all #mae1_valUnNorm = T.mean( T.abs_(y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1)))) mse1_val.name = 'mse1_val' mae1_val.name = 'mae1_val' theta_mu1_in_val = theta_mu1_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) theta_sig1_in_val = theta_sig1_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) coeff1_in_val = coeff1_temp_val.reshape((x_shape[0] * x_shape[1], -1)) theta_mu2_temp_val.name = 'theta_mu2_val' theta_sig2_temp_val.name = 'theta_sig2_val' coeff2_temp_val.name = 'coeff2_val' y_pred2_temp_val.name = 'disaggregation2_val' mse2_val = T.mean((y_pred2_temp_val - y[:, :, 1].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae2_val = T.mean( T.abs_(y_pred2_temp_val - y[:, :, 1].reshape((y.shape[0], y.shape[1], 1)))) totPred = T.sum(y_pred2_temp_val) totReal = T.sum(y[:, :, 1]) relErr2_val = (totPred - totReal) / T.maximum(totPred, totReal) propAssigned2_val = 1 - T.sum( T.abs_(y_pred2_temp_val - y[:, :, 1].reshape( (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x)) mse2_val.name = 'mse2_val' mae2_val.name = 'mae2_val' theta_mu2_in_val = theta_mu2_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) theta_sig2_in_val = theta_sig2_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) coeff2_in_val = coeff2_temp_val.reshape((x_shape[0] * x_shape[1], -1)) theta_mu3_temp_val.name = 'theta_mu3_val' theta_sig3_temp_val.name = 'theta_sig3_val' coeff3_temp_val.name = 'coeff3_val' y_pred3_temp_val.name = 'disaggregation3_val' mse3_val = T.mean((y_pred3_temp_val - y[:, :, 2].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae3_val = T.mean( T.abs_(y_pred3_temp_val - y[:, :, 2].reshape((y.shape[0], y.shape[1], 1)))) totPred = T.sum(y_pred3_temp_val) totReal = T.sum(y[:, :, 2]) relErr3_val = (totPred - totReal) / T.maximum(totPred, totReal) propAssigned3_val = 1 - T.sum( T.abs_(y_pred3_temp_val - y[:, :, 2].reshape( (y.shape[0], y.shape[1], 1)))) / 
(2 * T.sum(x)) mse3_val.name = 'mse3_val' mae3_val.name = 'mae3_val' theta_mu3_in_val = theta_mu3_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) theta_sig3_in_val = theta_sig3_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) coeff3_in_val = coeff3_temp_val.reshape((x_shape[0] * x_shape[1], -1)) theta_mu4_temp_val.name = 'theta_mu4_val' theta_sig4_temp_val.name = 'theta_sig4_val' coeff4_temp_val.name = 'coeff4_val' y_pred4_temp_val.name = 'disaggregation4_val' mse4_val = T.mean((y_pred4_temp_val - y[:, :, 3].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae4_val = T.mean( T.abs_(y_pred4_temp_val - y[:, :, 3].reshape((y.shape[0], y.shape[1], 1)))) totPred = T.sum(y_pred4_temp_val) totReal = T.sum(y[:, :, 3]) relErr4_val = (totPred - totReal) / T.maximum(totPred, totReal) propAssigned4_val = 1 - T.sum( T.abs_(y_pred4_temp_val - y[:, :, 3].reshape( (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x)) mse4_val.name = 'mse4_val' mae4_val.name = 'mae4_val' theta_mu4_in_val = theta_mu4_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) theta_sig4_in_val = theta_sig4_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) coeff4_in_val = coeff4_temp_val.reshape((x_shape[0] * x_shape[1], -1)) theta_mu5_temp_val.name = 'theta_mu5_val' theta_sig5_temp_val.name = 'theta_sig5_val' coeff5_temp_val.name = 'coeff5_val' y_pred5_temp_val.name = 'disaggregation5_val' mse5_val = T.mean((y_pred5_temp_val - y[:, :, 4].reshape( (y.shape[0], y.shape[1], 1)))**2) # As axis = None is calculated for all mae5_val = T.mean( T.abs_(y_pred5_temp_val - y[:, :, 4].reshape((y.shape[0], y.shape[1], 1)))) totPred = T.sum(y_pred5_temp_val) totReal = T.sum(y[:, :, 4]) relErr5_val = (totPred - totReal) / T.maximum(totPred, totReal) propAssigned5_val = 1 - T.sum( T.abs_(y_pred5_temp_val - y[:, :, 4].reshape( (y.shape[0], y.shape[1], 1)))) / (2 * T.sum(x)) mse5_val.name = 'mse5_val' mae5_val.name = 'mae5_val' theta_mu5_in_val = theta_mu5_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) theta_sig5_in_val = theta_sig5_temp_val.reshape( (x_shape[0] * x_shape[1], -1)) coeff5_in_val = coeff5_temp_val.reshape((x_shape[0] * x_shape[1], -1)) prediction_val = T.concatenate([ y_pred1_temp_val, y_pred2_temp_val, y_pred3_temp_val, y_pred4_temp_val, y_pred5_temp_val ], axis=2) recon_val = GMMdisagMulti( y_dim, y_in, theta_mu1_in_val, theta_sig1_in_val, coeff1_in_val, theta_mu2_in_val, theta_sig2_in_val, coeff2_in_val, theta_mu3_in_val, theta_sig3_in_val, coeff3_in_val, theta_mu4_in_val, theta_sig4_in_val, coeff4_in_val, theta_mu5_in_val, theta_sig5_in_val, coeff5_in_val) recon_val = recon_val.reshape((x_shape[0], x_shape[1])) recon_val.name = 'gmm_out' totaMSE_val = (mse1_val + mse2_val + mse3_val + mse4_val + mse5_val) / y_dim totaMAE_val = (mae1_val + mae2_val + mae3_val + mae4_val + mae5_val) / y_dim ''' recon5 = GMM(y_in[:,4, None], theta_mu5_in, theta_sig5_in, coeff5_in) recon5 = recon.reshape((x_shape[0], x_shape[1])) ''' recon_term_val = recon_val.sum(axis=0).mean() recon_term_val = recon_val.sum(axis=0).mean() recon_term_val.name = 'recon_term' ###################### model.inputs = [x, mask, y, y_mask, scheduleSamplingMask] model.params = params model.nodes = nodes optimizer = Adam(lr=lr) header = "epoch,log,kl,nll_upper_bound,mse,mae\n" extension = [ GradientClipping(batch_size=batch_size), EpochCount(epoch, save_path, header), Monitoring( freq=monitoring_freq, ddout=[ nll_upper_bound, recon_term, kl_term, mse1, mae1, mse2, mae2, mse3, mae3, mse4, mae4, mse5, mae5, y_pred1_temp, y_pred2_temp, y_pred3_temp, 
y_pred4_temp, y_pred5_temp ], indexSep=13, indexDDoutPlot=[13], # adding indexes of ddout for the plotting #, (6,y_pred_temp) instancesPlot=instancesPlot, #0-150 data=[Iterator(valid_data, batch_size)], savedFolder=save_path), Picklize(freq=monitoring_freq, path=save_path), EarlyStopping(freq=monitoring_freq, path=save_path, channel=channel_name), WeightNorm() ] lr_iterations = {0: lr} mainloop = Training(name=pkl_name, data=Iterator(train_data, batch_size), model=model, optimizer=optimizer, cost=nll_upper_bound, outputs=[nll_upper_bound], n_steps=n_steps, extension=extension, lr_iterations=lr_iterations, k_speedOfconvergence=30) mainloop.run() data = Iterator(test_data, batch_size) test_fn = theano.function( inputs=[x, y], #[x, y], #givens={x:Xtest}, #on_unused_input='ignore', #z=( ,200,1) allow_input_downcast=True, outputs=[ prediction_val, recon_term_val, totaMSE_val, totaMAE_val, mse1_val, mse2_val, mse3_val, mse4_val, mse5_val, mae1_val, mae2_val, mae3_val, mae4_val, mae5_val, relErr1_val, relErr2_val, relErr3_val, relErr4_val, relErr5_val, propAssigned1_val, propAssigned2_val, propAssigned3_val, propAssigned4_val, propAssigned5_val ] #prediction_val, mse_val, mae_val , updates= updates_val #, allow_input_downcast=True, on_unused_input='ignore' ) testOutput = [] testMetrics2 = [] numBatchTest = 0 for batch in data: outputGeneration = test_fn(batch[0], batch[2]) testOutput.append(outputGeneration[1:14]) testMetrics2.append(outputGeneration[14:]) #{0:[4,20], 2:[5,10]} #if (numBatchTest==0): plt.figure(1) plt.plot(np.transpose(outputGeneration[0], [1, 0, 2])[4]) #ORIGINAL 1,0,2 plt.savefig(save_path + "/vrnn_dis_generated{}_Pred_0-4".format(numBatchTest)) plt.clf() plt.figure(2) plt.plot(np.transpose(batch[2], [1, 0, 2])[4]) plt.savefig(save_path + "/vrnn_dis_generated{}_RealDisag_0-4".format(numBatchTest)) plt.clf() plt.figure(3) plt.plot(np.transpose(batch[0], [1, 0, 2])[4]) #ORIGINAL 1,0,2 plt.savefig(save_path + "/vrnn_dis_generated{}_Realagg_0-4".format(numBatchTest)) plt.clf() numBatchTest += 1 testOutput = np.asarray(testOutput) testMetrics2 = np.asarray(testMetrics2) print(testOutput.shape) print(testMetrics2.shape) recon_test = testOutput[:, 0].mean() mse_test = testOutput[:, 1].mean() mae_test = testOutput[:, 2].mean() mse1_test = testOutput[:, 3].mean() mae1_test = testOutput[:, 8].mean() mse2_test = testOutput[:, 4].mean() mae2_test = testOutput[:, 9].mean() mse3_test = testOutput[:, 5].mean() mae3_test = testOutput[:, 10].mean() mse4_test = testOutput[:, 6].mean() mae4_test = testOutput[:, 11].mean() mse5_test = testOutput[:, 7].mean() mae5_test = testOutput[:, 12].mean() relErr1_test = testMetrics2[:, 0].mean() relErr2_test = testMetrics2[:, 1].mean() relErr3_test = testMetrics2[:, 2].mean() relErr4_test = testMetrics2[:, 3].mean() relErr5_test = testMetrics2[:, 4].mean() propAssigned1_test = testMetrics2[:, 5].mean() propAssigned2_test = testMetrics2[:, 6].mean() propAssigned3_test = testMetrics2[:, 7].mean() propAssigned4_test = testMetrics2[:, 8].mean() propAssigned5_test = testMetrics2[:, 9].mean() fLog = open(save_path + '/output.csv', 'w') fLog.write(str(lr_iterations) + "\n") fLog.write(str(appliances) + "\n") fLog.write(str(windows) + "\n") fLog.write( "logTest,mse1_test,mse2_test,mse3_test,mse4_test,mse5_test,mae1_test,mae2_test,mae3_test,mae4_test,mae5_test,mseTest,maeTest\n" ) fLog.write("{},{},{},{},{},{},{},{},{},{},{},{},{}\n\n".format( recon_test, mse1_test, mse2_test, mse3_test, mse4_test, mse5_test, mae1_test, mae2_test, mae3_test, mae4_test, mae5_test, 
mse_test, mae_test)) fLog.write( "relErr1,relErr2,relErr3,relErr4,relErr5,propAssigned1,propAssigned2,propAssigned3,propAssigned4,propAssigned5\n" ) fLog.write("{},{},{},{},{},{},{},{},{},{}\n".format( relErr1_test, relErr2_test, relErr3_test, relErr4_test, relErr5_test, propAssigned1_test, propAssigned2_test, propAssigned3_test, propAssigned4_test, propAssigned5_test)) fLog.write("q_z_dim,p_z_dim,p_x_dim,x2s_dim,y2s_dim,z2s_dim\n") fLog.write("{},{},{},{},{},{}\n".format(q_z_dim, p_z_dim, p_x_dim, x2s_dim, y2s_dim, z2s_dim)) fLog.write( "epoch,log,kl,mse1,mse2,mse3,mse4,mse5,mae1,mae2,mae3,mae4,mae5\n") for i, item in enumerate(mainloop.trainlog.monitor['nll_upper_bound']): d, e, f, g, j, k, l, m = 0, 0, 0, 0, 0, 0, 0, 0 ep = mainloop.trainlog.monitor['epoch'][i] a = mainloop.trainlog.monitor['recon_term'][i] b = mainloop.trainlog.monitor['kl_term'][i] c = mainloop.trainlog.monitor['mse1'][i] h = mainloop.trainlog.monitor['mae1'][i] d = mainloop.trainlog.monitor['mse2'][i] j = mainloop.trainlog.monitor['mae2'][i] e = mainloop.trainlog.monitor['mse3'][i] k = mainloop.trainlog.monitor['mae3'][i] f = mainloop.trainlog.monitor['mse4'][i] l = mainloop.trainlog.monitor['mae4'][i] g = mainloop.trainlog.monitor['mse5'][i] m = mainloop.trainlog.monitor['mae5'][i] fLog.write( "{:d},{:.2f},{:.2f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n" .format(ep, a, b, c, d, e, f, g, h, j, k, l, m)) f = open(save_path + '/outputRealGeneration.pkl', 'wb') pickle.dump(outputGeneration, f, -1) f.close()
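# --- Added reference (not from the original code): the closed-form KL between
# --- two diagonal Gaussians that KLGaussianGaussian above is expected to
# --- compute in VRNN-style models; written out only to make the kl_term
# --- explicit. This is a sketch, not the project's implementation.
import theano.tensor as T

def kl_gaussian_gaussian(mu_q, sig_q, mu_p, sig_p):
    # KL( N(mu_q, sig_q^2) || N(mu_p, sig_p^2) ), summed over the last axis
    return 0.5 * T.sum(2 * T.log(sig_p) - 2 * T.log(sig_q)
                       + (T.sqr(sig_q) + T.sqr(mu_q - mu_p)) / T.sqr(sig_p)
                       - 1.0, axis=-1)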
def __init__(self, glimpse_shape, glimpse_times, dim_hidden, dim_fc, dim_out,
             reward_base, rng_std=1.0, activation=T.tanh, bptt_truncate=-1,
             lmbd=0.1):  # gdupdate + lmbd*rlupdate
    if reward_base is None:
        # a reward is only given at the last glimpse
        reward_base = np.zeros((glimpse_times)).astype('float32')
        reward_base[-1] = 1.0
    x = T.ftensor3('x')  # N * W * H
    y = T.ivector('y')   # label
    lr = T.fscalar('lr')
    reward_base = theano.shared(name='reward_base',
                                value=np.array(reward_base).astype(theano.config.floatX),
                                borrow=True)  # Time (vector)
    reward_bias = T.fvector('reward_bias')
    rng = MRG_RandomStreams(np.random.randint(9999999))

    i = InputLayer(x)
    au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng, rng_std,
                       activation, bptt_truncate)
    # Only the last hidden state feeds the classifier; to decode from all
    # hidden states instead, flatten au.output over time and set
    # dim_fc = [glimpse_times * dim_hidden] + dim_fc + [dim_out].
    layers = [i, au, InputLayer(au.output[:, -1, :])]
    dim_fc = [dim_hidden] + dim_fc + [dim_out]
    for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
        fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
        layers.append(fc)
    sm = SoftmaxLayer(layers[-1].output)
    layers.append(sm)

    output = sm.output      # N * classes
    hidoutput = au.output   # N * dim_output
    location = au.location  # N * T * dim_hidden
    prediction = output.argmax(1)  # N

    # calc
    equalvec = T.eq(prediction, y)  # [0, 1, 0, 0, 1 ...]
    correct = T.cast(T.sum(equalvec), 'float32')
    logLoss = T.log(output)[T.arange(y.shape[0]), y]
    # baseline-subtracted reward (R_t - b_t), where b = E[R]; N * Time
    reward_biased = T.outer(equalvec, reward_base) - reward_bias.dimshuffle('x', 0)

    # gradient descent on the classification log-likelihood
    gdobjective = logLoss.sum() / x.shape[0]
    gdparams = reduce(lambda x, y: x + y.params, layers, [])
    gdupdates = map(lambda x: (x, x + lr * T.grad(gdobjective, x)), gdparams)

    # reinforcement learning on the glimpse locations
    # location_p: N * Time * 2, reward_biased: N * Time
    rlobjective = (reward_biased.dimshuffle(0, 1, 'x') *
                   T.log(au.location_p)).sum() / x.shape[0]
    rlparams = au.reinforceParams
    rlupdates = map(lambda x: (x, x + lr * lmbd * T.grad(rlobjective, x)), rlparams)

    # how much the mean hidden state changes between consecutive glimpses
    deltas = T.stack(*[((au.output[:, i, :].mean(0) -
                         au.output[:, i + 1, :].mean(0)) ** 2).sum()
                       for i in xrange(glimpse_times - 1)])

    print 'compile step()'
    self.step = theano.function(
        [x, y, lr, reward_bias],
        [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)],
        updates=gdupdates + rlupdates)
    print 'compile predict()'
    self.predict = theano.function([x], prediction)
    print 'compile locate()'
    self.locate = theano.function([x], [au.location_mean, location])
    print 'compile debug()'
    self.debug = theano.function([x, y, lr, reward_bias],
                                 [deltas, au.location_p],
                                 on_unused_input='warn')
    self.glimpse_times = glimpse_times
def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None): print "USING OPTION CRITIC" self.args = args self.id_num = id_num self.num_actions = num_actions self.num_moves = num_moves self.reset_storing() self.rng = np.random.RandomState(100 + id_num) # input is 8x8 model_network = [{ "model_type": "conv", "filter_size": [4, 4], "pool": [1, 1], "stride": [2, 2], "out_size": 32, "activation": "relu" }, { "model_type": "conv", "filter_size": [3, 3], "pool": [1, 1], "stride": [2, 2], "out_size": 64, "activation": "relu" }, { "model_type": "mlp", "out_size": 48, "activation": "relu" }, { "model_type": "mlp", "out_size": 32, "activation": "relu" }] out = [None, model_network[-1]["out_size"]] self.conv = Model(model_network, input_size=[ None, args.concat_frames * (1 if args.grayscale else 3), 8, 8 ]) self.termination_model = Model([{ "model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W": 0 }], input_size=out) self.Q_val_model = Model([{ "model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W": 0 }], input_size=out) self.options_model = MLP3D(input_size=out[1], num_options=args.num_options, out_size=num_actions, activation="softmax") self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params self.set_rms_shared_weights(shared_arr) x = T.ftensor4() y = T.fvector() a = T.ivector() o = T.ivector() delib = T.fscalar() s = self.conv.apply(x / np.float32(255)) intra_option_policy = self.options_model.apply(s, o) q_vals = self.Q_val_model.apply(s) disc_q = theano.gradient.disconnected_grad(q_vals) current_option_q = q_vals[T.arange(o.shape[0]), o] disc_opt_q = disc_q[T.arange(o.shape[0]), o] terms = self.termination_model.apply(s) o_term = terms[T.arange(o.shape[0]), o] V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + ( self.args.option_epsilon * T.mean(q_vals, axis=1)) disc_V = theano.gradient.disconnected_grad(V) aggr = T.mean # T.sum log_eps = 0.0001 critic_cost = aggr(args.critic_coef * 0.5 * T.sqr(y - current_option_q)) termination_grad = aggr(o_term * ((disc_opt_q - disc_V) + delib)) entropy = -aggr( T.sum(intra_option_policy * T.log(intra_option_policy + log_eps), axis=1)) * args.entropy_reg pg = aggr( (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) * (y - disc_opt_q)) cost = pg + entropy - critic_cost - termination_grad grads = T.grad(cost * args.update_freq, self.params) # grads = T.grad(cost, self.params) updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip, clip_type=args.clip_type) self.share_rms(shared_arr) self.get_state = theano.function([x], s, on_unused_input='warn') self.get_policy = theano.function([s, o], intra_option_policy) self.get_termination = theano.function([x], terms) self.get_q = theano.function([x], q_vals) self.get_q_from_s = theano.function([s], q_vals) self.get_V = theano.function([x], V) self.rms_grads = theano.function([x, a, y, o, delib], grad_rms, updates=updates, on_unused_input='warn') print "ALL COMPILED" if not self.args.testing: self.init_tracker() self.initialized = False
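# --- Added illustration (not from the original code): a hedged sketch of
# --- epsilon-greedy option selection on top of the compiled helpers above.
# --- `agent`, `frame`, and `epsilon` are hypothetical names.
import numpy

q = agent.get_q(frame)[0]
if agent.rng.rand() < epsilon:
    option = agent.rng.randint(q.shape[0])  # explore: random option
else:
    option = int(numpy.argmax(q))           # exploit: greedy option
s = agent.get_state(frame)
probs = agent.get_policy(s, numpy.array([option], dtype='int32'))[0]
action = agent.rng.choice(agent.num_actions, p=probs)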
def fit(self, learning_rate=1e-6, momentum=1e-8, batch=200, activation=T.tanh, depth=7):
    self.f = activation
    # theano input-output variables
    thX = T.fvector('X')
    thY = T.ivector('Y')  # integer targets, used below to index the output distribution
    thK = T.iscalar('depth')

    # recurrent evaluation step: returns the next pair of hidden/output values
    def recurrence(x_t, h_t1):
        # update recurrent hidden values: h_t = f(Wx*x + Wh*h_t1 + b)
        h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
        # current output; in this model it is the next-time-step distribution:
        # y_t = f(Wo*h_t + b)
        y_t = self.f(h_t.dot(self.Wo) + self.bo)
        return h_t, y_t

    # theano scan over the input sequence
    [h, y], _ = th.scan(
        fn=recurrence,
        outputs_info=[self.h0, None],
        sequences=thX,
        n_steps=thK,
    )

    # the prediction should be a normalized distribution; softmax is used here
    prediction = T.nnet.softmax(y)

    # learning model: the cost is the usual log-loss function
    cost = -T.mean(T.log(prediction[T.arange(thY.shape[0]), thY]))
    # use theano's grad function
    grads = T.grad(cost, self.params)
    # parameter velocities for momentum, initialized to zero
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]
    # gradient descent with momentum:
    #   w  <- w + momentum * dw - learning_rate * grad_w(E)
    #   dw <-     momentum * dw - learning_rate * grad_w(E)
    updates = [(p, p + momentum * dp - learning_rate * g)
               for p, dp, g in zip(self.params, dparams, grads)] + \
              [(dp, momentum * dp - learning_rate * g)
               for dp, g in zip(dparams, grads)]

    # compile the complete training model
    self.predict_op = th.function(inputs=[thX, thK], outputs=prediction)
    self.train_op = th.function(inputs=[thX, thK, thY],
                                outputs=[cost, prediction, y],
                                updates=updates)
def build_train_func(self, solver_mode="sgd", cost_factors=[], use_acc_mode=False, skip_build=False): #arguments to function logging.info( "Building training functions - solver: %s, use_acc_mode: %s" % (solver_mode, use_acc_mode)) iteration = tensor.fscalar() learn_rate = tensor.fscalar() momentum = tensor.fvector() decay = tensor.fscalar() #find costs self.yt = [] self.cost_list = [] self.cost_layers = [] self.cost_layer_names = [] for layer in self.layers: yt_index = tensor.lvector("target index %i" % len(self.cost_layers)) yt_value = tensor.fvector("target value %i" % len(self.cost_layers)) cost = layer.cost(yt_index, yt_value) if not cost is None: self.yt += [yt_index, yt_value] self.cost_list.append(cost) self.cost_layers.append(layer) self.cost_layer_names.append(layer.type_name) self.cost_factors = [1.0] * len(self.cost_list) if len( cost_factors) == 0 else cost_factors assert len(self.cost_factors) == len( self.cost_list ), "Different number of cost factors (%i) and cost layers (%i)" % (len( self.cost_factors), len(self.cost_layers)) logging.info("Found %i costs in model:" % len(self.cost_layers), list(zip(self.cost_layer_names, self.cost_factors))) self.train_cost = tensor.as_tensor_variable(0) for i, cost in enumerate(self.cost_list): self.train_cost += self.cost_factors[i] * cost if self.gradient_clip > 0.0: logging.info("Clipping gradient to [%f,%f]" % (-self.gradient_clip, self.gradient_clip)) self.train_cost = theano.gradient.grad_clip( self.train_cost, -self.gradient_clip, self.gradient_clip) #find split points split_points = [0] self.use_split_mode = False for index, layer in enumerate(self.layers): if layer.has_split: self.use_split_mode = True split_points.append(index) split_points.append(len(self.layers)) if self.use_split_mode: logging.verbose("Using split mode with split points:", split_points) self.func["train_fwd"] = [] self.func["train_bwd"] = [] self.updates = [] for sp in range(len(split_points) - 1): logging.info("Building training functions for layers %i-%i" % (split_points[sp], split_points[sp + 1])) split_start = self.layers[split_points[sp]] if sp > 0 else None split_end = self.layers[split_points[sp + 1]] if ( sp + 2) < len(split_points) else None split_cost = self.train_cost if split_end is None else None split_layers = [] for i, layer in enumerate(self.layers): if (i > split_points[sp]) and (i < split_points[sp + 1]): split_layers.append(layer) #determine known_grads provided by previous backward passes from collections import OrderedDict split_known_grads = OrderedDict() for i in range(sp + 1, len(split_points) - 1): split_known_grads.update( self.layers[split_points[i]].split_known_grads()) if len(split_known_grads) == 0: split_known_grads = None # print(split_known_grads) # print(split_known_grads) # print(sp+1, len(split_points)-1) # def get_sgd_updates(p, g): m = theano.shared(numpy.zeros(p.shape.eval(), dtype=theano.config.floatX), broadcastable=p.broadcastable, borrow=True) rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0) m_update = rho * m + (1.0 - rho) * g p_update = p - learn_rate * m_update return [(p, p_update), (m, m_update)] def get_torch_updates(p, g): m = theano.shared(numpy.zeros(p.shape.eval(), dtype=theano.config.floatX), broadcastable=p.broadcastable, borrow=True) rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0) m_update = rho * m + g p_update = p - learn_rate * (g + momentum[0] * m_update) return [(p, p_update), (m, m_update)] def get_adam_updates(p, g): eps = 1e-8 m = 
theano.shared(numpy.zeros(p.shape.eval(), dtype=theano.config.floatX), broadcastable=p.broadcastable, borrow=True) v = theano.shared(numpy.zeros(p.shape.eval(), dtype=theano.config.floatX), broadcastable=p.broadcastable, borrow=True) m_update = momentum[0] * m + (1.0 - momentum[0]) * g v_update = momentum[1] * v + (1.0 - momentum[1]) * (g * g) m_hat = m_update / (1.0 - tensor.pow(momentum[0], iteration + 1)) v_hat = v_update / (1.0 - tensor.pow(momentum[1], iteration + 1)) p_update = p - learn_rate * m_hat / (tensor.sqrt(v_hat) + eps) return [(p, p_update), (m, m_update), (v, v_update)] #append parameter updates params = [] params_decay = [] for layer in split_layers: params += layer.weights() params_decay += [True] * len(layer.weights()) params += layer.biases() params_decay += [False] * len(layer.biases()) #build updates print("known grads:", split_known_grads) grads = tensor.grad(split_cost, params, known_grads=split_known_grads) solver_updates = [] for p, g, p_decay in zip(params, grads, params_decay): #add L2 weight decay if needed if p_decay or self.bias_decay: g += decay * p if solver_mode == "adam": solver_updates += get_adam_updates(p, g) elif solver_mode == "torch" or solver_mode == "nesterov": solver_updates += get_torch_updates(p, g) else: solver_updates += get_sgd_updates(p, g) #append per layer updates local_updates = solver_updates + sum( [layer.updates(self.train_cost) for layer in split_layers], []) #all updates self.updates += local_updates #skipping actual theano function building (if you just want updates, etc) if skip_build: continue global debug_train if debug_train: logging.warning("WARNING: Debug mode is active!") from theano.compile.nanguardmode import NanGuardMode debug_mode = theano.compile.MonitorMode( post_func=debug_detect_errors) else: debug_mode = None if self.use_split_mode: if not split_end is None: updates = sum( [layer.split_forward() for layer in split_layers], []) updates += split_end.split_forward() print("fwd updates:", updates) f = theano.function([self.input], [], updates=updates, givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))], on_unused_input='ignore', mode=debug_mode) self.func["train_fwd"].append(f) outputs = ([self.train_cost] + self.cost_list) if split_end is None else [] updates = sum([ layer.split_backward(split_cost, split_known_grads) for layer in split_layers ], []) if not split_start is None: updates += split_start.split_backward( split_cost, split_known_grads) print("bwd updates:", updates) updates += local_updates f = theano.function([ denet.layer.get_epoch(), iteration, learn_rate, momentum, decay, self.input ] + self.yt, outputs, updates=updates, givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))], on_unused_input='ignore', mode=debug_mode) self.func["train_bwd"].insert(0, f) elif use_acc_mode: acc_counter = theano.shared( numpy.array(0, dtype=theano.config.floatX)) begin_updates = [(acc_counter, tensor.zeros_like(acc_counter))] step_updates = [(acc_counter, acc_counter + 1)] end_updates = [] self.acc_params = [] for p_dest, p_src in self.updates: p_acc = theano.shared(numpy.zeros( p_dest.shape.eval(), dtype=theano.config.floatX), broadcastable=p_dest.broadcastable, borrow=True) begin_updates.append((p_acc, tensor.zeros_like(p_acc))) step_updates.append((p_acc, p_acc + p_src)) end_updates.append((p_dest, p_acc / acc_counter)) self.acc_params.append(p_acc) logging.info( "Constructing parameter accumulate update functions (solver=%s)" % solver_mode) self.func["train_begin"] = theano.function( [], [], 
updates=begin_updates)
            self.func["train_step"] = theano.function(
                [denet.layer.get_epoch(), iteration, learn_rate, momentum, decay, self.input] + self.yt,
                [self.train_cost] + self.cost_list,
                updates=step_updates,
                givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                on_unused_input='ignore',
                allow_input_downcast=True,
                mode=debug_mode)
            self.func["train_end"] = theano.function([], [], updates=end_updates)
        else:
            logging.info("Constructing parameter update function (solver=%s)" % solver_mode)

            #wrap the inputs with borrow=True so Theano may reuse the caller's buffers
            f_input = theano.In(self.input, borrow=True)
            f_yt = [theano.In(yt, borrow=True) for yt in self.yt]
            self.func["train_step"] = theano.function(
                [denet.layer.get_epoch(), iteration, learn_rate, momentum, decay, f_input] + f_yt,
                [self.train_cost] + self.cost_list,
                updates=self.updates,
                givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                on_unused_input='ignore',
                allow_input_downcast=True,
                mode=debug_mode)

        logging.verbose("Exporting graph...")
        with open("graph.txt", "w") as f:
            theano.printing.debugprint(self.func["train_step"], file=f, print_type=True)
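The three solver closures in build_train_func differ only in their update rules. As a reference, here is a minimal self-contained sketch of the same Adam rule applied to a toy quadratic cost; the parameter p, the cost, and all names below are illustrative, not part of the model above.

import numpy
import theano
import theano.tensor as tensor

# toy parameter and Adam accumulators (illustrative only)
p = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name="p")
m = theano.shared(numpy.zeros(3, dtype=theano.config.floatX))
v = theano.shared(numpy.zeros(3, dtype=theano.config.floatX))
iteration = tensor.fscalar()
learn_rate = tensor.fscalar()
momentum = tensor.fvector()  # momentum[0] = beta1, momentum[1] = beta2, as above

cost = ((p - 2.0) ** 2).sum()
g = tensor.grad(cost, p)

eps = 1e-8
m_update = momentum[0] * m + (1.0 - momentum[0]) * g
v_update = momentum[1] * v + (1.0 - momentum[1]) * (g * g)
m_hat = m_update / (1.0 - tensor.pow(momentum[0], iteration + 1))  # bias correction
v_hat = v_update / (1.0 - tensor.pow(momentum[1], iteration + 1))
step = theano.function(
    [iteration, learn_rate, momentum], cost,
    updates=[(p, p - learn_rate * m_hat / (tensor.sqrt(v_hat) + eps)),
             (m, m_update), (v, v_update)],
    allow_input_downcast=True)

for it in range(200):
    step(it, 0.1, [0.9, 0.999])  # p converges toward 2.0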
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    Basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the kernel loops when there are too many threads for a
    single launch.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    x = T.fmatrix('x')
    y = T.lvector('y')
    b = T.fvector('b')
    #W = T.fmatrix('W')

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check the corner case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')

    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
    #theano.printing.debugprint(classify)
    #theano.printing.debugprint(classify_gpu)

    assert any([isinstance(node.op,
                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
                for node in classify.maker.fgraph.toposort()])
    assert any([isinstance(node.op,
                           cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
                for node in classify_gpu.maker.fgraph.toposort()])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    # out and gout are lists of arrays, so compare matching elements explicitly
    assert numpy.allclose(out[2], gout[2], atol=3e-6), \
        numpy.absolute(gout[2] - out[2]).max()
    assert numpy.allclose(out[1], gout[1]), \
        [(id, out[1][id], gout[1][id], val)
         for id, val in enumerate(out[1] - gout[1]) if val != 0]
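For reference, the loss both compiled functions compute can be restated in plain numpy: softmax over (dot_result + b), then the mean negative log-probability of the targets. A hedged sketch (np_nll is a hypothetical helper, not part of the test):

import numpy

def np_nll(dot_result, b, y):
    # softmax over rows of (dot_result + b), shifted for numerical stability
    z = dot_result + b
    z = z - z.max(axis=1, keepdims=True)
    p = numpy.exp(z)
    p /= p.sum(axis=1, keepdims=True)
    # mean negative log-probability of the target class of each row
    return -numpy.mean(numpy.log(p[numpy.arange(len(y)), y]))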
margin = T.scalar('margin')
    loss = mean_loss_kl_div(predictions, targets, margin)
    loss_fun = theano.function([predictions, targets, margin], loss)
    mean_err = loss_fun(test_pred, test_targ, test_margin)

    foreach_prep = foreach(predictions, targets, margin)
    foreach_fun = theano.function([predictions, targets, margin], foreach_prep)
    err_mat = foreach_fun(test_pred, test_targ, test_margin)
    err = err_mat.sum() / ((len(err_mat) - 1) * len(err_mat))

    def loss(predictions, targets, margin, f):
        assert len(predictions) == len(targets)
        L_sum = 0
        for i in range(len(predictions)):
            for j in range(len(predictions)):
                L_sum += f(predictions[i], targets[i],
                           predictions[j], targets[j], margin)
        return L_sum / (2 * len(predictions))

    xp = T.scalar('xp')
    xq = T.scalar('xq')
    p = T.fvector('P')
    q = T.fvector('Q')
    result = loss_with_kl_div(p, xp, q, xq, margin)
    f = theano.function([p, xp, q, xq, margin], result)
    mean_np = loss(test_pred, test_targ, test_margin, f)

    # the three paths compute the same quantity through different graphs,
    # so compare them approximately rather than with exact float equality
    assert abs(mean_err - err) < 1e-6 and abs(err - mean_np) < 1e-6
    print('Ran without errors!')
def soft_cascade_LR_1LNN(trX1, trY1, teX1, teY1, trX2, teX2, lambda_vector, K1): (N, D1) = trX2.shape D = trX1.shape[1] C = 2 t1 = ComputeComplexity([D1, C]) t2 = ComputeComplexity([D, K1, C]) n_it = 10000 time1 = np.zeros((len(lambda_vector), 1)) accuracy1 = np.zeros((len(lambda_vector), 1)) F1 = np.zeros((len(lambda_vector), 1)) nnz_first = np.zeros((len(lambda_vector), 1)) for i, plambda in enumerate(lambda_vector): X = T.fmatrix() F = T.fmatrix() Y = T.fvector() w_l = CF.init_weights((D1, )) b_l = theano.shared(CF.floatX(np.random.randn(1) * 0.01), broadcastable=(True, )) # w_l.set_value(np.zeros((D1,))) # b_l.set_value(np.zeros((1,))) w_h1 = CF.init_weights((D, K1)) b1 = CF.init_weights((K1, )) w_o = CF.init_weights((K1, )) bo = theano.shared(CF.floatX(np.random.randn(1) * 0.01), broadcastable=(True, )) pygx1 = CF.model00(F, w_l, b_l) pygx2 = CF.model3(X, w_h1, w_o, b1, bo, 0, 1) pygx_final = pygx1 * pygx2 yhat1 = (pygx1 > 0.5) yhat = (pygx2 > 0.5) reg = T.mean(t1 + t2 * pygx1) cost = T.mean(T.nnet.binary_crossentropy(pygx_final, Y)) + plambda * reg params = [w_l, b_l, w_h1, w_o, b1, bo] updates = lasagne.updates.rmsprop(cost, params, learning_rate=0.001 * 5, rho=0.9, epsilon=1e-06) # updates = lasagne.updates.adagrad(cost, params, learning_rate=1, epsilon=1e-06) train = theano.function(inputs=[X, F, Y], outputs=cost, updates=updates, allow_input_downcast=True) reg_value = theano.function(inputs=[F], outputs=reg, allow_input_downcast=True) predict_first = theano.function(inputs=[F], outputs=yhat1, allow_input_downcast=True) predict_second = theano.function(inputs=[X], outputs=yhat, allow_input_downcast=True) max_iter = 300 for j in range(max_iter): c = train(trX1, trX2, trY1) r = reg_value(trX2) print(c - plambda * r, plambda * r) start1 = time.clock() for t in range(n_it): teQ1 = predict_first(teX2) end1 = time.clock() time1[i] = end1 - start1 inds_test = np.where(teQ1 == 1)[0] nnz_first[i] = inds_test.shape[0] # check that we get 100 percent recall from the first stage inds_true = np.where(teY1 == 1)[0] int_result = np.intersect1d(inds_test, inds_true) print("first stage nzs:%d,true nzs:%d,intersection:%d" % (inds_test.shape[0], inds_true.shape[0], int_result.shape[0])) r1 = int_result.shape[0] / inds_true.shape[0] p1 = int_result.shape[0] / inds_test.shape[0] a1 = np.mean(teY1 == teQ1) print("first stage: recall = %f, precision = %f, accuracy = %f" % (r1, p1, a1)) teX11 = teX1[inds_test, :] start1 = time.clock() for t in range(n_it): teQ2 = predict_second(teX11) end1 = time.clock() time1[i] += end1 - start1 teY2 = np.zeros(teY1.shape, dtype=int) teY2.fill(0) teY2[inds_test] = teQ2 inds_second = np.where(teY2 == 1)[0] int_result = np.intersect1d(inds_second, inds_true) print("second stage nzs:%d,true nzs:%d,intersection:%d" % (inds_second.shape[0], inds_true.shape[0], int_result.shape[0])) r2 = int_result.shape[0] / inds_true.shape[0] p2 = int_result.shape[0] / inds_second.shape[0] a2 = np.mean(teY1 == teY2) print("second stage: recall = %f, precision = %f, accuracy = %f" % (r2, p2, a2)) F1[i] = 2 * r2 * p2 / (r2 + p2) accuracy1[i] = a2 return time1, accuracy1, F1, nnz_first
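The regularizer above, reg = T.mean(t1 + t2 * pygx1), is the expected test-time cost of the two-stage cascade: every example pays the first-stage cost t1, and pays the second-stage cost t2 only with probability pygx1 of being passed on. A hedged numpy restatement (expected_cascade_cost is an illustrative helper):

import numpy as np

def expected_cascade_cost(pygx1, t1, t2):
    # t1: per-example cost of the linear first stage (always paid)
    # t2: per-example cost of the second stage, paid with probability pygx1
    return np.mean(t1 + t2 * pygx1)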
def cascade_three_stage(trX1, trY1, teX1, teY1, trX2, teX2, trX3, teX3, w_h1, w_h2, w_o, b1, b2, bo, v_h1, v_o, c1, co, plambda, a): (N,D) = trX3.shape lambda_vector = plambda n_it = 10000 time1 = np.zeros((len(lambda_vector),1)) accuracy1 = np.zeros((len(lambda_vector),1)) F1 = np.zeros((len(lambda_vector),1)) nnz_first = np.zeros((len(lambda_vector),1)) nnz_second = np.zeros((len(lambda_vector),1)) for i,plambda in enumerate(lambda_vector): X = T.fmatrix() F = T.fmatrix() E = T.fmatrix() Y = T.fvector() w_l = CF.init_weights((D,)) b_l = theano.shared(CF.floatX(np.random.randn(1) * 0.01), broadcastable=(True,)) w_l.set_value(np.zeros((D,))) b_l.set_value(np.zeros((1,))) pygx1 = CF.model00(E, w_l, b_l) pygx2 = CF.model3(F, v_h1, v_o, c1, co, 0, 1) pygx = CF.model(X, w_h1, w_h2, w_o, b1, b2, bo, 0, 1) yhat1 = (pygx1 > 0.5) yhat2 = (pygx2 > 0.5) yhat = (pygx > 0.5) f = lambda x, a: 1/(1+T.exp(-a*(x-0.5))) pygx_final = (1-f(pygx1,a))*pygx1 + (1-f(pygx2,a))*f(pygx1,a)*pygx2 + f(pygx1, a)*f(pygx2, a)*pygx reg = T.mean(f(pygx1,a)) cost = T.mean(T.nnet.binary_crossentropy(pygx_final, Y)) + plambda*reg params = [w_l, b_l] updates = lasagne.updates.rmsprop(cost, params, learning_rate=0.5, rho=0.9, epsilon=1e-06) # updates = lasagne.updates.adagrad(cost, params, learning_rate=1, epsilon=1e-06) train = theano.function(inputs=[X, F, E, Y], outputs=cost, updates=updates, allow_input_downcast=True) reg_value = theano.function(inputs=[E], outputs=reg, allow_input_downcast=True) predict_first = theano.function(inputs=[E], outputs=yhat1, allow_input_downcast=True) predict_second = theano.function(inputs=[F], outputs=yhat2, allow_input_downcast=True) predict_third = theano.function(inputs=[X], outputs=yhat, allow_input_downcast=True) max_iter = 500 for j in range(max_iter): # c = train(trX1, trY1) c = train(trX1, trX2, trX3, trY1) # r = reg_value(trX1) r = reg_value(trX3) print(c-plambda*r,plambda*r) # cost = train(trX1, trY1) start1 = time.clock() for t in range(n_it): teQ1 = predict_first(teX3) end1 = time.clock() time1[i] = end1 - start1 inds_test = np.where(teQ1 == 1)[0] nnz_first[i] = inds_test.shape[0] # check that we get 100 percent recall from the first stage inds_true = np.where( teY1 == 1 )[0] int_result = np.intersect1d(inds_test,inds_true) print("first stage nzs:%d,true nzs:%d,intersection:%d" %(inds_test.shape[0],inds_true.shape[0],int_result.shape[0])) r1 = int_result.shape[0] / inds_true.shape[0] p1 = int_result.shape[0] / inds_test.shape[0] a1 = np.mean(teY1 == teQ1) print("first stage: recall = %f, precision = %f, accuracy = %f" %(r1,p1,a1)) teX22 = teX2[inds_test,:] start1 = time.clock() for t in range(n_it): teQ2 = predict_second(teX22) end1 = time.clock() time1[i] += end1 - start1 inds_test2 = np.where(teQ2 == 1)[0] nnz_second[i] = inds_test2.shape[0] teY2 = np.zeros(teY1.shape,dtype = int) teY2.fill(0) teY2[inds_test] = teQ2 inds_second = np.where( teY2 == 1 )[0] int_result = np.intersect1d(inds_second, inds_true) print("second stage nzs:%d,true nzs:%d,intersection:%d" %(inds_second.shape[0],inds_true.shape[0],int_result.shape[0])) r2 = int_result.shape[0] / inds_true.shape[0] p2 = int_result.shape[0] / inds_second.shape[0] a2 = np.mean(teY1 == teY2) print("second stage: recall = %f, precision = %f, accuracy = %f" %(r2,p2,a2)) # teX1 = teX1[inds_test2,:] teX11 = teX1[inds_test[inds_test2],:] start1 = time.clock() for t in range(n_it): teQ3 = predict_third(teX11) end1 = time.clock() time1[i] += end1 - start1 teY3 = np.zeros(teY1.shape,dtype = int) teY3.fill(0) teY3[inds_test[inds_test2]] 
= teQ3 accuracy1[i] = np.mean(teY1 == teY3) inds_third = np.where( teY3 == 1 )[0] int_result2 = np.intersect1d(inds_third,inds_true) print("third stage nzs:%d,true nzs:%d,intersection:%d" %(inds_third.shape[0],inds_true.shape[0],int_result2.shape[0])) r3 = int_result2.shape[0] / inds_true.shape[0] p3 = int_result2.shape[0] / inds_third.shape[0] print("third stage: recall = %f, precision = %f, accuracy = %f" %(r3, p3, accuracy1[i])) F1[i] = 2*r3*p3/(r3 + p3) return time1, accuracy1, F1, nnz_first, nnz_second
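The gate f(x, a) = 1 / (1 + exp(-a * (x - 0.5))) makes the three-stage cascade differentiable; as a grows, f approaches a hard threshold at 0.5 and the blend becomes the hard cascade (use stage 1 if pygx1 < 0.5, else stage 2 if pygx2 < 0.5, else stage 3). A standalone numpy sketch of the same blend (illustrative helper, not the author's code):

import numpy as np

f = lambda x, a: 1.0 / (1.0 + np.exp(-a * (x - 0.5)))

def blend(p1, p2, p3, a):
    # mirrors pygx_final above: each stage's prediction is weighted by the
    # probability that all earlier stages passed the example onward
    return ((1 - f(p1, a)) * p1
            + (1 - f(p2, a)) * f(p1, a) * p2
            + f(p1, a) * f(p2, a) * p3)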
def test_softmax_grad(self): def cmp(n, m, f, f_gpu): data = numpy.arange(n * m, dtype='float32').reshape(n, m) gdata = numpy.asarray(data)[:, :, None, None] out = f(data) gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0] utt.assert_allclose(out, gout) x = T.matrix('x', 'float32') x_gpu = T.tensor4('x_gpu', 'float32') f_z = T.nnet.softmax_op f_gpu = dnn.GpuDnnSoftmax('accurate', 'channel') # Verify the grad operation dims = (2, 3, 4, 5) gdata = numpy.arange(numpy.product(dims), dtype='float32').reshape(dims) T.verify_grad(f_gpu, [gdata], rng=numpy.random, mode=mode_with_gpu) # Verify that the CPU and GPU implementations return the same results # up to a tolerance. self._test_softmax(x, x_gpu, f_z, f_gpu, cmp) self._test_softmax(x, x, f_z, f_z, self._cmp) # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad # optimization is applied when cudnn is required y = T.fvector('y') f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y), mode=mode_with_gpu) sorted_f = f.maker.fgraph.toposort() val = numpy.random.rand(5).astype('float32') out_dnn = f(val) assert (len( [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 0) # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad # optimization is not applied when cudnn is excluded or not # available mode_wo_cudnn = mode_with_gpu.excluding("cudnn") y = T.fvector('y') f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y), mode=mode_wo_cudnn) sorted_f = f.maker.fgraph.toposort() out_cpu = f(val) utt.assert_allclose(out_dnn, out_cpu) assert (len( [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 0) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 1) # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not # crash with manual graph y = T.fvector('y') o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2) f = theano.function([y], o, mode=mode_with_gpu) sorted_f = f.maker.fgraph.toposort() assert (len( [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1) assert (len([ i for i in sorted_f if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad) ]) == 0)
import theano
import theano.tensor as T
import numpy as np

import odl
import odl.contrib.theano

# --- Wrap ODL operator as Theano operator --- #

# Define ODL operator
matrix = np.array([[1., 2.], [0., 0.], [0., 1.]])
odl_op = odl.MatrixOperator(matrix)

# Define evaluation point
x = [1., 2.]

# Create Theano placeholder
x_theano = T.fvector('x')

# Create Theano layer from ODL operator
odl_op_layer = odl.contrib.theano.TheanoOperator(odl_op)

# Build computation graph
y_theano = odl_op_layer(x_theano)
y_theano_func = theano.function([x_theano], y_theano)

# Evaluate using Theano and compare to odl_op(x)
print('Theano eval : ', y_theano_func(x))
print('ODL eval    : ', odl_op(x))

# --- Wrap ODL functional as Theano operator --- #

# Define ODL cost and composed functional
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=500, char_nkerns=100, batch_size=1, window_width=3, emb_size=500, char_emb_size=100, hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=6, neg_all=100, train_size=75893, test_size=19168, mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0' ): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/' triple_files = [ 'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt' ] rng = numpy.random.RandomState(23455) word2id, char2id = load_word2id_char2id(mark) # datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_test_or_valid(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size)#max_char_len, max_des_len, max_relation_len, max_Q_len datasets_test, length_per_example_test, word2id, char2id = load_test_or_valid( triple_files[1], char2id, word2id, max_char_len, max_des_len, max_relation_len, max_Q_len, test_size) vocab_size = len(word2id) char_size = len(char2id) print 'vocab_size:', vocab_size, 'char_size:', char_size # train_data=datasets # valid_data=datasets[1] test_data = datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # # train_pos_entity_char=train_data[0] # train_pos_entity_des=train_data[1] # train_relations=train_data[2] # train_entity_char_lengths=train_data[3] # train_entity_des_lengths=train_data[4] # train_relation_lengths=train_data[5] # train_mention_char_ids=train_data[6] # train_remainQ_word_ids=train_data[7] # train_mention_char_lens=train_data[8] # train_remainQ_word_len=train_data[9] # train_entity_scores=train_data[10] test_pos_entity_char = test_data[0] # test_pos_entity_des=test_data[1] test_relations = test_data[2] test_entity_char_lengths = test_data[3] # test_entity_des_lengths=test_data[4] test_relation_lengths = test_data[5] test_mention_char_ids = test_data[6] test_remainQ_word_ids = test_data[7] test_mention_char_lens = test_data[8] test_remainQ_word_len = test_data[9] test_entity_scores = test_data[10] # # test_pos_entity_char=test_data[0] #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51 # test_pos_entity_des=test_data[1] #matrix, each row for a examle: 20*2*51 # test_relations=test_data[2] #matrix, each row for a example: 5*51 # test_entity_char_lengths=test_data[3] #matrix, each row for a example: 3*2*51 (three valies for one entity) # test_entity_des_lengths=test_data[4] #matrix, each row for a example: 3*2*51 (three values for one entity) # test_relation_lengths=test_data[5] #matrix, each row for a example: 3*51 # test_mention_char_ids=test_data[6] #matrix, each row for a mention: 40 # test_remainQ_word_ids=test_data[7] #matrix, each row for a question: 30 # test_mention_char_lens=test_data[8] #matrix, each three values for a mention: 3 # test_remainQ_word_len=test_data[9] #matrix, each three values for a remain question: 3 # train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), 
len(train_entity_char_lengths), len(train_entity_des_lengths),\ # len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)] # if sum(train_sizes)/len(train_sizes)!=train_size: # print 'weird size:', train_sizes # exit(0) test_sizes=[len(test_pos_entity_char), len(test_relations), len(test_entity_char_lengths),\ len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)] if sum(test_sizes) / len(test_sizes) != test_size: print 'weird size:', test_sizes exit(0) # n_train_batches=train_size/batch_size # n_test_batches=test_size/batch_size # train_batch_start=list(numpy.arange(n_train_batches)*batch_size) # test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char) # indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des) # indices_train_relations=pythonList_into_theanoIntMatrix(train_relations) # indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths) # indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths) # indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths) # indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids) # indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids) # indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens) # indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len) # indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) # indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char) # indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des) # indices_test_relations=pythonList_into_theanoIntMatrix(test_relations) # indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths) # indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths) # indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths) # indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids) # indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids) # indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens) # indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len) # indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores) rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt') embeddings = theano.shared(value=rand_values, borrow=True) char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX) char_embeddings = theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data index = T.iscalar() chosed_indices = T.ivector() 
ent_char_ids_M = T.imatrix() ent_lens_M = T.imatrix() men_char_ids_M = T.imatrix() men_lens_M = T.imatrix() rel_word_ids_M = T.imatrix() rel_word_lens_M = T.imatrix() #desH_word_ids_M=T.imatrix() #desH_word_lens_M=T.imatrix() q_word_ids_M = T.imatrix() q_word_lens_M = T.imatrix() ent_scores = T.fvector() filter_size = (emb_size, window_width) char_filter_size = (char_emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1]) word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1]) char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape) q_rel_conv_W, q_rel_conv_b = create_conv_para( rng, filter_shape=word_filter_shape) #q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) params = [ char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b ] #, q_desH_conv_W, q_desH_conv_b] load_model_from_file(rootPath, params, mark) def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f, men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape( (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape( (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH #q_desH_conv = Conv_with_input_para(rng, input=q_word_input, # image_shape=(batch_size, 1, emb_size, max_Q_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) #desH_conv = Conv_with_input_para(rng, input=desH_word_input, # image_shape=(batch_size, 1, emb_size, max_des_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output, 
left_l=men_lens_f[0], right_l=men_lens_f[2]) #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool = Average_Pooling_for_SimpleQA( rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1] + filter_size[1] - 1, dim=max_Q_len + filter_size[1] - 1, topk=2) #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)*0.33333+\ cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)*0.55 # 0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling) # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan(SimpleQ_matches_Triple, sequences=[ ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M, men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M ]) simi_list += 0.2 * ent_scores posi_simi = simi_list[0] nega_simies = simi_list[1:] loss_simi_list = T.maximum( 0.0, margin - posi_simi.reshape((1, 1)) + nega_simies) loss_simi = T.sum(loss_simi_list) test_model = theano.function([ ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores ], [loss_simi, simi_list], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... testing' start_time = time.clock() mid_time = start_time epoch = 0 test_loss = [] succ = 0 for i in range(test_size): #prepare data test_ent_char_ids_M = numpy.asarray(test_pos_entity_char[i], dtype='int32').reshape( (length_per_example_test[i], max_char_len)) test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int32').reshape( (length_per_example_test[i], max_char_len)) test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape( (length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape( (length_per_example_test[i], 3)) #test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int32').reshape((length_per_example_test[i], max_des_len)) #test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32').reshape( (length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX) loss_simi_i, simi_list_i = test_model( test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if len(simi_list_i) == 1 or simi_list_i[0] >= max(simi_list_i[1:]): 
succ += 1 if i % 1000 == 0: print 'testing', i, '...acc:', (succ * 1.0 / (i + 1)) * (19168 * 1.0 / 21687) succ = succ * 100.0 / 21687 #now, check MAP and MRR print 'accu:', succ # store_model_to_file(rootPath, params, succ, mark) print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
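The test criterion above reduces to a standard margin ranking loss: candidate 0 is the gold triple, and it should outscore every negative candidate. A hedged numpy restatement of loss_simi_list and the success condition (helper names are illustrative):

import numpy as np

def ranking_loss(simi, margin):
    # hinge on every negative: max(0, margin - simi[0] + simi[1:]), summed
    return np.maximum(0.0, margin - simi[0] + simi[1:]).sum()

def is_hit(simi):
    # the example counts as correct when the gold triple ranks first
    # (or there are no negative candidates at all)
    return len(simi) == 1 or simi[0] >= simi[1:].max()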
def ready(self): args = self.args w_emb_layer = self.w_emb_layer c_emb_layer = self.c_emb_layer r_emb_layers = self.r_emb_layers r_matrix_layers = self.r_matrix_layers char_dim = self.char_dim = args.char_dim char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim word_dim = self.word_dim = args.word_dim word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX) ) word_ids = self.word_ids = T.ivector('word_ids') char_ids = self.char_ids = T.imatrix('char_ids') char_lens = self.char_lens = T.fvector('char_lens') char_masks = self.char_masks = T.imatrix('char_masks') up_ids = self.up_ids = T.imatrix('up_ids') up_rels = self.up_rels = T.imatrix('up_rels') up_id_masks = self.up_id_masks = T.imatrix('up_id_masks') down_ids = self.down_ids = T.imatrix('down_ids') down_rels = self.down_rels = T.imatrix('down_rels') down_id_masks = self.down_id_masks = T.imatrix('down_id_masks') tag_ids = self.tag_ids = T.ivector('tag_ids') layers = self.layers = [w_emb_layer, c_emb_layer] layers.extend(r_emb_layers) layers.extend(r_matrix_layers) inputs = self.inputs = [] inputs.append(self.word_ids) inputs.append(self.char_ids) inputs.append(self.char_lens) inputs.append(self.char_masks) inputs.append(self.up_ids) inputs.append(self.up_rels) inputs.append(self.up_id_masks) inputs.append(self.down_ids) inputs.append(self.down_rels) inputs.append(self.down_id_masks) inputs.append(self.tag_ids) wslices = w_emb_layer.forward(word_ids) cslices = c_emb_layer.forward(char_ids.ravel()) cslices = cslices.reshape((char_ids.shape[0], char_ids.shape[1], char_dim)) cslices = cslices.dimshuffle(1, 0, 2) bv_ur_slicess = [] bv_dr_slicess = [] b_ur_slicess = [] b_dr_slicess = [] bv_ur_matrixss = [] bv_dr_matrixss = [] b_ur_matrixss = [] b_dr_matrixss = [] for r_matrix_layer in r_matrix_layers: bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel()) bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel()) b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel()) b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel()) bv_ur_matrixss.append(bv_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) bv_dr_matrixss.append(bv_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) b_ur_matrixss.append(b_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) b_dr_matrixss.append(b_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) for r_emb_layer in r_emb_layers: bv_ur_slices = r_emb_layer.forward(up_rels.ravel()) bv_dr_slices = r_emb_layer.forward(down_rels.ravel()) b_ur_slices = r_emb_layer.forward2(up_rels.ravel()) b_dr_slices = r_emb_layer.forward2(down_rels.ravel()) bv_ur_slicess.append(bv_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim))) bv_dr_slicess.append(bv_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim))) b_ur_slicess.append(b_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim))) b_dr_slicess.append(b_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim))) char_masks = char_masks.dimshuffle(1, 0) prev_output = wslices prev_size = word_dim if char_dim: layers.append(LSTM( n_in = char_dim, n_out = char_lstm_dim, direction = 'bi' if args.char_bidirect else 'si' )) prev_output_2 = cslices prev_output_2 = apply_dropout(prev_output_2, dropout, v2 = True) prev_output_2 = layers[-1].forward_all(cslices, char_masks) prev_output_2 = T.sum(prev_output_2, axis = 0) prev_output_2 
= prev_output_2 / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x') prev_size += char_lstm_dim prev_output = T.concatenate([prev_output, prev_output_2], axis = 1) prev_output = apply_dropout(prev_output, dropout) if args.conv != 0: for i in range(args.clayer): layers.append(GKNNMultiHeadGate( n_in = prev_size, n_out = prev_size, n_head = args.head )) prev_output = layers[-1].forward_all(prev_output, up_ids, up_id_masks, bv_ur_slicess[0], down_ids, down_id_masks, bv_dr_slicess[0]) prev_output = apply_dropout(prev_output, dropout) #prev_size *= 2 #layers.append(LSTM( # n_in = prev_size, # n_out = word_lstm_dim, # direction = 'bi' if args.word_bidirect else 'si' #)) #prev_output = prev_output.dimshuffle(0, 'x', 1) #prev_output = layers[-1].forward_all(prev_output) #prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[-1])) #prev_size = word_lstm_dim layers.append(Layer( n_in = prev_size, n_out = args.classes, activation = linear, #ReLU, has_bias = False )) n_tags = args.classes s_len = char_ids.shape[0] tags_scores = layers[-1].forward(prev_output) transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) pre_ids = T.arange(s_len + 1) s_ids = T.arange(s_len + 1) + 1 real_path_score += transitions[ padded_tags_ids[pre_ids], padded_tags_ids[s_ids] ].sum() all_paths_scores = CRFForward(observations, transitions) self.nll_loss = nll_loss = - (real_path_score - all_paths_scores) preds = CRFForward(observations, transitions, viterbi = True, return_alpha = False, return_best_sequence=True) self.pred = preds[1:-1] self.l2_sqr = None params = self.params = [transitions] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) #for l, i in zip(layers[3:], range(len(layers[3:]))): for l, i in zip(layers[2+len(r_emb_layers)+len(r_matrix_layers):], range(len(layers[2+len(r_emb_layers)+len(r_matrix_layers):]))): say("layer {}: n_in={}\tn_out={}\n".format( i, l.n_in, l.n_out )) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in self.params) say("total # parameters: {}\n".format(nparams)) cost = self.nll_loss + self.l2_sqr lr_method_name = args.learning lr_method_parameters = {} lr_method_parameters['lr'] = args.learning_rate updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs = self.inputs, outputs = [cost, nll_loss], updates = updates, allow_input_downcast = True ) f_eval = theano.function( inputs = self.inputs[:-1], outputs = self.pred, allow_input_downcast = True ) return f_train, f_eval
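real_path_score above is the standard CRF path score: the emission scores of the gold tags plus the transition scores over the begin/end-padded tag sequence, where the two extra transition rows/columns are the synthetic begin (n_tags) and end (n_tags + 1) states. A hedged numpy sketch (path_score is an illustrative helper, not the model's code):

import numpy as np

def path_score(tags_scores, transitions, tag_ids, n_tags):
    # emission: score of the gold tag at every position
    emit = tags_scores[np.arange(len(tag_ids)), tag_ids].sum()
    # transitions over the sequence padded with begin/end states,
    # mirroring padded_tags_ids = [b_id] + tag_ids + [e_id] above
    padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
    trans = transitions[padded[:-1], padded[1:]].sum()
    return emit + trans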
beta = theano.shared(
    numpy.asarray(numpy.random.randn(784, 1), dtype=theano.config.floatX))
py_x = T.nnet.softmax(T.dot(X, beta))
y_pred = T.argmax(py_x, axis=1)  # predict from the class probabilities, not from beta
cost = T.mean(T.nnet.categorical_crossentropy(py_x, y))


# energy function for a normal distribution with normal momentum
def normal_en(pos, mom):
    total_en = T.dot(pos, pos) / 2 + T.dot(mom, mom) / 2
    f = theano.function([pos, mom], total_en)
    return f


beta_0 = T.fvector()
p_0 = T.fvector()
en = lambda beta_0, p_0: T.dot(beta_0, beta_0) * 0.5 + T.dot(p_0, p_0) * 0.5
#en_f = theano.function([], en)


def simulate_dynamics(initial_pos, initial_mom, stepsize, n_steps, energy_fn):
    def leapfrog(pos, mom, step):
        # move the position a full step using dE/dmom at the current momentum
        dE_dmom = T.grad(energy_fn(pos, mom), mom)
        new_pos = pos + step * dE_dmom
        # then move the momentum using dE/dpos at the new position
        dE_dpos = T.grad(energy_fn(new_pos, mom), new_pos)
        new_mom = mom - step * dE_dpos
        # from vel(t+stepsize//2) compute pos(t+stepsize)
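The leapfrog above is cut off before the final position update. For reference, a self-contained numpy sketch of one full leapfrog step for this separable Gaussian energy E(pos, mom) = pos.pos/2 + mom.mom/2, for which dE/dpos = pos and dE/dmom = mom; this is the textbook half-step/full-step/half-step form, not the author's exact splitting:

import numpy as np

def leapfrog_step(pos, mom, stepsize):
    mom = mom - 0.5 * stepsize * pos   # half step in momentum (-dE/dpos)
    pos = pos + stepsize * mom         # full step in position (dE/dmom)
    mom = mom - 0.5 * stepsize * pos   # half step in momentum
    return pos, mom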
def __init__(self, config): ModelBase.__init__(self) self.config = config self.verbose = self.config['verbose'] self.name = 'alexnet' batch_size = config['batch_size'] flag_datalayer = config['use_data_layer'] lib_conv = config['lib_conv'] n_softmax_out = config['n_softmax_out'] # ##################### BUILD NETWORK ########################## # allocate symbolic variables for the data # 'rand' is a random array used for random cropping/mirroring of data x = T.ftensor4('x') y = T.lvector('y') rand = T.fvector('rand') lr = T.scalar('lr') if self.verbose: print 'AlexNet 2/16' self.layers = [] params = [] weight_types = [] if flag_datalayer: data_layer = DataLayer(input=x, image_shape=(3, 256, 256, batch_size), cropsize=227, rand=rand, mirror=True, flag_rand=config['rand_crop']) layer1_input = data_layer.output else: layer1_input = x convpool_layer1 = ConvPoolLayer(input=layer1_input, image_shape=(3, 227, 227, batch_size), filter_shape=(3, 11, 11, 96), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=True, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output, image_shape=(96, 27, 27, batch_size), filter_shape=(96, 5, 5, 256), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=True, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output, image_shape=(256, 13, 13, batch_size), filter_shape=(256, 3, 3, 384), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output, image_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 384), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output, image_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 256), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.0, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type fc_layer6_input = T.flatten( convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096, verbose=self.verbose) self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096, verbose=self.verbose) fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096, verbose=self.verbose) self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096, verbose=self.verbose) softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, n_out=n_softmax_out, verbose=self.verbose) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # 
#################### NETWORK BUILT ####################### self.p_y_given_x = softmax_layer8.p_y_given_x self.y_pred = softmax_layer8.y_pred self.output = self.p_y_given_x self.cost = softmax_layer8.negative_log_likelihood(y) self.error = softmax_layer8.errors(y) if n_softmax_out < 5: self.error_top_5 = softmax_layer8.errors_top_x(y, n_softmax_out) else: self.error_top_5 = softmax_layer8.errors_top_x(y, 5) self.params = params # inputs self.x = x self.y = y self.rand = rand self.lr = lr self.shared_x = theano.shared( np.zeros( (3, config['input_width'], config['input_height'], config['file_batch_size']), # for loading large batch dtype=theano.config.floatX), borrow=True) self.shared_y = theano.shared(np.zeros((config['file_batch_size'], ), dtype=int), borrow=True) self.shared_lr = theano.shared(np.float32(config['learning_rate'])) # training related self.base_lr = np.float32(config['learning_rate']) self.step_idx = 0 self.mu = config['momentum'] # def: 0.9 # momentum self.eta = config['weight_decay'] #0.0002 # weight decay self.weight_types = weight_types self.batch_size = batch_size self.grads = T.grad(self.cost, self.params) subb_ind = T.iscalar('subb') # sub batch index #print self.shared_x[:,:,:,subb_ind*self.batch_size:(subb_ind+1)*self.batch_size].shape.eval() self.subb_ind = subb_ind self.shared_x_slice = self.shared_x[:, :, :, subb_ind * self.batch_size:(subb_ind + 1) * self.batch_size] self.shared_y_slice = self.shared_y[subb_ind * self.batch_size:(subb_ind + 1) * self.batch_size]
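The fields set at the end (self.mu for momentum, self.eta for weight decay, self.grads) suggest the usual momentum SGD update with L2 decay, applied elsewhere in the training loop. A standalone toy sketch under that assumption (the cost, constants, and names below are illustrative, not taken from this class):

import numpy as np
import theano
import theano.tensor as T

# toy parameter, velocity buffer, and quadratic cost (illustrative only)
w = theano.shared(np.ones(3, dtype=theano.config.floatX))
vel = theano.shared(np.zeros(3, dtype=theano.config.floatX))
lr = T.fscalar('lr')

cost = ((w - 1.0) ** 2).sum()
g = T.grad(cost, w) + 0.0002 * w   # eta: L2 weight decay added to the gradient
vel_new = 0.9 * vel - lr * g       # mu: momentum on the parameter velocity
step = theano.function([lr], cost,
                       updates=[(vel, vel_new), (w, w + vel_new)])

for _ in range(10):
    step(np.float32(0.01))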