def fit(self, train_triples, valid_triples, hparams, n=0, m=0, l=0, scorer=None):
    """Train the model with a downhill optimizer and metric-based early stopping.

    Parameters
    ----------
    train_triples, valid_triples
        Training / validation data, forwarded to the model's own helpers and
        to ``scorer``. When ``valid_triples`` is None no validation scoring
        is performed.
    hparams
        Hyper-parameter object. Fields read here: ``embedding_size``,
        ``learning_rate_policy``, ``learning_rate``, ``max_iter``,
        ``valid_scores_every`` and ``lmbda``.
    n, m, l
        Explicit input dimensions. When ``n`` is 0 the dimensions are set
        by ``self.set_dims`` instead.
    scorer
        Optional object exposing ``compute_scores`` and the boolean
        ``compute_ranking_scores``; when None, validation is skipped.
    """
    # Input dimensions: imposed by the caller, or derived from the data when
    # none are given (useful for entities/relations not seen in training).
    if n != 0:
        self.n, self.m, self.l, self.k = n, m, l, hparams.embedding_size
    else:
        self.set_dims(train_triples, hparams)

    # Define the downhill loss corresponding to the input dimensions.
    self.setup_params_for_train(train_triples, valid_triples, hparams)

    # Numeric loss arguments and their symbolic counterparts.
    train_vals, train_symbs, _ = self.get_loss_args_and_symb_vars(
        train_triples, valid_triples, hparams)

    optimizer = downhill.build(
        hparams.learning_rate_policy,
        loss=self.loss_to_opt,
        inputs=train_symbs,
        monitor_gradients=True,
    )
    train_data = downhill.Dataset(train_vals, name='train')

    updates = optimizer.iterate(
        train_data,
        None,
        max_updates=hparams.max_iter,
        # Validation is handled below with validation metrics instead of the
        # loss, so downhill's own validation / patience is effectively off.
        validate_every=9999999,
        patience=9999999,
        max_gradient_norm=1,  # prevent gradient explosion
        learning_rate=hparams.learning_rate,
    )

    # Main SGD loop
    step = 0
    best_valid_mrr = -1
    best_valid_ap = -1
    for _ in updates:
        validate_now = (
            step % hparams.valid_scores_every == 0
            and scorer is not None
            and valid_triples is not None
        )
        if validate_now:
            logger.info("Validation metrics:")
            res = scorer.compute_scores(self, self.name, hparams, valid_triples)
            cv_res = CV_Results()
            cv_res.add_res(res, self.name, hparams.embedding_size,
                           hparams.lmbda, self.nb_params)

            if scorer.compute_ranking_scores:
                # Early stopping on filtered MRR (index 2 of the metrics row).
                filtered_mrr = cv_res.print_MRR_and_hits()[self.name][2]
                if best_valid_mrr >= filtered_mrr:
                    logger.info("Validation filtered MRR decreased, stopping here.")
                    break
                best_valid_mrr = filtered_mrr
            else:
                # Early stopping on Average Precision.
                logger.info("Validation AP: " + str(res.ap))
                if best_valid_ap >= res.ap:
                    logger.info("Validation AP decreased, stopping here.")
                    break
                best_valid_ap = res.ap

        step += 1
        # Leave before downhill finishes by itself: avoids downhill resetting
        # the parameters when max_iter is reached.
        if step >= hparams.max_iter:
            break
def fit(self, train, entities, relations, param):
    """Fit the model on ``train`` with a downhill optimizer.

    ``entities`` is stored as both ``self.n`` and ``self.l``, ``relations``
    as ``self.m`` and ``param.k`` as ``self.k``. Fields read from ``param``:
    ``k``, ``sgd`` (first argument of ``downhill.build``), ``epoch`` (update
    budget) and ``lr`` (learning rate).
    """
    self.n, self.m, self.l, self.k = entities, relations, entities, param.k
    self.setup(param)

    batches, symbolic_inputs = self.minibatch(train, param)
    optimizer = downhill.build(
        param.sgd,
        loss=self.loss_opt,
        inputs=symbolic_inputs,
        monitor_gradients=True,
    )
    dataset = downhill.Dataset(batches, name='train')

    updates = optimizer.iterate(
        dataset,
        None,
        max_updates=param.epoch,
        validate_every=10,
        patience=5,
        max_gradient_norm=1,
        learning_rate=param.lr,
    )
    # Stop explicitly once the update budget is spent.
    for done, _ in enumerate(updates, start=1):
        if done >= param.epoch:
            break
def dnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rprop'):
    """Fit the shared input of two fixed dense pathways so their sum matches ``M``.

    Parameters
    ----------
    M : array
        Target; its transpose is held in a float64 Theano shared variable.
    W1, W2 : sequences of arrays
        Per-layer weights of the two pathways, indexed in parallel.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Only used as the single value of the dummy training dataset.
    sp
        Weight of the mean-absolute-value penalty on the output of every
        pathway layer except the last.
    spb
        Second argument handed to ``psoftplus``.
    al
        downhill algorithm name.

    Returns
    -------
    (sum, path1, path2, er) : the transposed network outputs (``eps`` added
    to the sum) and whatever ``downhill_train`` returned.
    """
    def act(v):
        return psoftplus(v, spb)

    n_layers = len(W1)

    # Target data as a shared variable, plus a dummy symbolic input: the loss
    # has no real inputs, but downhill is given one to iterate over.
    _M = theano.shared(M.T.astype(float64))
    dum = Th.vector('dum')

    # Layer sizes per pathway, followed by the common output size.
    K = [[W1[i].shape[0], W2[i].shape[0]] for i in range(n_layers)]
    K.append([M.T.shape[1], M.T.shape[1]])
    width = K[0][0] + K[0][1]

    # Unknown network input, scaled by sqrt(2 / (fan terms)).
    H = theano.shared(
        sqrt(2. / (K[0][0] + K[0][1] + M.shape[1]))
        * random.rand(M.T.shape[0], width).astype(float64))
    fI = InputLayer(shape=(M.T.shape[0], width), input_var=H)

    # One pathway per source: slice the input, then stack the dense layers.
    H1 = [SliceLayer(fI, indices=slice(0, K[0][0]), axis=1)]
    H2 = [SliceLayer(fI, indices=slice(K[0][0], width), axis=1)]
    for i in range(n_layers):
        H1.append(DenseLayer(H1[i], num_units=K[i + 1][0],
                             W=W1[i].astype(float64), nonlinearity=act, b=None))
        H2.append(DenseLayer(H2[i], num_units=K[i + 1][1],
                             W=W2[i].astype(float64), nonlinearity=act, b=None))

    # Sum of the two approximations.
    R = ElemwiseSumLayer([H1[-1], H2[-1]])

    # KL-like divergence to the target plus the penalty on activations.
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) + 0 * Th.mean(dum)
    for layer1, layer2 in zip(H1[:-1], H2[:-1]):
        cost = cost + (sp * Th.mean(abs(get_output(layer1)))
                       + sp * Th.mean(abs(get_output(layer2))))

    # Only H is optimized; the pathway weights are not in ``params``.
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([d]).astype(float64), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    # Evaluate the outputs with a dummy input value.
    _r = nget(R, dum, array([0]).astype(float64)).T + eps
    _r1 = nget(H1[-1], dum, array([0]).astype(float64)).T
    _r2 = nget(H2[-1], dum, array([0]).astype(float64)).T
    return _r, _r1, _r2, er
def End2end_Early_stopping(self, numpy_rng, dataset, n_validate, data_name,
                           batch_size, end2end_lr, algo, norm, patience,
                           validation):
    """Train the network end to end with downhill and early stopping.

    Parameters
    ----------
    numpy_rng
        Random generator handed to both ``downhill.Dataset`` objects.
    dataset
        Triple ``(train_X, test_X, actual)``; only ``train_X`` (an object
        with ``get_value()``) is used here.
    n_validate
        Number of leading rows of the training data held out for validation.
    batch_size
        Mini-batch size of the training dataset.
    algo
        downhill algorithm name.
    patience, validation
        Passed to ``iterate`` as ``patience`` and ``validate_every``.
    data_name, end2end_lr, norm
        Accepted for interface compatibility; not used (the learning rate is
        left at downhill's default).

    Returns
    -------
    list
        ``[epochs_run, last validation loss, last training loss]``.

    Raises
    ------
    RuntimeError
        If the optimizer yields no iteration, so there is no loss to report.
    """
    max_epochs = 1000  # hard cap on the number of training iterations

    train_X, test_X, actual = dataset
    # Read the shared value once; the first n_validate rows are validation.
    all_x = train_X.get_value()
    valid_x = all_x[:n_validate]
    train_x = all_x[n_validate:]

    opt = downhill.build(algo=algo, params=self.params,
                         loss=self.end2end_cost, inputs=[self.x])
    train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
    # The whole validation set is evaluated as a single batch.
    valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

    stop_ep = 0
    tm1 = vm1 = None
    for tm1, vm1 in opt.iterate(train,
                                valid,
                                patience=patience,
                                validate_every=validation,
                                min_improvement=1e-3,
                                momentum=0.0,
                                nesterov=False):
        stop_ep += 1
        if stop_ep >= max_epochs:
            break

    if stop_ep == 0:
        raise RuntimeError("optimizer produced no training iteration")
    return [stop_ep, vm1['loss'], tm1['loss']]
def dnn_model(M, K=(20, 20), hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3,
              bt=0, al='rprop'):
    """Train a multi-layer dense network that reconstructs ``M.T``.

    Parameters
    ----------
    M : array
        Data; rows of ``M.T`` are the training samples.
    K : sequence of int
        Hidden layer sizes (any list or tuple; never modified).
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Argument of the ``DropoutLayer`` placed between consecutive layers.
    wsp, hsp
        Weights of the mean-|W| and mean-activation penalties applied to
        every layer except the last.
    spb
        Either a callable used directly as the activation, or the second
        argument handed to ``psoftplus``.
    bt
        ``batch_size`` of the downhill dataset.
    al
        downhill algorithm name.

    Returns
    -------
    (h, w, er) : per-layer outputs (transposed), per-layer weight values, and
    whatever ``downhill_train`` returned.
    """
    # Any callable is accepted as an activation; ``inspect.isfunction`` would
    # reject callable objects and partials and wrongly pass them to psoftplus.
    if callable(spb):
        act = spb
    else:
        def act(x):
            return psoftplus(x, spb)

    # Symbolic input and input layer.
    _M = Th.matrix('_M')
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # Hidden sizes followed by the output size.
    L = list(K) + [M.T.shape[1]]

    # First layer, then (dropout, dense) pairs for all the rest.
    H = [DenseLayer(I, num_units=K[0], nonlinearity=act, b=None)]
    for k in range(1, len(L)):
        dropped = DropoutLayer(H[k - 1], d)
        H.append(DenseLayer(dropped, num_units=L[k], nonlinearity=act, b=None))

    # KL-like divergence between input and reconstruction, plus penalties.
    Ro = get_output(H[-1]) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
    for layer in H[:-1]:
        cost = cost + wsp * Th.mean(abs(layer.W)) + hsp * Th.mean(get_output(layer))

    # Train with downhill.
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(H[-1]))
    train = downhill.Dataset(M.T.astype(float64), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Layer outputs and weights after training.
    h = [nget(layer, _M, M.T.astype(float64)).T for layer in H]
    w = [layer.W.get_value() for layer in H]
    return h, w, er
def build_rosen(algo, name=True, monitor_gradients=False):
    """Build a downhill optimizer for the Rosenbrock loss of a 2-vector.

    The shared variable starts at (-3, -3) and is named ``'x'`` unless
    ``name`` is falsy. Returns ``(optimizer, [[]])``; the second item is a
    dataset made of one empty batch, as the loss takes no inputs.
    """
    var_name = 'x' if name else None
    x = theano.shared(-3 + np.zeros((2, ), 'f'), name=var_name)

    rosenbrock = (100 * (x[1:] - x[:-1]**2)**2 + (1 - x[:-1])**2).sum()
    monitors = [('x', x[:-1].sum()), ('y', x[1:].sum())]

    optimizer = downhill.build(
        algo,
        loss=rosenbrock,
        monitors=monitors,
        monitor_gradients=monitor_gradients,
    )
    return optimizer, [[]]
def build_rosen(algo, name=True, monitor_gradients=False):
    """Return ``(optimizer, data)`` for minimizing the Rosenbrock function.

    The optimized variable is a float32 2-vector initialized to -3, named
    ``'x'`` only when ``name`` is truthy. ``data`` is ``[[]]`` (a single
    empty batch) because the loss depends on no input.
    """
    initial = -3 + np.zeros((2, ), 'f')
    x = theano.shared(initial, name='x' if name else None)

    loss = (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum()
    optimizer = downhill.build(
        algo,
        loss=loss,
        # Expose both coordinates of the iterate as monitors.
        monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())],
        monitor_gradients=monitor_gradients,
    )
    data = [[]]
    return optimizer, data
def downhill_models(M, P, FE, z, K=20, hh=.001, ep=5000, dp=0, wsp=.001, plt=False):
    """Train a two-layer non-negative autoencoder on ``M`` and score its output.

    Parameters
    ----------
    M : array
        Input data, cast to ``theano.config.floatX``.
    P, FE
        ``FE.ife(reconstruction, P)`` converts the reconstruction to the
        signal that is scored.
    z
        Reference passed (wrapped in an array) to ``bss_eval``.
    K
        Hidden layer size.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    dp
        Dropout rate on the hidden layer; dropout is only built when > 0.
    wsp
        Weight of the mean-|W1| penalty.
    plt
        Unused.

    Returns
    -------
    (W1 value, sxr) : the trained second-layer weights and the result of
    ``bss_eval``.
    """
    from paris.signal import bss_eval

    floatX = theano.config.floatX
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Symbolic input, shared target, and dropout rate. The rate is shared so
    # it can be switched off after training.
    x = Th.matrix('x')
    y = theano.shared(M.astype(floatX))
    d = theano.shared(float32(dp))

    # Network weights, scaled by sqrt(2 / (fan_in + fan_out)).
    W0 = theano.shared(
        sqrt(2. / (K + M.shape[0])) * random.randn(K, M.shape[0]).astype(floatX))
    W1 = theano.shared(
        sqrt(2. / (K + M.shape[0])) * random.randn(M.shape[0], K).astype(floatX))

    # First layer maps to a non-negative subspace.
    h = psoftplus(W0.dot(x), 3.)

    # Dropout with rescaling by 1 / (1 - d).
    if dp > 0:
        keep = (rng.uniform(size=h.shape) > d).astype(floatX)
        h = h * (1. / (1. - d) * keep).astype(floatX)

    # Second layer reconstructs the input.
    r = psoftplus(W1.dot(h), 3.)

    # KL-like divergence to the target plus the penalty on W1.
    cost = (Th.mean(y * (Th.log(y + eps) - Th.log(r + eps)) - y + r)
            + wsp * Th.mean(abs(W1)))

    opt = downhill.build('rprop', loss=cost, inputs=[x], params=[W0, W1])
    train = downhill.Dataset(M.astype(floatX), batch_size=0)
    downhill_train(opt, train, hh, ep, None)

    # Switch dropout off for the final reconstruction. The shared value must
    # be updated: rebinding the Python name (``d = 0``) leaves the graph
    # untouched. With a rate of 0 the mask keeps everything and the scale is 1.
    d.set_value(float32(0))
    _, _r = theano.function(inputs=[x], outputs=[h, r], updates=[])(M.astype(floatX))

    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))
    return W1.get_value(), sxr
def build_rosen(algo):
    """Build a downhill optimizer for the Rosenbrock loss of a 2-vector.

    The parameter, inputs (none) and updates (none) are passed explicitly.
    Returns ``(optimizer, [[]])``, the second item being a dataset made of
    a single empty batch.
    """
    x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x')
    rosenbrock = (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum()
    settings = dict(
        loss=rosenbrock,
        params=[x],
        inputs=[],
        updates=(),
        monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())],
    )
    return downhill.build(algo, **settings), [[]]
def cnn_model(M, K=20, T=1, hh=.0001, ep=5000, d=0, hsp=0.0001, wsp=0, spb=3,
              bt=0, al='rprop'):
    """Train a two-layer 1-D convolutional autoencoder on ``M``.

    Parameters
    ----------
    M : array
        2-D input; reshaped to ``(1, M.shape[0], M.shape[1])`` as float32.
    K, T
        Number of filters of the hidden layer and filter size of both layers.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    hsp
        Weight of the mean hidden-activation penalty.
    spb
        Second argument handed to ``psoftplus``.
    bt
        ``batch_size`` of the downhill dataset.
    al
        downhill algorithm name.
    d, wsp
        Unused.

    Returns
    -------
    (reconstruction, output-layer weights, er, hidden state), where ``er``
    is whatever ``downhill_train`` returned.

    Note: this overwrites three ``theano.config.dnn.conv`` settings globally.
    """
    # Select the convolution implementations (process-wide side effect).
    conv_cfg = theano.config.dnn.conv
    conv_cfg.algo_fwd = 'fft_tiling'
    conv_cfg.algo_bwd_filter = 'none'
    conv_cfg.algo_bwd_data = 'none'

    def act(v):
        return psoftplus(v, spb)

    # Input as a single-item batch.
    M3 = reshape(M.astype(float32), (1, M.shape[0], M.shape[1]))
    _M = Th.tensor3('_M')
    I = InputLayer(shape=M3.shape, input_var=_M)

    # Analysis layer followed by the synthesis layer.
    H = Conv1DLayer(I, filter_size=T, num_filters=K, pad='same',
                    nonlinearity=act, b=None)
    R = Conv1DLayer(H, filter_size=T, num_filters=M.shape[0], pad='same',
                    nonlinearity=act, b=None)

    # KL-like divergence plus the hidden-activation penalty.
    Ro = get_output(R) + eps
    divergence = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
    cost = divergence + hsp * Th.mean(get_output(H))

    # Train with downhill.
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M3, batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Approximation and hidden state.
    _r = squeeze(nget(R, _M, M3))
    _h = squeeze(nget(H, _M, M3))
    return _r, R.W.get_value(), er, _h
def End2end_Early_stopping(self, numpy_rng, dataset, n_validate, data_name,
                           batch_size, end2end_lr, algo, norm, patience,
                           validation):
    """Train end to end with downhill, then plot and save the loss history.

    The first ``n_validate`` rows of ``dataset[0].get_value()`` form the
    validation set, the rest the training set. At most 1000 iterations are
    run. The history, rows of ``(epoch, validation loss, training loss)``,
    is passed to ``Plotting_End2End_RE`` and written to
    ``path + data_name + "_training_error1.csv"`` (``path`` is a name from
    the enclosing module). ``end2end_lr`` and ``norm`` are unused.

    Returns the last row of the history.
    """
    train_X, test_X, actual = dataset
    samples = train_X.get_value()
    valid_x, train_x = samples[:n_validate], samples[n_validate:]

    # Supported algo names include: adadelta, adagrad, adam, esgd, nag,
    # rmsprop, rprop, sgd.
    opt = downhill.build(algo=algo, params=self.params,
                         loss=self.end2end_cost, inputs=[self.x])
    train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
    valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

    monitor_stream = opt.iterate(
        train,
        valid,
        patience=patience,
        validate_every=validation,
        min_improvement=1e-3,
        momentum=0.0,
        nesterov=False,
    )

    stop_ep = 0
    RE = np.empty([0, 3])
    for tm, vm in monitor_stream:
        stop_ep += 1
        row = np.column_stack([stop_ep, vm['loss'], tm['loss']])
        # np.append flattens; the history is reshaped once after the loop.
        RE = np.append(RE, row)
        if stop_ep >= 1000:
            break
    RE = np.reshape(RE, (-1, 3))

    Plotting_End2End_RE(RE, stop_ep, 0.0, 0.4, data_name, path)
    np.savetxt(path + data_name + "_training_error1.csv", RE,
               delimiter=",", fmt='%f')

    np.set_printoptions(precision=6, suppress=True)
    final_row = RE[stop_ep - 1]
    print("\n ", final_row)
    return final_row
def lasagne_models(M, P, FE, z, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, plt=True):
    """Train a two-layer Lasagne autoencoder on ``M.T`` and score its output.

    Parameters
    ----------
    M : array
        Data; ``M.T`` (float32) is both the network input and the target.
    P, FE
        ``FE.ife(reconstruction, P)`` converts the reconstruction to the
        signal that is scored.
    z
        Reference passed (wrapped in an array) to ``bss_eval``.
    K
        Hidden layer size.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Argument of the ``DropoutLayer`` after the hidden layer.
    wsp
        Weight of the mean-|W| penalty on the output layer.
    plt
        Unused.

    Returns
    -------
    (R, sxr) : the output layer object and the result of ``bss_eval``.
    """
    from paris.signal import bss_eval

    def act(x):
        return psoftplus(x, 3.)

    # Symbolic input and input layer.
    _M = Th.matrix('_M')
    I = InputLayer(shape=M.T.shape, input_var=_M)

    # Hidden layer, optional dropout, output layer.
    H0 = DenseLayer(I, num_units=K, nonlinearity=act, b=None)
    H = DropoutLayer(H0, d)
    R = DenseLayer(H, num_units=M.T.shape[1], nonlinearity=act, b=None)

    # Build the training output ONCE. Each get_output() call builds a new
    # stochastic graph, so calling it twice would put two independent dropout
    # masks inside the same divergence when d > 0.
    Ro = get_output(R)
    cost = ((_M * (Th.log(_M + eps) - Th.log(Ro + eps)) - _M + Ro).mean()
            + wsp * Th.mean(abs(R.W)))

    # Train with downhill.
    opt = downhill.build('rprop', loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float32), batch_size=0)
    downhill_train(opt, train, hh, ep, None)

    # Reconstruction and its score.
    _r = nget(R, _M, M.T.astype(float32)).T
    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))
    return R, sxr
def pretrain_Early_stopping(self, numpy_rng, train_set, n_validate, data_name,
                            batch_size, pre_lr, corruptions):
    """Pre-train every dA layer with SGD (via downhill) and plot the losses.

    For layer ``i`` the training data is ``train_set.get_value()`` (``i == 0``)
    or ``self.get_hidden_i(train_set, i - 1)``; its first ``n_validate`` rows
    are used for validation. Each layer runs for at most 200 iterations. The
    per-iteration training losses and the stopping epochs are passed to
    ``Plotting_Pre_RE1`` together with ``path``, a name from the enclosing
    module.

    NOTE(review): the source's indentation was lost; the final print/plot is
    assumed to run once, after all layers -- confirm.
    """
    max_records = 10000
    # Uninitialized buffers: only the first ``stop_epoch[i]`` entries of
    # column i are written below.
    RE = np.empty([max_records, self.n_layers])
    stop_epoch = np.empty([self.n_layers])

    for i in range(self.n_layers):
        layer = self.dA_layers[i]
        cost, _ = layer.get_cost_updates(corruptions[i], pre_lr)

        # Raw data for the first layer, previous layer's hidden code after.
        train_x1 = (train_set.get_value() if i == 0
                    else self.get_hidden_i(train_set, i - 1))
        valid_x, train_x = train_x1[:n_validate], train_x1[n_validate:]

        # Other downhill algorithms: adadelta, adagrad, adam, esgd, nag,
        # rmsprop, rprop.
        opt = downhill.build(algo='sgd', params=layer.params, loss=cost)
        train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
        valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

        monitor_stream = opt.iterate(
            train,
            valid,
            patience=100,
            validate_every=5,
            min_improvement=1e-3,
            learning_rate=pre_lr,
            momentum=0.0,
            nesterov=False,
        )

        epoch = 0
        re = np.empty([max_records])
        for tm1, _vm1 in monitor_stream:
            re[epoch] = tm1['loss']
            epoch += 1
            if epoch == 200:
                break

        RE[:, i] = re
        stop_epoch[i] = epoch

    print(' + Stopping epoch:', stop_epoch)
    Plotting_Pre_RE1(RE, stop_epoch, self.n_layers, 0.0, 0.1, batch_size,
                     data_name, path)
def nn_model(M, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3, bt=0,
             al='rprop'):
    """Train a two-layer dense autoencoder on ``M.T``.

    Parameters
    ----------
    M : array
        Data; rows of ``M.T`` are the training samples.
    K
        Hidden layer size.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Argument of the ``DropoutLayer`` after the hidden layer.
    wsp, hsp
        Weights of the penalties on ``R.W[0]`` and on the hidden activation.
    spb
        Either a callable used directly as the activation, or the second
        argument handed to ``psoftplus``.
    bt
        ``batch_size`` of the downhill dataset.
    al
        downhill algorithm name.

    Returns
    -------
    (reconstruction, output-layer weights, er, hidden state), where ``er``
    is whatever ``downhill_train`` returned.
    """
    # Any callable is accepted as an activation; ``inspect.isfunction`` would
    # reject callable objects and partials and wrongly pass them to psoftplus.
    if callable(spb):
        act = spb
    else:
        def act(x):
            return psoftplus(x, spb)

    # Symbolic input and input layer.
    _M = Th.matrix('_M')
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # Hidden layer, optional dropout, output layer.
    H0 = DenseLayer(I, num_units=K, nonlinearity=act, b=None)
    H = DropoutLayer(H0, d)
    R = DenseLayer(H, num_units=M.T.shape[1], nonlinearity=act, b=None)

    # KL-like divergence plus penalties.
    # NOTE(review): the weight penalty only covers R.W[0] (one slice of the
    # weight matrix); kept as in the original -- confirm this is intended.
    Ro = get_output(R) + eps
    cost = (Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
            + wsp * Th.mean(abs(R.W[0]))
            + hsp * Th.mean(get_output(H0)))

    # Train with downhill.
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float64), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Approximation and hidden state.
    _r = nget(R, _M, M.T.astype(float64)).T
    _h = nget(H, _M, M.T.astype(float64)).T
    return _r, R.W.get_value(), er, _h
def rnn_model(M, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3, bt=0,
              al='rmsprop', t=5):
    """Train a dense layer followed by a recurrent layer to reconstruct ``M.T``.

    Parameters
    ----------
    M : array
        Data; ``M.T`` (float32) is the training set.
    K
        Size of the dense hidden layer.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Argument of the ``DropoutLayer`` after the hidden layer.
    hsp
        Weight of the mean hidden-activation penalty.
    spb
        Second argument handed to ``psoftplus``.
    bt
        ``batch_size`` of the downhill dataset.
    al
        downhill algorithm name.
    t
        ``gradient_steps`` of the recurrent layer.
    wsp
        Unused.

    Returns
    -------
    (reconstruction, (input-to-hidden weights, hidden-to-hidden weights), er,
    hidden state), where ``er`` is whatever ``downhill_train`` returned.
    """
    def act(v):
        return psoftplus(v, spb)

    # Symbolic input and input layer.
    _M = Th.matrix('_M')
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # Dense layer, optional dropout, recurrent output layer.
    H0 = DenseLayer(I, num_units=K, nonlinearity=act, b=None)
    H = DropoutLayer(H0, d)
    R = RecurrentLayer(H, num_units=M.T.shape[1], nonlinearity=act,
                       gradient_steps=t, b=None)

    # KL-like divergence plus the hidden-activation penalty.
    Ro = get_output(R) + eps
    divergence = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
    cost = divergence + hsp * Th.mean(get_output(H0))

    # Train with downhill.
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float32), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Approximation and hidden state.
    _r = nget(R, _M, M.T.astype(float32)).T
    _h = nget(H, _M, M.T.astype(float32)).T
    weights = (R.W_in_to_hid.get_value(), R.W_hid_to_hid.get_value())
    return _r, weights, er, _h
def build_factor(algo):
    """Build a downhill optimizer for a matrix-factorization loss, plus data.

    The loss is the summed squared error between the input matrix ``x`` and
    ``dot(u, v)``, with ``u`` (100x10) and ``v`` (10x100) as shared
    variables. Returns ``(optimizer, batches)`` where ``batches`` holds ten
    single-matrix batches: a fixed 100x100 product plus fresh Gaussian noise.
    """
    a = np.arange(1000).reshape((100, 10)).astype('f')
    b = 0.1 + np.zeros((10, 100), 'f')

    x = TT.matrix('x')
    u = theano.shared(a, name='u')
    # v starts at b + 0.1, not at b itself.
    v = theano.shared(0.1 + b, name='v')

    squared_error = TT.sum(TT.sqr(x - TT.dot(u, v)))
    monitors = [
        ('u<1', (u < 1).mean()),
        ('u<-1', (u < -1).mean()),
        ('v<1', (v < 1).mean()),
        ('v<-1', (v < -1).mean()),
    ]
    optimizer = downhill.build(algo, loss=squared_error, monitors=monitors)

    batches = []
    for _ in range(10):
        noisy = np.dot(a, b) + np.random.randn(100, 100).astype('f')
        batches.append([noisy])
    return optimizer, batches
def build_model(algo):
    """Reset the network weights, train with ``algo``, return validation losses.

    The shared variables ``W1``, ``b1``, ``W2``, ``b2`` (from the enclosing
    module) are reset to their saved initial values first. Training uses all
    but the last 1000 examples, one example per batch and one batch per
    iteration; the last 1000 examples are the validation set. The returned
    list holds the validation loss reported at each of the (at most 1001)
    iterations.
    """
    # Restore the initial parameters, in the original order.
    for shared_var, initial in ((W1, W1_val), (b1, b1_val),
                                (W2, W2_val), (b2, b2_val)):
        shared_var.set_value(initial)

    opt = downhill.build(algo, loss=loss)
    train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]],
                             batch_size=1, iteration_size=1)
    valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]])

    loss_value = []
    monitor_stream = opt.iterate(train, valid, patience=1000)
    for iterations, (_tm, vm) in enumerate(monitor_stream, start=1):
        loss_value.append(vm['loss'])
        if iterations > 1000:
            break
    return loss_value
def build(algo, init):
    '''Build and return an optimizer for a noisy Rosenbrock function.

    ``downhill.build()`` is given the loss to minimize and the parameter to
    update. The loss is the Rosenbrock function of a shared variable
    initialized from ``init``, with Gaussian noise scaled by 0.1 added to
    each term. When ``init`` has exactly two entries, monitors named ``'x'``
    and ``'y'`` expose the current location during optimization.
    '''
    x = theano.shared(np.array(init, 'f'), name='x')
    noise = 0.1 * RandomStreams().normal((len(init) - 1, ))

    # The x / y locations are only monitored in the two-dimensional case.
    monitors = []
    if len(init) == 2:
        monitors += [('x', x[:-1].sum()), ('y', x[1:].sum())]

    noisy_rosenbrock = (
        noise + 100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2
    ).sum()
    return downhill.build(
        algo,
        loss=noisy_rosenbrock,
        params=[x],
        monitors=monitors,
        monitor_gradients=True,
    )
def build(algo, init):
    '''Build and return an optimizer for the Rosenbrock function.

    ``downhill.build()`` is given the loss to minimize, the parameter to
    update, and an empty input list (the loss depends on no data). The
    parameter is a shared variable initialized from ``init``. When ``init``
    has exactly two entries, monitors named ``'x'`` and ``'y'`` expose the
    current location during optimization.
    '''
    x = theano.shared(np.array(init, 'f'), name='x')

    # The x / y locations are only monitored in the two-dimensional case.
    track_location = len(init) == 2
    monitors = ([('x', x[:-1].sum()), ('y', x[1:].sum())]
                if track_location else [])

    rosenbrock = (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum()
    return downhill.build(
        algo,
        loss=rosenbrock,
        params=[x],
        inputs=[],
        monitors=monitors,
        monitor_gradients=True,
    )
def itertrain(self, train, valid=None, **kwargs):
    '''Train the network, yielding monitor values after every iteration.

    A downhill optimizer is built from the network's loss, updates,
    monitors, variables and parameters, then iterated over the datasets.
    Whatever the optimizer's ``iterate`` produces is yielded unchanged: a
    pair of monitor dictionaries, one for the training set and one for the
    validation set. The validation monitors may not be refreshed on every
    iteration, in which case the most recent ones are yielded again.

    Parameters
    ----------
    train : :class:`Dataset <theanets.dataset.Dataset>`
        Training data used to compute parameter updates.
    valid : :class:`Dataset <theanets.dataset.Dataset>`
        Validation data used for monitor values and for deciding when the
        loss has stopped improving.
    **kwargs
        Passed to the network's ``loss``, ``updates`` and ``monitors``
        methods and to the optimizer's ``iterate``; ``monitor_gradients``
        (default False) is also passed to ``downhill.build``.

    Yields
    ------
    training : dict
        Monitor names mapped to values on the training dataset.
    validation : dict
        Monitor values evaluated on the validation dataset.
    '''
    network = self.network
    optimizer = downhill.build(
        algo=self.algo,
        loss=network.loss(**kwargs),
        updates=network.updates(**kwargs),
        monitors=network.monitors(**kwargs),
        inputs=network.variables,
        params=network.params,
        monitor_gradients=kwargs.get('monitor_gradients', False),
    )
    for monitors in optimizer.iterate(train, valid=valid, **kwargs):
        yield monitors
import downhill import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import numpy as np import theano x = theano.shared(np.array([-1, 0], 'f'), name='x') opt = downhill.build('nag', loss=(100 * (x[1:] - x[:-1]**2)**2 + (1 - x[:-1])**2).sum(), params=[x], inputs=[], monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], monitor_gradients=True) xs, ys, loss = [], [], [] for tm, _ in opt.iterate([[]], learning_rate=0.001, momentum=0.95, max_gradient_norm=100): xs.append(tm['x']) ys.append(tm['y']) loss.append(tm['loss']) if len(loss) == 300: break ax = plt.axes(projection='3d') c = '#d62728' ax.plot(xs,
def downhill_separate(M, P, FE, W1, W2, z1, z2, hh=.001, ep=5000, d=0,
                      wsp=.0001, plt=True):
    """Separate ``M`` into two sources using two fixed dictionaries.

    The activations ``h1`` and ``h2`` are learned so that
    ``psoftplus(W1.h1) + psoftplus(W2.h2)`` approximates ``M``; the two
    reconstructions are then used as ratio masks on ``M``.

    Parameters
    ----------
    M : array
        Mixture data.
    P, FE
        ``FE.ife(masked, P)`` converts each masked source to the signal that
        is scored.
    W1, W2 : arrays
        Dictionaries of the two sources; they are not trained.
    z1, z2
        References, stacked and passed to ``bss_eval``.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    d
        Dropout rate applied to the dictionaries during training only.
    wsp
        Weight of the mean-|h| penalty on the activations.
    plt
        Unused.

    Returns
    -------
    (o1, o2, scores) : the two separated outputs and the average of the first
    three and remaining entries of the combined ``bss_eval`` results.
    """
    from paris.signal import bss_eval

    floatX = theano.config.floatX

    # Dictionary sizes.
    K = [W1.shape[1], W2.shape[1]]

    # Symbolic target and shared dictionaries.
    y = Th.matrix('y')
    w1 = theano.shared(W1.astype(floatX), 'w1')
    w2 = theano.shared(W2.astype(floatX), 'w2')

    # Activations to learn.
    h1 = theano.shared(
        sqrt(2. / (K[0] + M.shape[1])) * random.randn(K[0], M.shape[1]).astype(floatX))
    h2 = theano.shared(
        sqrt(2. / (K[1] + M.shape[1])) * random.randn(K[1], M.shape[1]).astype(floatX))

    # Dropout on the dictionaries, for the training graph only.
    # NOTE(review): ``rng`` is not defined in this function; it must exist at
    # module level for d > 0 to work -- confirm.
    if d > 0:
        dw1 = w1 * 1. / (1. - d) * (rng.uniform(size=w1.shape) > d).astype(floatX)
        dw2 = w2 * 1. / (1. - d) * (rng.uniform(size=w2.shape) > d).astype(floatX)
    else:
        dw1 = w1
        dw2 = w2

    # Training-time approximation of the input.
    r1 = psoftplus(dw1.dot(h1), 3.)
    r2 = psoftplus(dw2.dot(h2), 3.)
    r = r1 + r2

    # KL-like divergence to the input plus the penalty on the activations.
    cost = (Th.mean(y * (Th.log(y + eps) - Th.log(r + eps)) - y + r)
            + wsp * (Th.mean(abs(h1)) + Th.mean(abs(h2))))

    # Evaluation graph built from the clean dictionaries. Resetting the Python
    # value of ``d`` after training cannot alter an already built graph, so
    # the dropout-free reconstructions are constructed explicitly. For d == 0
    # this is the same computation as the training graph.
    eval_r1 = psoftplus(w1.dot(h1), 3.)
    eval_r2 = psoftplus(w2.dot(h2), 3.)
    ffwd_f = theano.function(inputs=[], outputs=[eval_r1, eval_r2], updates=[])

    # Only the activations are optimized.
    opt = downhill.build('rprop', loss=cost, inputs=[y], params=[h1, h2])
    train = downhill.Dataset(M.astype(floatX), batch_size=0)
    downhill_train(opt, train, hh, ep, None)

    # Ratio-mask the mixture with each source's share of the reconstruction.
    _r1, _r2 = ffwd_f()
    _r = _r1 + _r2 + eps
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)

    refs = vstack((z1, z2))
    sxr = bss_eval(o1, 0, refs) + bss_eval(o2, 1, refs)

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
batch_size = 1 z1 = X.dot(W1) + b1 a1 = T.tanh(z1) z2 = a1.dot(W2) + b2 y_hat = T.nnet.softmax(z2) loss_reg = 1. / batch_size * reg_lambda / 2 * (T.sum(T.sqr(W1)) + T.sum(T.sqr(W2))) loss = T.nnet.categorical_crossentropy(y_hat, y).mean() + loss_reg prediction = T.argmax(y_hat, axis=1) predict = theano.function([X], prediction) train_loss = [] validation_loss = [] opt = downhill.build('adadelta', loss=loss) train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]], batch_size=batch_size, iteration_size=1) valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]]) iterations = 0 for tm, vm in opt.iterate(train, valid, patience=1000): iterations += 1 train_loss.append(tm['loss']) validation_loss.append(vm['loss']) if iterations > 5000: break x_min, x_max = train_X[:, 0].min() - 0.5, train_X[:, 0].max() + 0.5 y_min, y_max = train_X[:, 1].min() - 0.5, train_X[:, 1].max() + 0.5 x_mesh, y_mesh = numpy.meshgrid(numpy.arange(x_min, x_max, 0.01),
batch_size = 1 #Our Loss function z1 = X.dot(W1) + b1 a1 = T.tanh(z1) z2 = a1.dot(W2) + b2 y_hat = T.nnet.softmax(z2) loss_reg = 1./batch_size * reg_lambda/2 * (T.sum(T.sqr(W1)) + T.sum(T.sqr(W2))) loss = T.nnet.categorical_crossentropy(y_hat, y).mean() + loss_reg prediction = T.argmax(y_hat, axis=1) predict = theano.function([X], prediction) #Store the training and vlidation loss train_loss = [] validation_loss = [] opt = downhill.build('sgd', loss=loss) #Set up training and validation dataset splits, use only one example in a batch #and use only one batch per step/epoc #Use everything except last 1000 examples for training train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]], batch_size=batch_size, iteration_size=1) #Use last 1000 examples for valudation valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]]) #SGD iterations = 0 for tm, vm in opt.iterate(train, valid, patience=10000): iterations += 1 # Record the training and validation loss train_loss.append(tm['loss']) validation_loss.append(vm['loss']) if iterations > 1000:
def cnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rprop'):
    """Fit the input of two fixed 1-D convolutional layers so their sum matches ``M``.

    Parameters
    ----------
    M : array
        2-D target; reshaped to ``(1, M.shape[0], M.shape[1])`` as float32.
    W1, W2 : arrays
        Convolution weights of the two pathways; ``shape[1]`` gives each
        pathway's number of input channels and ``W1.shape[2]`` the filter size.
    hh, ep
        Forwarded to ``downhill_train`` (presumably step size and epoch
        count -- confirm against that helper).
    sp
        Weight of the mean-|H| penalty on the learned input.
    spb
        Second argument handed to ``psoftplus``.
    al
        downhill algorithm name.
    d
        Unused.

    Returns
    -------
    (sum, path1, path2, er) : the squeezed layer outputs (``eps`` added to the
    sum) and whatever ``downhill_train`` returned.

    Note: this overwrites three ``theano.config.dnn.conv`` settings globally.
    """
    # Select the convolution implementations (process-wide side effect).
    conv_cfg = theano.config.dnn.conv
    conv_cfg.algo_fwd = 'fft_tiling'
    conv_cfg.algo_bwd_filter = 'none'
    conv_cfg.algo_bwd_data = 'none'

    def act(v):
        return psoftplus(v, spb)

    # Target as a single-item batch held in a shared variable.
    M3 = reshape(M.astype(float32), (1, M.shape[0], M.shape[1]))
    _M = theano.shared(M3.astype(float32))

    # Dictionary shapes.
    K = [W1.shape[1], W2.shape[1]]
    T = W1.shape[2]
    channels = K[0] + K[1]

    # Unknown network input: this is what gets optimized.
    H = theano.shared(
        sqrt(2. / (K[0] + K[1] + M.shape[1]))
        * random.rand(1, channels, M.T.shape[0]).astype(float32))
    fI = InputLayer(shape=(1, channels, M.T.shape[0]), input_var=H)

    # Two pathways, each with its own slice of channels and convolution weights.
    H1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    H2 = SliceLayer(fI, indices=slice(K[0], channels), axis=1)
    R1 = Conv1DLayer(H1, filter_size=T, W=W1, num_filters=M.shape[0],
                     pad='same', nonlinearity=act, b=None)
    R2 = Conv1DLayer(H2, filter_size=T, W=W2, num_filters=M.shape[0],
                     pad='same', nonlinearity=act, b=None)

    # Sum of the two approximations.
    R = ElemwiseSumLayer([R1, R2])

    # The loss has no real inputs; ``dum`` gives downhill something to feed.
    dum = Th.vector('dum')
    Ro = get_output(R) + eps
    cost = (Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
            + 0 * Th.mean(dum)
            + sp * Th.mean(abs(H)))

    # Only H is optimized.
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([0]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    # Evaluate the outputs with a dummy input value.
    _r = squeeze(nget(R, dum, array([0]).astype(float32))) + eps
    _r1 = squeeze(nget(R1, dum, array([0]).astype(float32)))
    _r2 = squeeze(nget(R2, dum, array([0]).astype(float32)))
    return _r, _r1, _r2, er
def build_rosen(algo):
    """Return ``(optimizer, data)`` for minimizing the Rosenbrock function.

    The optimized variable is a float32 2-vector named ``'x'``, initialized
    to -3. ``data`` is ``[[]]``, a dataset made of a single empty batch.
    """
    x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x')
    rosenbrock = (100 * (x[1:] - x[:-1]**2)**2 + (1 - x[:-1])**2).sum()
    # Expose both coordinates of the iterate as monitors.
    location = [('x', x[:-1].sum()), ('y', x[1:].sum())]
    optimizer = downhill.build(algo, loss=rosenbrock, monitors=location)
    return optimizer, [[]]
def nn_model_ae(x, Kx, learning_rate=.001, ep=5000, dp=0.0, spb=3, al='rprop'):
    """Train a two-layer non-negative autoencoder on ``x`` and plot the result.

    Parameters
    ----------
    x : array
        Input data, cast to float64.
    Kx
        Size of the latent representation.
    learning_rate
        Learning rate passed to the optimizer's ``iterate``.
    ep
        Used as both ``max_updates`` and ``patience``.
    dp
        Dropout rate on the latent representation; dropout is only built
        when > 0.
    spb
        Second argument handed to ``psoftplus``.
    al
        downhill algorithm name.

    Returns
    -------
    (w, z) : the second-layer weights and the reconstruction of ``x``.

    The training cost is plotted at most every 4 seconds, and a four-panel
    summary figure is drawn at the end.
    """
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    cst = []

    def pl():
        # Plot the cost history; nothing to show before the first update.
        if not cst:
            return
        clf()
        gcf().set_size_inches(6, 2)
        semilogy(cst)
        grid('on')
        title('Cost: %f, Epoch: %d' % (cst[-1], len(cst)))
        drawnow()

    # Dropout rate.
    d = theano.shared(float64(dp))

    # Symbolic input.
    X = theano.tensor.matrix('X')

    # Weight matrices.
    W1x = theano.shared(random.rand(Kx, x.shape[0]).astype(float64))
    W2x = theano.shared(random.rand(x.shape[0], Kx).astype(float64))

    # Latent representation.
    Hx = psoftplus(W1x.dot(X), spb)

    # Dropout with rescaling by 1 / (1 - d).
    if dp > 0:
        keep = (rng.uniform(size=Hx.shape) > d).astype(theano.config.floatX)
        Hx = Hx * (1. / (1. - d) * keep).astype(theano.config.floatX)

    # Reconstruction.
    Zx = psoftplus(W2x.dot(Hx), spb)

    # KL-like divergence plus penalties on W2x (squared) and on Hx.
    tt = theano.tensor
    cost = (tt.mean(X * (tt.log(X + eps) - tt.log(Zx + eps)) - X + Zx)
            + 1 * tt.mean(abs(W2x)**2)
            + 0.01 * tt.mean(abs(Hx)))

    opt = downhill.build(al, loss=cost, params=[W1x, W2x], inputs=[X])
    train = downhill.Dataset(x.astype(float64), batch_size=x.shape[0])

    # Train, refreshing the progress plot at most every 4 seconds. The
    # caller's learning_rate is honoured; it used to be ignored in favour of
    # a hard-coded .001, which is also the default.
    lt = time.time()
    for tm, _ in opt.iterate(train, learning_rate=learning_rate,
                             max_updates=ep, patience=ep):
        cst.append(tm['loss'])
        if time.time() - lt > 4:
            pl()
            lt = time.time()
    pl()

    # Evaluate the trained model and show a summary.
    nn_nmf = theano.function(inputs=[X], outputs=[Zx, Hx, W2x], updates=[])
    z, h, w = nn_nmf(x.astype(float64))

    subplot(2, 1, 1)
    imagesc(x**.4)
    title('Input 1')
    subplot(2, 1, 2)
    imagesc(z**.4)
    title('Approximation')
    subplot(2, 2, 3)
    plot(W2x.get_value())
    title('NN bases')
    subplot(2, 2, 4)
    plot(h.T)
    title('Latent representation')
    tight_layout()

    return w, z
import climate import downhill import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import numpy as np import theano climate.enable_default_logging() x = theano.shared(np.array([-1, 0], 'f'), name='x') opt = downhill.build( 'nag', loss=(100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(), params=[x], inputs=[], monitors=[('x', x[:-1].sum()), ('y', x[1:].sum())], monitor_gradients=True) xs, ys, loss = [], [], [] for tm, _ in opt.iterate([[]], learning_rate=0.001, momentum=0.95, max_gradient_norm=100): xs.append(tm['x']) ys.append(tm['y']) loss.append(tm['loss']) if len(loss) == 300: break ax = plt.axes(projection='3d')
def train(data_dir='data/smrt/',
          dim_proj=64,
          dim_att=32,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=100,
          test_freq=100,
          saveto_file='params.npz',
          tmsaveto_file='timeparams.npz',
          weight_decay=0.0005,
          sigmasqr=1,
          tdim=1.,
          reload_model=True,
          train=True):
    """
    MRSRMTPP model training.

    All keyword arguments are captured into an ``options`` dict that is
    passed to the model-building helpers and pickled next to the saved
    parameters.

    data_dir: directory holding the graph, the examples and the saved
        parameter files.
    tdim: scale time down by how many times.
    reload_model: load previously saved parameters before building.
    train: when false, only the model and the test data are set up.

    Every ``save_freq`` steps the model is evaluated on the validation set.
    Parameters are saved when the score improves on the best so far, and
    training stops (test scores are printed, 0 is returned) once the relative
    change of the validation score drops below 0.1%.

    NOTE(review): the source's indentation was lost; the nesting below (the
    early-stop test sits beside, not inside, the "new best" branch) is the
    most plausible reading -- confirm.
    """
    # Must stay the first statement so that only the arguments are captured.
    options = locals().copy()

    saveto = data_dir + saveto_file
    tmsaveto = data_dir + tmsaveto_file

    # for early stopping
    best_map = 0
    prev_map = 0.001

    # loads graph
    Gp, Gs, Gi, node_index = data_utilsSMRT.load_graph(data_dir)
    options['n_events'] = len(node_index)

    print(options)

    # creates and initializes shared variables.
    print('Initializing variables...')
    params = init_params(options)
    if reload_model:
        print('reusing saved model.')
        load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    if reload_model:
        print('reusing saved model.')
        load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds MRSRMTPP model
    print('Building model...')
    model = tpgruSMRT_model.build_model(tparams, timetparams, options)

    print('Loading test data...')
    test_examples = data_utilsSMRT.load_examples(data_dir,
                                                 dataset='test',
                                                 node_index=node_index,
                                                 maxlen=maxlen,
                                                 Gp=Gp,
                                                 Gs=Gs,
                                                 Gi=Gi)
    test_loader = data_utilsSMRT.Loader(test_examples, options=options)
    print('Loaded %d test examples' % len(test_examples))

    if not train:
        return

    # prepares training data.
    print('Loading train data...')
    train_examples = data_utilsSMRT.load_examples(data_dir,
                                                  dataset='train',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  Gp=Gp,
                                                  Gs=Gs,
                                                  Gi=Gi)
    train_loader = data_utilsSMRT.Loader(train_examples, options=options)
    print('Loaded %d training examples.' % len(train_examples))

    print('Loading valid data...')
    # NOTE(review): unlike the train/test calls, Gs and Gi are not passed
    # here; kept as in the original -- confirm this is intended.
    valid_examples = data_utilsSMRT.load_examples(data_dir,
                                                  dataset='valid',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  Gp=Gp)
    valid_loader = data_utilsSMRT.Loader(valid_examples, options=options)
    print('Loaded %d validation examples.' % len(valid_examples))

    # compiles updates: one Adam optimizer per cost, each with its own
    # parameter set and learning rate.
    optimizer = downhill.build(algo='adam',
                               loss=model['cost'],
                               params=tparams.values(),
                               inputs=model['data'])
    updates = optimizer.get_updates(max_gradient_elem=5.,
                                    learning_rate=learning_rate)
    f_update = theano.function(model['data'],
                               model['cost'],
                               updates=list(updates))

    toptimizer = downhill.build(algo='adam',
                                loss=model['timecost'],
                                params=timetparams.values(),
                                inputs=model['timedata'])
    tupdates = toptimizer.get_updates(max_gradient_elem=5.,
                                      learning_rate=0.0001)
    f_t_update = theano.function(model['timedata'],
                                 model['timecost'],
                                 updates=list(tupdates))

    # training loop.
    n_examples = len(train_examples)
    batches_per_epoch = n_examples // options['batch_size'] + 1
    n_epochs = global_steps // batches_per_epoch + 1

    global_step = 0
    for _ in range(n_epochs):
        for _ in range(batches_per_epoch):
            batch_data = train_loader()
            # The two updates take different selections of the last three
            # batch items: everything before them plus the second-to-last,
            # and everything before the last two plus the last, respectively.
            cost = f_update(*(batch_data[:-3] + (batch_data[-2],)))
            timecost = f_t_update(*(batch_data[:-2] + (batch_data[-1],)))

            if global_step % disp_freq == 0:
                print('global step %d, cost: %f' % (global_step, cost))
                print('timecost: %f' % (timecost))

            # evaluate on validation data, dump parameters, maybe stop.
            if global_step % save_freq == 0:
                eva_map = evaluate_eval(model['f_prob'], valid_loader,
                                        model['f_tprob'], options['tdim'])

                if eva_map > best_map:
                    best_map = eva_map
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    # Close the handle deterministically instead of leaking it.
                    with open('%s.pkl' % saveto, 'wb') as options_file:
                        pickle.dump(options, options_file, -1)
                    timeparams = unzip(timetparams)
                    np.savez(tmsaveto, **timeparams)

                # Stop once the validation score has stabilized. A zero
                # previous score has no defined relative change, so it is
                # never treated as convergence (and cannot divide by zero).
                converged = (prev_map != 0 and
                             abs(eva_map - prev_map) / prev_map < 0.001)
                if converged:
                    scores = evaluate(model['f_prob'], test_loader,
                                      model['f_tprob'], options['tdim'])
                    pprint.pprint(scores)
                    return 0
                prev_map = eva_map

            global_step += 1
def lasagne_separate(M, P, FE, W1, W2, z1, z2, hh=.0001, ep=5000, d=0, wsp=.0001, plt=True):
    """Separate a two-source mixture with a Lasagne network trained by downhill.

    The two dictionaries are transposed, stacked side by side and fed as the
    network input; the input is sliced back into one path per dictionary, each
    path goes through dropout, and the two path outputs are summed.  The loss
    compares that sum against `M` and adds a `wsp`-weighted mean-absolute
    penalty on `R1.W` and `R2.W`.

    Parameters
    ----------
    M : array
        Mixture data; cast to float32 and stored in a Theano shared variable.
    P : passed unchanged as the second argument of `FE.ife`.
    FE : object providing an `ife` method used to build the two outputs.
    W1, W2 : arrays
        Dictionaries; `W1.shape[0]` and `W2.shape[0]` give the width of each
        path.
    z1, z2 : stacked with `vstack` and handed to `bss_eval` as the reference.
    hh, ep : forwarded to `downhill_train` (presumably learning rate and
        number of updates -- TODO confirm against `downhill_train`).
    d : dropout value given to both `DropoutLayer`s.
    wsp : weight of the penalty on `R1.W` and `R2.W`.
    plt : not referenced in the body.

    Returns
    -------
    (o1, o2, scores) where `o1`/`o2` are the two `FE.ife` outputs and `scores`
    is the average of the first three and the remaining entries of the
    combined `bss_eval` results.

    NOTE(review): `R1` and `R2` are read below, but the only assignments to
    them visible in this function are commented out.  Unless they exist as
    module-level names this raises NameError.  The commented lines build
    `LSTMLayer`s while the penalty term reads a `.W` attribute -- confirm
    which layer type is intended before re-enabling them.
    """
    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.astype(float32))

    # Input is the learned dictionary set
    lW = hstack((W1.T, W2.T)).astype(float32)
    _lW = Th.matrix('_lW')
    fI = InputLayer(shape=lW.shape, input_var=_lW)

    # Split in two paths: the first K[0] columns and the following K[1] columns
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Only referenced by the commented-out layers below.
    N_sequence = 10
    #
    # Compute source modulators
    # R1 = LSTMLayer(dfW1, N_sequence)
    # R2 = LSTMLayer(dfW2, N_sequence)

    # Bring to standard orientation
    # NOTE(review): R1 / R2 are not assigned anywhere above (see docstring).
    R = ElemwiseSumLayer([R1, R2])

    # Cost function: divergence-style data term between _M and the network
    # output, plus the wsp-weighted penalty on the two layers' W.
    cost = (_M * (Th.log(_M + eps) - Th.log(get_output(R) + eps))
            - _M + get_output(R)).mean() + wsp * (
                Th.mean(abs(R1.W)) + Th.mean(abs(R2.W)))

    # Train it using Lasagne
    opt = downhill.build('rprop', loss=cost, inputs=[_lW], params=get_all_params(R))
    train = downhill.Dataset(lW, batch_size=0)
    # Last element of whatever downhill_train returns; `er` is not used afterwards.
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, _lW, lW) + eps
    _r1 = nget(R1, _lW, lW)
    _r2 = nget(R2, _lW, lW)
    # Each source estimate scales the mixture by that path's share of the total.
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
batch_size = 1 #Our Loss function z1 = X.dot(W1) + b1 a1 = T.tanh(z1) z2 = a1.dot(W2) + b2 y_hat = T.nnet.softmax(z2) loss_reg = 1. / batch_size * reg_lambda / 2 * (T.sum(T.sqr(W1)) + T.sum(T.sqr(W2))) loss = T.nnet.categorical_crossentropy(y_hat, y).mean() + loss_reg prediction = T.argmax(y_hat, axis=1) predict = theano.function([X], prediction) #Store the training and vlidation loss train_loss = [] validation_loss = [] opt = downhill.build('sgd', loss=loss) #Set up training and validation dataset splits, use only one example in a batch #and use only one batch per step/epoc #Use everything except last 1000 examples for training train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]], batch_size=batch_size, iteration_size=1) #Use last 1000 examples for valudation valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]]) #SGD iterations = 0 for tm, vm in opt.iterate(train, valid, patience=10000): iterations += 1 # Record the training and validation loss train_loss.append(tm['loss']) validation_loss.append(vm['loss'])
def nn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rprop'):
    """Fit activations for two fixed dictionaries so their sum approximates M.

    A shared matrix `H` is the only trained parameter.  It is split column-wise
    into one branch per dictionary; each branch goes through dropout and a
    bias-free dense layer whose weights are initialised from `W1` / `W2`, and
    the two layer outputs are added.  The loss is a divergence-style term
    between the transposed mixture and that sum, plus `sp` times the mean
    absolute value of `H`.

    Parameters
    ----------
    M : array
        Mixture; it is transposed before being compared with the network
        output.
    W1, W2 : arrays
        Dictionaries; `W1.shape[0]` and `W2.shape[0]` give the number of
        columns of `H` assigned to each branch.
    hh, ep : forwarded to `downhill_train` (presumably learning rate and
        number of updates -- TODO confirm against `downhill_train`).
    d : number or list
        Dropout value fed through the dummy network input.  When it is a list,
        training runs twice with `ep / 2` each, first with `d[0]`, then with
        `d[1]`, and the two results are combined with `+=`.
    sp : weight of the mean-absolute penalty on `H`.
    spb : callable or number
        A plain function is used directly as the dense-layer nonlinearity;
        anything else is passed as the second argument of `psoftplus`.
    al : algorithm name handed to `downhill.build`.

    Returns
    -------
    (_r, _r1, _r2, er): the transposed outputs of the summed network (plus
    `eps`) and of each branch, all evaluated with a dropout value of 0, and
    whatever `downhill_train` returned.
    """
    from inspect import isfunction

    # Sort out the activation
    activation = spb if isfunction(spb) else (lambda x: psoftplus(x, spb))

    # Number of components contributed by each dictionary
    n_comp1, n_comp2 = W1.shape[0], W2.shape[0]
    n_comp = n_comp1 + n_comp2
    n_rows = M.T.shape[0]
    n_cols = M.T.shape[1]

    # GPU cached data
    _M = theano.shared(M.T.astype(float64))
    # One-element vector input whose first entry is the dropout value
    dum = Th.vector('dum')

    # The weights we are looking for, randomly initialised and scaled
    init_scale = sqrt(2. / (n_comp + M.shape[1]))
    H = theano.shared(init_scale * random.rand(n_rows, n_comp).astype(float64))
    source = InputLayer(shape=(n_rows, n_comp), input_var=H)

    # One pathway per dictionary, each followed by dropout
    path1 = DropoutLayer(
        SliceLayer(source, indices=slice(0, n_comp1), axis=1), dum[0])
    path2 = DropoutLayer(
        SliceLayer(source, indices=slice(n_comp1, n_comp), axis=1), dum[0])

    def _modulator(path, dictionary):
        # Bias-free dense layer initialised from a previously learned dictionary
        return DenseLayer(path, num_units=n_cols,
                          W=dictionary.astype(float64),
                          nonlinearity=activation, b=None)

    R1 = _modulator(path1, W1)
    R2 = _modulator(path2, W2)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function.  The zero-weighted term on `dum` presumably only keeps the
    # dummy input attached to the loss graph -- TODO confirm.
    approx = get_output(R) + eps
    data_term = (_M * (Th.log(_M + eps) - Th.log(approx + eps))
                 - _M + approx).mean()
    cost = data_term + sp * Th.mean(abs(H)) + 0 * Th.mean(dum)

    # Train it using downhill; only H is updated
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])

    def _fit(dropout, n_updates):
        # Single-sample dataset carrying the dropout value for this run
        batch = downhill.Dataset(array([dropout]).astype(float64), batch_size=0)
        return downhill_train(opt, batch, hh, n_updates, None)

    if isinstance(d, list):
        er = _fit(d[0], ep / 2)
        er += _fit(d[1], ep / 2)
    else:
        er = _fit(d, ep)

    def _evaluate(layer):
        # Layer output with a dropout value of 0, transposed
        return nget(layer, dum, array([0]).astype(float64)).T

    # Get outputs
    _r = _evaluate(R) + eps
    _r1 = _evaluate(R1)
    _r2 = _evaluate(R2)

    return _r, _r1, _r2, er
def nn_sep_ae(m, w1, w2, hh=.001, ep=5000, sp=.1, dp=0.0, spb=3, al='rprop'):
    """Separate two sources with an auto-encoder whose decoder is fixed.

    An encoder matrix `W1m` (the only trained parameter) maps the input to a
    hidden code; two fixed decoder matrices, built from `w1` and `w2` with the
    other source's columns zeroed out, map the code to one reconstruction per
    source.  The loss compares the sum of both reconstructions with the input
    and adds 0.01-weighted mean-absolute penalties on `W1m` and on the code.
    While training, the cost curve is redrawn roughly every two seconds; after
    training both source estimates are plotted.

    Parameters
    ----------
    m : array
        Mixture; cast to float64 and used both as training data and as the
        input of the final evaluation.
    w1, w2 : arrays
        Dictionaries; `w1.shape[1]` and `w2.shape[1]` give the number of
        columns belonging to each source.
    hh : learning rate given to `opt.iterate`.
    ep : used as both `max_updates` and `patience` of `opt.iterate`.
    sp : not referenced in the body (the penalties use fixed 0.01 weights).
    dp : dropout probability; dropout on the code is only added when
        `dp > 0`, and it is then part of the graph used for the returned
        outputs as well.
    spb : callable or number
        A plain function is used directly as the nonlinearity; anything else
        is passed as the second argument of `psoftplus`.
    al : algorithm name handed to `downhill.build`.

    Returns
    -------
    (m_out, m1, m2, cst): the summed reconstruction, the two per-source
    reconstructions, and the list of training losses.
    """
    from inspect import isfunction
    from numpy import random
    import theano
    # NOTE(review): downhill_train is not referenced below; the import is kept
    # in case loading the module matters -- confirm and drop.
    from deep_sep_expr3 import downhill_train

    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Dropout parameters
    d = theano.shared(float32(dp))

    # Plot to make while training
    def pl():
        if not cst:
            # Nothing recorded yet: cst[-1] below would raise IndexError.
            return
        clf()
        gcf().set_size_inches(6, 2)
        semilogy(cst)
        grid('on')
        title('Cost: %f, Epoch: %d' % (cst[-1], len(cst)))
        drawnow()

    # Sort out the activation.  Every nonlinearity below goes through `act`, so
    # a callable `spb` is applied as the activation instead of being passed to
    # psoftplus as if it were a number.
    if isfunction(spb):
        act = spb
    else:
        act = lambda x: psoftplus(x, spb)

    w_cat = hstack((w1, w2))
    K = [w1.shape[1], w2.shape[1]]

    # Encoder weights: the only trained parameter
    W1m = theano.shared(
        random.rand(w_cat.shape[1], w_cat.shape[0]).astype(float64))

    M = theano.tensor.matrix('M')
    Hm = act(W1m.dot(M))

    # Dropout
    if dp > 0:
        Hm *= (1. / (1. - d) * (rng.uniform(size=Hm.shape) > d).astype(
            theano.config.floatX)).astype(theano.config.floatX)

    # Fixed decoders: each keeps one source's columns and zeroes the other's
    W2s1 = theano.shared(
        hstack((w_cat[:, 0:K[0]], zeros(w2.shape))).astype(float64))
    W2s2 = theano.shared(
        hstack((zeros(w1.shape), w_cat[:, K[0]:K[0] + K[1]])).astype(float64))

    M1 = act(W2s1.dot(Hm))
    M2 = act(W2s2.dot(Hm))
    M_out = M1 + M2

    # Divergence-style term between reconstruction and input, plus penalties
    # on the encoder weights and on the code
    cost = (theano.tensor.mean(
        M_out * (theano.tensor.log(M_out + eps) - theano.tensor.log(M + eps))
        - M_out + M)
        + 0.01 * theano.tensor.mean(abs(W1m) ** 1)
        + 0.01 * theano.tensor.mean(abs(Hm) ** 1))

    opt = downhill.build(al, loss=cost, params=[W1m], inputs=[M])
    train = downhill.Dataset(m.astype(float64), batch_size=m.shape[0])

    cst = []
    lt = time.time()
    for tm, _ in opt.iterate(train, learning_rate=hh, max_updates=ep, patience=ep):
        cst.append(tm['loss'])
        # Redraw the cost curve at most every two seconds
        if time.time() - lt > 2:
            pl()
            lt = time.time()
    pl()

    # Evaluate the trained network on the full mixture
    nn_nmf_sep = theano.function(inputs=[M], outputs=[Hm, M1, M2, M_out], updates=[])
    h1m, m1, m2, m_out = nn_nmf_sep(m.astype(float64))

    subplot(2, 1, 1)
    imagesc(m1**.4)
    title('Source 1')
    subplot(2, 1, 2)
    imagesc(m2**.4)
    title('Source 2')
    tight_layout()

    return m_out, m1, m2, cst
def train(data_dir='data/memes/',
          dim_proj=512,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=1000,
          test_freq=1000,
          saveto_file='params.npz',
          weight_decay=0.0005,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.

    Builds the model, optionally runs the mini-batch training loop, then
    evaluates on the test split and pretty-prints the scores.  Nothing is
    returned.

    All keyword arguments are snapshotted into an ``options`` dict that is
    handed to ``init_params``, ``tprnn_model.build_model`` and
    ``data_utils.Loader``; ``dim_proj``, ``shuffle_data`` and ``weight_decay``
    are only consumed that way.

    Args:
        data_dir: prefix for every path; it is string-concatenated with
            ``saveto_file``, so it must end with a path separator.
        maxlen: forwarded to ``data_utils.load_examples``.
        batch_size: used (via ``options``) to derive batches per epoch.
        keep_ratio: forwarded to ``load_examples`` for the training split.
        learning_rate: learning rate of the Adam updates.
        global_steps: step budget used to derive the number of epochs; the
            loop runs whole epochs, so it performs at least this many updates.
        disp_freq: print the cost every this many steps.
        save_freq: write parameters and options every this many steps.
        test_freq: evaluate on the test split every this many steps.
        saveto_file: file name of the parameter archive; the options are
            pickled next to it with a ``.pkl`` suffix.
        reload_model: load previously saved parameters before building.
        train: run the training loop when true; otherwise only evaluate.
    """
    # Must run before any other local is bound so that it captures exactly the
    # keyword arguments.
    options = locals().copy()
    saveto = data_dir + saveto_file

    # loads graph
    G, node_index = data_utils.load_graph(data_dir)
    print(nx.info(G))
    options['n_words'] = len(node_index)

    print(options)

    # creates and initializes shared variables.
    print('Initializing variables...')
    params = init_params(options)
    if reload_model:
        print('reusing saved model.')
        load_params(saveto, params)
    tparams = init_tparams(params)

    # builds Topo-LSTM model
    print('Building model...')
    model = tprnn_model.build_model(tparams, options)

    print('Loading test data...')
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             G=G)
    test_loader = data_utils.Loader(test_examples, options=options)
    print('Loaded %d test examples' % len(test_examples))

    if train:
        # prepares training data.
        print('Loading train data...')
        train_examples = data_utils.load_examples(
            data_dir,
            dataset='train',
            keep_ratio=options['keep_ratio'],
            node_index=node_index,
            maxlen=maxlen,
            G=G)
        train_loader = data_utils.Loader(train_examples, options=options)
        print('Loaded %d training examples.' % len(train_examples))

        # compiles updates.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])

        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)

        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                cost = f_update(*train_loader())

                if global_step % disp_freq == 0:
                    print('global step %d, cost: %f' % (global_step, cost))

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    # Close the handle deterministically instead of leaving it
                    # to the garbage collector.
                    with open('%s.pkl' % saveto, 'wb') as options_file:
                        pickle.dump(options, options_file, -1)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader)
                    # Same output as the two-item print statement (label,
                    # separating space, scores) in a form that parses on both
                    # Python 2 and Python 3.
                    print('eval scores:  %s' % (scores,))
                    end_time = timeit.default_timer()
                    print('time used: %d seconds.' % (end_time - start_time))

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader)
    pprint.pprint(scores)
def rnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rmsprop', t=5):
    """Fit activations for two fixed recurrent dictionaries to approximate M.

    A shared matrix `H` is the only trained parameter.  It is split column-wise
    into one branch per dictionary; each branch goes through dropout and a
    bias-free recurrent layer whose input-to-hidden and hidden-to-hidden
    weights are initialised from the dictionary, and the two layer outputs are
    added.  The loss is a divergence-style term between the transposed mixture
    and that sum, plus `sp` times the mean absolute value of `H`.

    Parameters
    ----------
    M : array
        Mixture; it is transposed before being compared with the network
        output.
    W1, W2 : indexable pairs
        `W[0]` initialises the input-to-hidden weights and `W[1]` the
        hidden-to-hidden weights; `W[0].shape[0]` gives the number of columns
        of `H` assigned to that branch.
    hh, ep : forwarded to `downhill_train` (presumably learning rate and
        number of updates -- TODO confirm against `downhill_train`).
    d : dropout value fed through the dummy network input during training.
    sp : weight of the mean-absolute penalty on `H`.
    spb : second argument of `psoftplus` in both nonlinearities.
    al : algorithm name handed to `downhill.build`.
    t : not referenced in the body; `gradient_steps` is fixed at 5.

    Returns
    -------
    (_r, _r1, _r2, er): the transposed outputs of the summed network (plus
    `eps`) and of each branch, all evaluated with a dropout value of 0, and
    whatever `downhill_train` returned.
    """
    # Number of components contributed by each dictionary
    n_comp1 = W1[0].shape[0]
    n_comp2 = W2[0].shape[0]
    n_comp = n_comp1 + n_comp2
    n_rows = M.T.shape[0]
    n_cols = M.T.shape[1]

    # GPU cached data
    _M = theano.shared(M.T.astype(float32))
    # One-element vector input whose first entry is the dropout value
    dum = Th.vector('dum')

    # The weights we are looking for, randomly initialised and scaled
    init_scale = sqrt(2. / (n_comp + M.shape[1]))
    H = theano.shared(init_scale * random.rand(n_rows, n_comp).astype(float32))
    source = InputLayer(shape=(n_rows, n_comp), input_var=H)

    # One pathway per dictionary, each followed by dropout
    path1 = DropoutLayer(
        SliceLayer(source, indices=slice(0, n_comp1), axis=1), dum[0])
    path2 = DropoutLayer(
        SliceLayer(source, indices=slice(n_comp1, n_comp), axis=1), dum[0])

    def _modulator(path, weights):
        # Bias-free recurrent layer initialised from a learned dictionary pair
        return RecurrentLayer(path, num_units=n_cols, b=None,
                              W_in_to_hid=weights[0].astype(float32),
                              W_hid_to_hid=weights[1].astype(float32),
                              nonlinearity=lambda x: psoftplus(x, spb),
                              gradient_steps=5)

    R1 = _modulator(path1, W1)
    R2 = _modulator(path2, W2)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function.  The zero-weighted term on `dum` presumably only keeps the
    # dummy input attached to the loss graph -- TODO confirm.
    approx = get_output(R) + eps
    data_term = (_M * (Th.log(_M + eps) - Th.log(approx + eps))
                 - _M + approx).mean()
    cost = data_term + sp * Th.mean(abs(H)) + 0 * Th.mean(dum)

    # Train it using downhill; only H is updated
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([d]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    def _evaluate(layer):
        # Layer output with a dropout value of 0, transposed
        return nget(layer, dum, array([0]).astype(float32)).T

    # Get outputs
    _r = _evaluate(R) + eps
    _r1 = _evaluate(R1)
    _r2 = _evaluate(R2)

    return _r, _r1, _r2, er