def nn_save(w, b, path="./", file_prefix=""): """Temporary function for saving neural network weights to disk """ layer_count = len(w) for i in range(layer_count): np.save(path + file_prefix + "L" + repr(i + 1) + "_w.npy", gnp.as_numpy_array(w[i])) np.save(path + file_prefix + "L" + repr(i + 1) + "_b.npy", gnp.as_numpy_array(b[i]))
def partition_function(self, batch_size, prec):
    """The exact value of Z calculated with precision prec.
    Only feasible for a small number of hidden units."""
    with decimal.localcontext() as ctx:
        if prec != 0:
            ctx.prec = prec
        batches = ml.common.util.pack_in_batches(all_states(self.n_hid),
                                                 batch_size)
        if prec != 0:
            s = decimal.Decimal(0)
        else:
            allfhes = np.array([])
        seen_samples = 0L
        total_samples = 2L**self.n_hid
        for hid in batches:
            print >>stderr, "%i / %i \r" % (seen_samples, total_samples),
            fhes = self.free_hidden_energy(hid)
            if prec != 0:
                for fhe in gp.as_numpy_array(fhes):
                    p = decimal.Decimal(-fhe).exp()
                    s += p
            else:
                allfhes = np.concatenate((allfhes, -gp.as_numpy_array(fhes)))
            seen_samples += hid.shape[0]
        if prec != 0:
            return s
        else:
            return logsum(allfhes)
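# "logsum" above is not defined in this snippet. A standard log-sum-exp
# implementation (an assumption about the original helper, not its verbatim
# source) would be:
def logsum(x):
    m = np.max(x)  # shift by the max for numerical stability
    return m + np.log(np.sum(np.exp(x - m)))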
def tuple_to_array(t):
    x = t[0]
    y = t[1]
    out = np.zeros((x.shape[0], x.shape[1], x.shape[2], 2))
    out[:, :, :, 0] = gp.as_numpy_array(x)
    out[:, :, :, 1] = gp.as_numpy_array(y)
    return out
def plot_samples(samples, samples_force=None, twod=False, width=28, height=28):
    samples = gp.as_numpy_array(samples)
    samples = np.asarray(samples)
    if samples_force is not None:
        samples_force = gp.as_numpy_array(samples_force)
        samples_force = np.asarray(samples_force)
    if (not twod and samples.ndim == 1) or (twod and samples.ndim == 2):
        if twod:
            height = samples.shape[0]
            width = samples.shape[1]
        return _plot_one_sample(samples, samples_force, twod=twod,
                                width=width, height=height)
    else:
        n_samples = samples.shape[0]
        if twod:
            height = samples.shape[1]
            width = samples.shape[2]
        out = np.zeros((height, width*n_samples, 3))
        for s in range(n_samples):
            if samples_force is not None:
                o = _plot_one_sample(samples[s], samples_force[s], twod=twod,
                                     width=width, height=height)
            else:
                o = _plot_one_sample(samples[s], None, twod=twod,
                                     width=width, height=height)
            out[:, s*width : (s+1)*width, :] = o
        return out
def output_errors(self, n_plot=100000, alpha=0.05,
                  plot_only_guilty_samples=True):
    # output statistics
    acc_low, acc_high, acc_mle = \
        self.generator_accuracy_interval(alpha=alpha)
    print "Error probability: %g [%g, %g]" % (1-acc_mle, 1-acc_high, 1-acc_low)
    fe_low, fe_high = self.fe_interval(alpha=alpha)
    print "Free energy: [%g, %g]" % (fe_low, fe_high)

    if self.tmpl_X is not None:
        # collect incorrectly classified samples
        tmpl_X = gp.as_numpy_array(self.tmpl_X)
        gen_X = gp.as_numpy_array(self.gen_X)
        if plot_only_guilty_samples:
            s = self.guilty_samples
        else:
            s = self.incorrect_samples
        err_tmpl_X = tmpl_X[s, :]
        err_gen_X = gen_X[s, :]
        err_tmpl_Z = self.tmpl_Z[s]
        err_gen_Z = np.asarray(self.gen_Z[s], dtype='uint8')

        # output
        print "Misclassified samples:"
        print "True labels:      ", err_tmpl_Z[0:n_plot]
        print "Generated labels: ", err_gen_Z[0:n_plot]
        if err_tmpl_X.shape[0] > 0:
            myplt = np.concatenate(
                (ml.common.util.plot_samples(err_tmpl_X[0:n_plot]),
                 ml.common.util.plot_samples(err_gen_X[0:n_plot])))
            plt.imshow(myplt, interpolation='none')
def free_energies_during_gibbs_sampling(self, x, kmax, beta=1):
    fes = []
    fes.append(gp.as_numpy_array(self.free_energy(x)))
    for k in range(kmax):
        x, _ = self.gibbs_sample(x, 1, beta=beta)
        fes.append(gp.as_numpy_array(self.free_energy(x)))
    fes = np.asarray(fes)
    return fes
def dbn_save(ws_vh, ws_v, ws_h, path="./", file_prefix=""):
    """Temporary function for saving DBN weights to disk"""
    layer_count = len(ws_vh)
    for i in range(layer_count):
        np.save(path + file_prefix + "L" + repr(i + 1) + "_w_vh.npy",
                gnp.as_numpy_array(ws_vh[i]))
        np.save(path + file_prefix + "L" + repr(i + 1) + "_w_v.npy",
                gnp.as_numpy_array(ws_v[i]))
        np.save(path + file_prefix + "L" + repr(i + 1) + "_w_h.npy",
                gnp.as_numpy_array(ws_h[i]))
def generation_accuracy(label, ref_predict, myrbm, tmpl_X, tmpl_Z, tmpl_ref_Z,
                        gen_X, gen_Z=None, output_data_line=True,
                        store_samples=True):
    tmpl_Z = gp.as_numpy_array(tmpl_Z)
    n_samples = gen_X.shape[0]
    if n_samples < tmpl_X.shape[0]:
        print "Warning: fewer generated samples than template samples were provided"
        tmpl_X = tmpl_X[0:n_samples, :]
        tmpl_Z = tmpl_Z[0:n_samples]
        tmpl_ref_Z = tmpl_ref_Z[0:n_samples]

    # calculate accuracy of reference classifier
    diff = tmpl_Z - tmpl_ref_Z
    errs = np.count_nonzero(diff)
    corr = n_samples - errs
    svc_acc = corr / float(n_samples)  # float() avoids integer division under Python 2

    # classify generated data
    if gen_Z is None:
        gen_Z = ml.common.util.map(gp.as_numpy_array(gen_X), 100, ref_predict,
                                   caption="Classifying results with reference predictor")

    # count correctly classified samples
    diff = tmpl_Z - gen_Z
    errs = np.count_nonzero(diff)
    corr = n_samples - errs

    # find incorrect samples
    incorrect_samples = np.nonzero(diff)[0]
    guilty_samples = \
        incorrect_samples[tmpl_ref_Z[incorrect_samples] == tmpl_Z[incorrect_samples]]

    # calculate free energy
    fes = ml.common.util.map(gen_X, 1000, myrbm.free_energy,
                             caption="Calculating free energy")
    fes = gp.as_numpy_array(fes)
    fe_mean = np.mean(fes)
    fe_variance = ml.common.stats.unbiased_varince(fes)

    # create stability object
    s = Stability(label, n_samples, corr, svc_acc, fe_mean, fe_variance,
                  incorrect_samples, guilty_samples)
    if store_samples:
        s.tmpl_X = tmpl_X
        s.tmpl_Z = tmpl_Z
        s.tmpl_ref_Z = tmpl_ref_Z
        s.gen_X = gen_X
        s.gen_Z = gen_Z

    # output performance data in table format
    if output_data_line:
        s.output_data_line()
    return s
def or_rest_fast(z, x):
    z = gp.as_numpy_array(z)
    x = gp.as_numpy_array(x)
    y = np.zeros(z.shape)
    ym = np.ones(z.shape)
    ym[(z == 1) & (x == 1)] = 0
    y[(z == 1) & (x == 0)] = 1
    return gp.as_garray(y), gp.as_garray(ym)
def save_parameters(rbm, epoch_or_filename):
    if isinstance(epoch_or_filename, str):
        filename = epoch_or_filename
    else:
        filename = "weights-%02i.npz" % epoch_or_filename
    np.savez_compressed(
        filename,
        weights=gp.as_numpy_array(rbm.weights),
        bias_vis=gp.as_numpy_array(rbm.bias_vis),
        bias_hid=gp.as_numpy_array(rbm.bias_hid),
    )
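# Sketch of the matching loader, assuming the keys written by save_parameters
# above. "load_parameters" is a hypothetical name, not from the original code.
def load_parameters(rbm, filename):
    data = np.load(filename)
    rbm.weights = gp.as_garray(data['weights'])
    rbm.bias_vis = gp.as_garray(data['bias_vis'])
    rbm.bias_hid = gp.as_garray(data['bias_hid'])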
def nn_forward_pass(x, w, b, return_all=True):
    """
    Forward pass for a multilayer feed-forward sigmoid neural network.

    Hidden units have sigmoid non-linearity. Output is soft-max.

    x: DxN matrix of input data
    w: Weights. List of weight matrices for each layer.
    b: Biases. List of bias vectors for each layer.
    return_all: If True, returns hidden unit activations for each layer.
        If False, just returns the output layer activations.

    Returns a list h where each element is a matrix containing the
    activations for that layer. h[0] is the input data x.
    """
    # ---- TEMP HACK --------------
    # I should find a more seamless way of running in mixed (some operations
    # with numpy, some with gnumpy) mode.
    # I had to resort to this because I needed the validation classification
    # step in nn_train to run on the CPU with numpy; the GPU ran out of memory.
    if isinstance(x, gnp.garray):
        use_gpu = True
    else:
        use_gpu = False

    layer_count = len(w)
    if return_all:
        hs = [x]  # unit activations for each layer
    h = x

    # all layers except the output layer
    for l in range(layer_count - 1):
        if use_gpu:
            a = gnp.dot(w[l].T, h) + b[l]
            h = gnp.logistic(a)
        else:
            a = np.dot(gnp.as_numpy_array(w[l]).T, h) + gnp.as_numpy_array(b[l])
            h = 1.0 / (1 + np.exp(-a))
        if return_all:
            hs.append(h)

    # output layer
    if use_gpu:
        h = gnp.dot(w[-1].T, h) + b[-1]
        h = gnp.exp(h) / gnp.sum(gnp.exp(h), axis=0)  # soft-max
    else:
        h = np.dot(gnp.as_numpy_array(w[-1]).T, h) + gnp.as_numpy_array(b[-1])
        h = np.exp(h) / np.sum(np.exp(h), axis=0)  # soft-max
    if return_all:
        hs.append(h)
        return hs
    else:
        return h
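# Hedged usage sketch of nn_forward_pass (CPU/numpy path) with made-up layer
# sizes: a 5-10-3 network applied to a batch of 4 input columns. All names
# below are local to this example; gnumpy must be importable as gnp.
rng = np.random.RandomState(0)
w_demo = [rng.randn(5, 10), rng.randn(10, 3)]  # per-layer weight matrices
b_demo = [rng.randn(10, 1), rng.randn(3, 1)]   # per-layer bias column vectors
x_demo = rng.randn(5, 4)                       # DxN input batch
y_demo = nn_forward_pass(x_demo, w_demo, b_demo, return_all=False)
assert y_demo.shape == (3, 4)                  # soft-max over 3 output units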
def generate_or_dataset(X, Z, samples):
    X = gp.as_numpy_array(X)
    Z = gp.as_numpy_array(Z)
    si = np.random.randint(0, X.shape[0], size=(samples, 2))
    x = X[si[:, 0], :]
    y = X[si[:, 1], :]
    O = or_sample(x, y)
    OZ = np.zeros((samples, 2))
    OZ[:, 0] = Z[si[:, 0]]
    OZ[:, 1] = Z[si[:, 1]]
    return O, OZ
def apply_nn_test(P, net, nCxt, outLayer, feat_dir, FeatList, outFeatDir,
                  useDropout):
    """Sends the test features for feedforward and applies the PCA
    calculated from the training files"""
    fdir = ''
    inFeatList = open(feat_dir + FeatList).readlines()
    for fname in inFeatList:
        if fname == '\n':
            continue
        elif fname.rstrip()[-1] == ':':
            fdir = fname.rstrip()[:-1] + '/'
            print fdir
            continue
        elif fname.rstrip()[-3:] == 'txt':
            utt = np.loadtxt(feat_dir + fdir + fname[:-1])
            # if not useDropout:
            outputs = gpu.as_numpy_array(net.fprop_xf(utt, outLayer))
            # else:
            #     outputs = gpu.as_numpy_array(net.fpropDropout(utt, outLayer))
            assert outputs.shape[1] == 40
            outputs = np.dot(outputs, P)
            outfile = htkmfc.HTKFeat_write(feat_dir + outFeatDir + 'test_feat/'
                                           + fdir[-9:] + fname[:-5],
                                           outputs.shape[1], htkmfc.USER)
            outfile.writeall(outputs)
            del outfile
            del outputs
            gpu.free_reuse_cache()
def forward(self, X):
    self.X = X
    # Num of examples
    N = X.shape[0]
    # Timespan
    T = X.shape[1]
    # Window size
    S = self.windowSize
    # Channels
    D = self.numChannels
    # Num filters
    F = self.numFilters
    Z = np.zeros((N, T - S + 1, S, D), X.dtype)
    for i in range(T - S + 1):
        Z[:, i, :, :] = X[:, i:i + S, :]
    Z = Z.reshape(N * (T - S + 1), S * D)
    if self.gpu:
        Z = gpu.as_garray(Z.astype('float32'))
        Y = gpu.dot(Z, self.W)
        Y = gpu.as_numpy_array(Y)
    else:
        Y = np.dot(Z, self.W)
    Y = Y.reshape(N, T - S + 1, F)
    self.Z = Z
    return Y
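# A minimal standalone sketch of the sliding-window step used in forward()
# above (pure numpy, no layer object): unfold the input into overlapping
# windows and apply one weight matrix. Shapes follow the comments in
# forward(); all names here are demo assumptions.
N, T, S, D, F = 5, 7, 3, 2, 4
X_demo = np.random.randn(N, T, D)
W_demo = np.random.randn(S * D, F)
Z_demo = np.zeros((N, T - S + 1, S, D))
for i in range(T - S + 1):
    Z_demo[:, i, :, :] = X_demo[:, i:i + S, :]
Y_demo = np.dot(Z_demo.reshape(N * (T - S + 1), S * D), W_demo)
Y_demo = Y_demo.reshape(N, T - S + 1, F)
assert Y_demo.shape == (5, 5, 4)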
def extract_patches(self, X, data_shape):
    """
    Extract patches from input data according to its shape and the kernel
    configurations.

    Return patches matrix of size (H*W*N)x(C*ksize*ksize).
    """
    X = gnp.as_numpy_array(X).reshape(-1, data_shape.c, data_shape.h,
                                      data_shape.w)
    out_shape = self.compute_output_shape(data_shape)
    padded_h = (out_shape.h - 1) * self.stride + self.ksize
    padded_w = (out_shape.w - 1) * self.stride + self.ksize
    if padded_h > data_shape.h or padded_w > data_shape.w:
        new_X = np.zeros((X.shape[0], X.shape[1], padded_h, padded_w),
                         dtype=X.dtype)
        new_X[:, :, :data_shape.h, :data_shape.w] = X
        X = new_X
    assert data_shape.c == self.n_ic
    patches = []
    for i in xrange(0, X.shape[-2] - self.ksize + 1, self.stride):
        for j in xrange(0, X.shape[-1] - self.ksize + 1, self.stride):
            patches.append(X[:, :, i:i + self.ksize, j:j + self.ksize])
    return np.concatenate(patches, axis=0).reshape(
        -1, self.ksize * self.ksize * self.n_ic)
def recover_input(self, Y, out_shape, in_shape, **kwargs):
    """
    Return recovered input and input_shape.
    """
    Y = gnp.as_numpy_array(Y).reshape(
        -1, out_shape.c, out_shape.h, out_shape.w).transpose(
        (0, 2, 3, 1)).reshape(-1, out_shape.c)
    P = self.recover_patches_from_responses(Y, **kwargs)
    return self.overlay_patches(P, out_shape, in_shape)
def get_random_patches(X, in_shape, ksize, n_patches_per_image,
                       batch_size=100, pad_h=0, pad_w=0):
    """
    Extract random patches from images X.

    X: Nx(C*H*W) matrix, each row is an image
    in_shape: shape information for each input image
    ksize: size of the patches
    n_patches_per_image: number of patches per image
    batch_size: size of a batch. In each batch the patch locations will be
        the same.

    Return (n_patches_per_image*N)x(C*ksize*ksize) matrix, each row is one
    patch.
    """
    X = gnp.as_numpy_array(X).reshape(-1, in_shape.c, in_shape.h, in_shape.w)
    if pad_h > 0 or pad_w > 0:
        new_X = np.zeros((X.shape[0], in_shape.c, in_shape.h + pad_h,
                          in_shape.w + pad_w), dtype=X.dtype)
        new_X[:, :, :in_shape.h, :in_shape.w] = X
        X = new_X
    patches = []
    for n in xrange(n_patches_per_image):
        for im_idx in xrange(0, X.shape[0], batch_size):
            h_start = np.random.randint(X.shape[-2] - ksize + 1)
            w_start = np.random.randint(X.shape[-1] - ksize + 1)
            patches.append(X[im_idx:im_idx + batch_size, :,
                             h_start:h_start + ksize,
                             w_start:w_start + ksize])
    return np.concatenate(patches, axis=0).reshape(
        -1, in_shape.c * ksize * ksize)
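# "in_shape" above is expected to expose c/h/w fields; a namedtuple stand-in
# (an assumption, the original shape class is not shown) is enough to call
# get_random_patches. Requires gnumpy importable as gnp.
from collections import namedtuple
ImageShape = namedtuple('ImageShape', ['c', 'h', 'w'])
imgs = np.random.rand(10, 1 * 28 * 28)  # 10 flattened 28x28 images
p = get_random_patches(imgs, ImageShape(c=1, h=28, w=28),
                       ksize=8, n_patches_per_image=5)
assert p.shape == (5 * 10, 1 * 8 * 8)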
def backward(self, dEdY):
    N = dEdY.shape[0]
    S = self.windowSize
    T = dEdY.shape[1] + S - 1
    F = dEdY.shape[2]
    D = self.X.shape[2]
    dEdY = dEdY.reshape(N * (T - S + 1), F)
    dEdX = np.zeros(self.X.shape, self.X.dtype)
    if self.gpu:
        gdEdY = gpu.as_garray(dEdY.astype('float32'))
        self.dEdW = gpu.dot(self.Z.transpose(), gdEdY)
    else:
        self.dEdW = np.dot(self.Z.transpose(), dEdY)
    if self.outputdEdX:
        if self.gpu:
            gdEdZ = gpu.dot(gdEdY, self.W.transpose())
            dEdZ = gpu.as_numpy_array(gdEdZ)
        else:
            dEdZ = np.dot(dEdY, self.W.transpose())
        dEdZ = dEdZ.reshape(N, T - S + 1, S, D)
        for t in range(0, T):
            if t <= S - 1:
                dEdX[:, t, :] = np.sum(
                    dEdZ[:, range(0, t + 1), range(t, -1, -1), :], axis=1)
            elif t >= T - S + 1:
                dEdX[:, t, :] = np.sum(
                    dEdZ[:, range(t - S + 1, T - S + 1),
                         range(S - 1, S - (T - t) - 1, -1), :], axis=1)
            else:
                dEdX[:, t, :] = np.sum(
                    dEdZ[:, range(t - S + 1, t + 1),
                         range(S - 1, -1, -1), :], axis=1)
    return dEdX
def backprop(self, dLdY, return_on_gpu=False):
    """Perform backprop through this layer."""
    # Backprop is just multiplication by the mask from feedforward
    dLdX = gp.garray(dLdY) * self.dYdX
    if not return_on_gpu:
        dLdX = gp.as_numpy_array(dLdX).astype(np.float32)
    return dLdX
def constrainMaxNorm(self):
    if self.max_norm == -1:
        return
    for i in range(len(self.weights)):
        wf = gnp.as_numpy_array(self.weights[i]).flatten()
        if l2norm(wf) > self.max_norm:
            wf = (wf / l2norm(wf)) * self.max_norm
            self.weights[i] = gnp.garray(wf.reshape(self.weights[i].shape))
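# Numpy-only sketch of the max-norm constraint applied by constrainMaxNorm
# above (an illustration, not the class's actual API): rescale a weight
# matrix so its flattened L2 norm does not exceed max_norm.
max_norm_demo = 3.0
W_mn = np.random.randn(100, 50)
norm = np.sqrt(np.sum(W_mn ** 2))
if norm > max_norm_demo:
    W_mn = W_mn * (max_norm_demo / norm)
assert np.sqrt(np.sum(W_mn ** 2)) <= max_norm_demo + 1e-9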
def check_performance(svc):
    X, TX, y, Ty = ml.rbm.util.load_mnist(False)
    X = gp.as_numpy_array(X)
    y = gp.as_numpy_array(y)
    TX = gp.as_numpy_array(TX)
    Ty = gp.as_numpy_array(Ty)

    print "Checking performance..."
    nt = 10000
    Py = svc.predict(X[0:nt])
    training_err = ml.common.util.classification_error(Py, y[0:nt])
    PTy = svc.predict(TX)
    test_err = ml.common.util.classification_error(PTy, Ty)
    print "Prediction error on first %d training samples: %g" % (nt, training_err)
    print "Prediction error on test set: %g" % test_err
    return svc
def train(self, x):
    if self.prev:
        x = self.prev.process(x)
    x = gnp.as_garray(x)
    self.avg = x.mean(axis=0)
    cov = (x - self.avg).T.dot(x - self.avg) / x.shape[0]
    cov = gnp.as_numpy_array(cov)
    self.sqrcov = la.cholesky(cov + np.eye(cov.shape[0]) * 1e-5)
    self.m = gnp.as_garray(la.inv(self.sqrcov + np.eye(x.shape[1]) * 1e-5))
def generate_or_dataset_with_shift(S, SZ, ref_SZ, x_shift, y_shift, n_samples,
                                   sample_indices=None):
    S = gp.as_numpy_array(S)
    SZ = gp.as_numpy_array(SZ)
    if sample_indices is not None:
        si = sample_indices
    else:
        si = generate_sample_indices_for_or_dataset(S, n_samples)
    X = S[si[:, 0]]
    XZ = SZ[si[:, 0]]
    ref_XZ = ref_SZ[si[:, 0]]
    Y = S[si[:, 1]]
    YZ = SZ[si[:, 1]]
    ref_YZ = ref_SZ[si[:, 1]]
    O = or_sample_with_shift(X, Y, x_shift, y_shift)
    return X, XZ, ref_XZ, Y, YZ, ref_YZ, O
def or_rest(z, x):
    z = gp.as_numpy_array(z)
    x = gp.as_numpy_array(x)
    y = np.zeros(z.shape)
    ym = np.ones(z.shape)
    ym[(z == 1) & (x == 1)] = 0
    y[(z == 1) & (x == 0)] = 1

    # "no on force":
    # If pixels that are needed to explain the picture are forced on,
    # this results in pixels that cannot be turned off by the ml.rbm.
    ym[(z == 1) & (x == 0)] = 0

    # "no force":
    # ym = ym * 0

    # Best is to have "no force" off and "no on force" on.
    return gp.as_garray(y), gp.as_garray(ym)
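# Hedged example of the mask values produced by or_rest above, where z is an
# observed OR image and x one hypothesised component (requires gnumpy as gp).
z_demo = gp.as_garray(np.array([[1., 1., 0., 0.]]))
x_demo = gp.as_garray(np.array([[1., 0., 1., 0.]]))
y_demo, ym_demo = or_rest(z_demo, x_demo)
print gp.as_numpy_array(y_demo)   # [[0. 1. 0. 0.]] -> on in z but not in x
print gp.as_numpy_array(ym_demo)  # [[0. 0. 1. 1.]] -> mask is lifted wherever z is on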
def train():
    X, TX, y, Ty = ml.rbm.util.load_mnist(False)
    X = gp.as_numpy_array(X)
    y = gp.as_numpy_array(y)
    TX = gp.as_numpy_array(TX)
    Ty = gp.as_numpy_array(Ty)
    #X = X[0:3000, ...]
    #y = y[0:3000, ...]

    print "Fitting SVM..."
    svc = svm.SVC(kernel='rbf', verbose=True)
    svc.fit(X, y)

    filename = "mnist_svm.dat"
    print "Writing model to %s" % filename
    with gzip.open(filename, 'wb') as file:
        pickle.dump(svc, file, pickle.HIGHEST_PROTOCOL)

    return svc
def array(x, dtype=None, **kwargs):
    if gnp.is_garray(x):
        if dtype is gpu_float32:
            return x
        else:
            return np.array(gnp.as_numpy_array(x), dtype=dtype, **kwargs)
    else:
        if dtype is gpu_float32:
            return gnp.as_garray(np.array(x, **kwargs))
        else:
            return np.array(x, dtype=dtype, **kwargs)
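# Hedged usage of the dtype-dispatching array() wrapper above; "gpu_float32"
# is a sentinel assumed to be defined elsewhere in the original module.
a = array([1.0, 2.0, 3.0])                     # plain numpy array
g = array([1.0, 2.0, 3.0], dtype=gpu_float32)  # gnumpy garray on the GPU
back = array(g)                                # copied back to a numpy array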
def or_performance(myrbm, svc, OX, OZ, iters, gibbs_steps, beta):
    OZ = gp.as_numpy_array(OZ)
    batch_size = 1000
    errs = 0
    for i in range(int(math.ceil(OX.shape[0] / float(batch_size)))):
        ox = OX[i*batch_size : (i+1)*batch_size, :]
        oz = OZ[i*batch_size : (i+1)*batch_size, :]
        x1, x2 = ml.rbm.orrbm.or_infer(myrbm, ox, iters, gibbs_steps,
                                       beta=beta)
        y1 = svc.predict(gp.as_numpy_array(x1))
        y2 = svc.predict(gp.as_numpy_array(x2))
        z1 = oz[:, 0]
        z2 = oz[:, 1]
        diff = (z1 - y1)**2 + (z2 - y2)**2
        errs += np.count_nonzero(diff)
    err_prob = errs / float(OX.shape[0])
    return err_prob
def train_model():
    m = build_model()

    stop = climin.stops.any_([
        climin.stops.after_n_iterations(max_iter),
    ])
    pause = climin.stops.modulo_n_iterations(n_report)

    weight_decay = ((m.parameters.hidden_to_out ** 2).sum())
    # + (m.parameters.hidden_to_hidden_0**2).sum()
    # + (m.parameters.hidden_to_out**2).sum())
    weight_decay /= m.exprs['inpt'].shape[0]
    m.exprs['true_loss'] = m.exprs['loss']
    m.exprs['loss'] = m.exprs['loss'] + c_wd * weight_decay
    f_wd = m.function(['inpt'], c_wd * weight_decay)

    n_wrong = 1 - T.eq(T.argmax(m.exprs['output'], axis=1),
                       T.argmax(m.exprs['target'], axis=1)).mean()
    f_n_wrong = m.function(['inpt', 'target'], n_wrong)

    losses = []
    v_losses = []
    print 'max iter', max_iter

    start = time.time()
    # Set up a nice printout.
    keys = '#', 'loss', 'val loss', 'seconds', 'wd', 'train emp', 'test emp'
    max_len = max(len(i) for i in keys)
    header = '\t'.join(i for i in keys)
    print header
    print '-' * len(header)

    f_loss = m.function(['inpt', 'target'], ['true_loss', 'loss'])

    for i, info in enumerate(m.powerfit((X, Z), (TX, TZ), stop, pause)):
        if info['n_iter'] % n_report != 0:
            continue
        passed = time.time() - start
        losses.append(info['loss'])
        v_losses.append(info['val_loss'])

        #img = tile_raster_images(fe.parameters['in_to_hidden'].T,
        #                         image_dims, feature_dims, (1, 1))
        #save_and_display(img, 'filters-%i.png' % i)

        info.update({
            'time': passed,
            'l2-loss': scalar(f_wd(X)),
            'train_emp': scalar(f_n_wrong(X, Z)),
            'test_emp': scalar(f_n_wrong(TX, TZ)),
        })
        row = ('%(n_iter)i\t%(loss)g\t%(val_loss)g\t%(time)g\t'
               '%(l2-loss)g\t%(train_emp)g\t%(test_emp)g') % info
        print row

    np.savez_compressed(savepath,
                        parameters=gp.as_numpy_array(m.parameters.data[...]))
def mixing_quality(samples):
    samples = gp.as_numpy_array(samples)
    n_steps = samples.shape[0]
    avg_dists = []
    for step in range(n_steps - 1):
        v_now = samples[step, :, :]
        v_next = samples[step + 1, :, :]
        dists = np.sqrt(np.sum(np.power(v_next - v_now, 2), axis=1))
        avg_dists.append(np.mean(dists))
    return np.mean(avg_dists)
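# Hedged usage of mixing_quality: samples is assumed to be a
# (n_steps, n_chains, n_units) stack of Gibbs-chain states; the result is the
# mean Euclidean step length between consecutive states.
chain_demo = np.random.rand(20, 8, 784)  # 20 steps, 8 chains, 784 visible units
print mixing_quality(chain_demo)         # larger values -> faster-mixing chains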
def gradDebug(self, inputBatch, targetBatch):
    inputBatch = inputBatch if isinstance(inputBatch, gnp.garray) \
        else gnp.garray(inputBatch)
    targetBatch = targetBatch if isinstance(targetBatch, gnp.garray) \
        else gnp.garray(targetBatch)

    mbsz = inputBatch.shape[0]
    outputActs = self.fprop(inputBatch)
    outputErrSignal = -self.outputActFunct.dErrordNetInput(
        targetBatch, self.state[-1], outputActs)
    # error = self.outputActFunct.error(targetBatch, self.state[-1], outputActs)
    errSignals = self.bprop(outputErrSignal)
    for i, (WGrad, biasGrad) in enumerate(self.gradients(self.state,
                                                         errSignals)):
        # update the weight increments
        self.WGrads[i] = WGrad
        self.biasGrads[i] = biasGrad
    allWeightGrads = itertools.chain(self.WGrads, self.biasGrads)
    return gnp.as_numpy_array(gnp.concatenate([dw.ravel()
                                               for dw in allWeightGrads]))
def backward(self, dEdY):
    dEdZ = self.activeFn.backward(dEdY, self.Y, 0)
    if self.gpu:
        gdEdZ = gpu.as_garray(dEdZ.astype('float32'))
        self.dEdW = gpu.dot(self.X.transpose(), gdEdZ)
        if self.bias:
            dEdX = gpu.dot(gdEdZ, self.W[:-1, :].transpose())
        else:
            dEdX = gpu.dot(gdEdZ, self.W.transpose())
        dEdX = gpu.as_numpy_array(dEdX)
    else:
        self.dEdW = np.dot(self.X.transpose(), dEdZ)
        if self.bias:
            dEdX = np.dot(dEdZ, self.W[:-1, :].transpose())
        else:
            dEdX = np.dot(dEdZ, self.W.transpose())
    return dEdX if self.outputdEdX else None
def write(self, dat):
    """Add dat to the buffer."""
    dat = gp.as_numpy_array(dat)
    end = self.index + dat.shape[0]
    if end <= self.maxrows:
        if self.data is None:
            self.data = np.empty((self.maxrows, dat.shape[1]))
        self.data[self.index:end] = dat
        self.index = end
    elif self.index == self.maxrows:
        self.flush()
        self.index = 0
        self.write(dat)
    else:
        raise Exception("disk write buffer is not aligned with batch size")
def apply_nn_train_prePCA(net, nCxt, outLayer, feat_dir, FeatList, outFeatDir,
                          Nframes, useDropout):
    """Sends the training features for feedforward and collects the output
    in a matrix X for performing PCA"""
    fdir = ''
    dim = net.weights[-2].shape[1]
    X = np.zeros((Nframes, dim))
    inFeatList = open(feat_dir + FeatList).readlines()
    fro = 0
    to = 0
    for fname in inFeatList:
        if fname.rstrip()[-1] == ':':
            fdir = fname.rstrip()[:-1] + '/'
            continue
        elif fname.rstrip()[-3:] == 'txt':
            utt = np.loadtxt(feat_dir + fdir + fname.rstrip())
            # if not useDropout:
            outputs = gpu.as_numpy_array(net.fprop_xf(utt, outLayer))
            # else:
            #     outputs = gpu.as_numpy_array(net.fpropDropout(utt, outLayer))
            assert outputs.shape[1] == 40
            fro = to
            to = fro + outputs.shape[0]
            X[fro:to] = outputs
            np.save(feat_dir + outFeatDir + 'train_prePCA/' + fname[:-5],
                    outputs)
            del outputs
            gpu.free_reuse_cache()
    return X
def costAndGrad(self, data, labels):
    # forward prop
    self.hActs[0] = data
    i = 1
    for w, b in self.stack:
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        if i <= len(self.layerSizes):
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.exp(probs)
    probs = probs / gp.sum(probs, axis=0)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * np.nansum(
        gp.as_numpy_array(labelMat * gp.log(probs)))

    if not self.train:
        return cost, None

    # back prop
    self.deltas[-1] = probs - labelMat
    i = len(self.layerSizes) - 1
    for w, b in reversed(self.stack[1:]):
        grad = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * grad
        i -= 1

    # compute gradients
    for i in range(len(self.grad)):
        self.grad[i][0] = (1. / self.mbSize) * self.deltas[i].dot(
            self.hActs[i].T)
        self.grad[i][1] = (1. / self.mbSize) * gp.sum(
            self.deltas[i], axis=1).reshape(-1, 1)
    return cost, self.grad
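# The max-subtraction before exponentiation in costAndGrad above is the
# standard numerically stable soft-max; a numpy-only equivalent for a
# column-per-example activation matrix (illustrative helper, not part of the
# original class):
def stable_softmax(a):
    e = np.exp(a - np.max(a, axis=0))  # subtracting the column max avoids overflow
    return e / np.sum(e, axis=0)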
def getWeights(self):
    if self.gpu:
        return gpu.as_numpy_array(self.W)
    else:
        return self.W
def costAndGrad(self, data, labels, key=None):
    """
    Forward prop the entire utterance.
    Call the CTC cost function.
    Compute the gradient.

    data is a 2-D matrix where each column is a single time frame.
    The number of input frames changes across iterations.

    labels is a vector of symbol ids, of unknown length, and does not
    depend on the number of time frames.
    """
    ## forward prop
    T = data.shape[1]
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    stackMax = len(self.stack) - 1
    if self.temporalLayer > 0:
        stackMax -= 1

    self.hActs = [gp.empty((s, T)) for s in sizes]
    self.hActs[0] = data
    i = 1
    for l in range(stackMax + 1):
        w, b = self.stack[l]
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        # loop over time for the recurrent layer
        if (self.temporalLayer - 1) == l:
            for t in range(T):
                if t > 0:
                    self.hActs[i][:, t] += self.stack[-1][0].dot(
                        self.hActs[i][:, t - 1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:, t] = self.activation(self.hActs[i][:, t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    # convert the final layer to probs after all time iterations are complete
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs / np.sum(probs, axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to a different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs, delta_output))

    if not self.train:
        return cost, None

    delta_output = gp.garray(delta_output)

    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)]
                 for w, b in self.stack]
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:, t].T

        # compute gradient for output layer
        # TODO can we get rid of some of these annoying reshape(-1, 1)?
        self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
            self.hActs[-2][:, t].reshape(-1, 1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes) - 1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i:
                delta += delta_t
            # push delta through activation function for this layer
            delta = delta * self.activation(self.hActs[i + 1][:, t], True)
            # compute the gradient
            self.grad[i][0] += delta.reshape(-1, 1).dot(
                self.hActs[i][:, t].T.reshape(1, -1))
            self.grad[i][1] += delta.reshape(-1, 1)
            # add the temporal gradient if this is the recurrent layer
            if (self.temporalLayer - 1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)
                # HACK: no bias for the temporal layer; give it a gradient of 0
                self.grad[-1][1] = np.zeros((2, 1))
            # push the delta downward
            w, b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    return cost, self.grad, skip
def decode(self, z, in_shape, **kwargs):
    r = gnp.as_numpy_array(self.ae.decoder.forward_prop(z))
    out_shape = self.convnet.compute_output_shape(in_shape)
    assert out_shape.size() == r.shape[1]
    return self.convnet.recover_input(r, in_shape, **kwargs)
def gather(x):
    """Copies an array from the GPU when running on the GPU"""
    if GPU:
        return gnumpy.as_numpy_array(x)
    else:
        return x
def costAndGrad(self, data, labels=None, key=None):
    """
    Forward prop the entire utterance.
    Call the CTC cost function.
    Compute the gradient.

    data is a 2-D matrix where each column is a single time frame.
    The number of input frames changes across iterations.

    labels is a vector of symbol ids, of unknown length, and does not
    depend on the number of time frames.
    """
    ## forward prop
    # this is the same as minibatch forward prop
    # since we pre-compute context window features for each time
    self.hActs[0] = data
    i = 1
    for w, b in self.stack:
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        if i <= len(self.layerSizes):
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs / np.sum(probs, axis=0)
    # probs[probs < 1e-12] = 1e-12  # TODO have to clamp?

    ## pass probs and label string to ctc loss
    # TODO how much does passing to a different function cost us?
    if not self.train:
        return ctc.decode_best_path(probs, ref=labels, blank=0)
        #return ctc.decode_bp_bigrams(probs, blank=0, B=None)

    cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0)

    # Bad utterance?
    if skip:
        return cost, self.grad, skip

    # Store probabilities and error signal for a given key
    #if key is not None and key in self.hist:
    #    self.hist[key].append((probs, self.deltas[-1]))

    self.deltas[-1] = gp.garray(self.deltas[-1])

    # back prop
    i = len(self.layerSizes) - 1
    for w, b in reversed(self.stack[1:]):
        grad = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * grad
        i -= 1

    # compute gradients
    # NOTE we do not divide by utterance length.
    # Will need to scale up the weight norm penalty accordingly.
    for i in range(len(self.grad)):
        self.grad[i][0] = self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = gp.sum(self.deltas[i], axis=1).reshape(-1, 1)
    return cost, self.grad, skip