def forward(self): """ Perform a forward step - activate the net input using logistic function """ # Perform the activation (logistic function) self.output.setOutput((1.0 - gpu.exp(-self.input.getNetInput())) / (1.0 + gpu.exp(-self.input.getNetInput())))
def nn_forward_pass(x, w, b, return_all=True):
    """
    Forward pass for a multilayer feed-forward sigmoid neural network.

    Hidden units have sigmoid non-linearity. The output layer is soft-max.

    x: DxN matrix of input data
    w: Weights. List of weight matrices for each layer.
    b: Biases. List of bias vectors for each layer.
    return_all: If True, returns hidden unit activations for each layer.
        If False, just returns the output layer activations.

    Returns a list h where each element is a matrix containing the activations
    for that layer. h[0] is the input data x.
    """
    # ---- TEMP HACK --------------
    # I should find a more seamless way of running in mixed (some operations
    # with numpy, some with gnumpy) mode.
    # I had to resort to this, because I needed the validation classification
    # step in nn_train to run on CPU with numpy. GPU ran out of memory.
    use_gpu = isinstance(x, gnp.garray)

    layer_count = len(w)
    if return_all:
        hs = [x]  # unit activations for each layer
    h = x

    # all layers except the output layer
    for l in range(layer_count - 1):
        if use_gpu:
            a = gnp.dot(w[l].T, h) + b[l]
            h = gnp.logistic(a)
        else:
            a = np.dot(gnp.as_numpy_array(w[l]).T, h) + gnp.as_numpy_array(b[l])
            h = 1.0 / (1 + np.exp(-a))
        if return_all:
            hs.append(h)

    # output layer
    if use_gpu:
        h = gnp.dot(w[-1].T, h) + b[-1]
        h = gnp.exp(h) / gnp.sum(gnp.exp(h), axis=0)  # soft-max
    else:
        h = np.dot(gnp.as_numpy_array(w[-1]).T, h) + gnp.as_numpy_array(b[-1])
        h = np.exp(h) / np.sum(np.exp(h), axis=0)  # soft-max

    if return_all:
        hs.append(h)
        return hs
    else:
        return h
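# Illustrative numpy-only sketch of the same forward pass for a single hidden
# layer, to make the expected shapes concrete. The helper name and the sizes
# D, H, K, N below are arbitrary example values, not taken from the code above.
def _nn_forward_pass_numpy_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    D, H, K, N = 4, 3, 2, 5
    x = rng.randn(D, N)                        # DxN input, one column per sample
    w = [rng.randn(D, H), rng.randn(H, K)]     # per-layer weight matrices
    b = [rng.randn(H, 1), rng.randn(K, 1)]     # per-layer bias column vectors
    a = np.dot(w[0].T, x) + b[0]               # hidden pre-activation, HxN
    h = 1.0 / (1.0 + np.exp(-a))               # sigmoid hidden units
    o = np.dot(w[1].T, h) + b[1]               # output pre-activation, KxN
    p = np.exp(o) / np.exp(o).sum(axis=0)      # soft-max: each column sums to 1
    return p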
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict the class label of input x from a supervised DBN.

    Uses the exact method mentioned in section 6.2 of Hinton, Osindero, Teh 2006.
    The free energy formula is taken from http://deeplearning.net/tutorial/rbm.html

    x: Input data. (NxD matrix)
    """
    L = len(ws_vh)
    N = x.shape[0]

    # make a forward pass to get from the input layer to the visible layer of
    # the top level RBM
    h_prev = x.T

    # forward (bottom-up) pass; deterministic, i.e. we pass the activations,
    # not stochastically sampled states
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]   # number of visible units of the top level RBM
    Hx = h_prev.shape[0]     # number of hidden units in the penultimate layer
    K = H - Hx               # (H - Hx) is the number of supervised inputs to the top level RBM

    # for every class, assume it is the correct label and calculate its free energy
    y = gnp.zeros((K, N))
    free_energy = gnp.zeros((N, K))  # we actually calculate -free_energy
    for k in range(K):
        # set the current assumed class label
        y[k, :] = 1.0

        # visible unit vector
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(ws_v[-1].T, v)  # bias energy term
        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)
        free_energy[:, k] = e_v + e_h

        # zero the class labels for the next iteration
        y[:, :] = 0.0

    # since these numbers may get pretty small, subtract the per-row max
    # (exp-normalize trick) before converting them to probabilities
    fe_max = gnp.max(free_energy, axis=1)[:, gnp.newaxis]
    pred_y = gnp.exp(free_energy - fe_max) / gnp.sum(gnp.exp(free_energy - fe_max), axis=1)[:, gnp.newaxis]

    return pred_y
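# Numpy-only sketch of the per-class score computed in the loop above: the
# negative free energy of an RBM, -F(v) = b.v + sum_j log(1 + exp(c_j + W[:,j].v)).
# The helper name and RBM sizes below are arbitrary example values.
def _rbm_neg_free_energy_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    n_vis, n_hid = 6, 4
    W = rng.randn(n_vis, n_hid)                      # visible-to-hidden weights
    b = rng.randn(n_vis)                             # visible biases
    c = rng.randn(n_hid)                             # hidden biases
    v = (rng.rand(n_vis) > 0.5).astype(float)        # one visible configuration
    e_v = np.dot(b, v)                               # bias energy term
    e_h = np.sum(np.log(1.0 + np.exp(np.dot(W.T, v) + c)))  # hidden unit term
    return e_v + e_h                                 # higher means more probable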
def rect_log(x, computeGrad=False):
    # f(x) = log(1 + x) for x > 0, and 0 otherwise
    if not computeGrad:
        f = gp.log(x * (x > 0) + 1) * (x > 0)
        return f
    # gradient: d/dx log(1 + x) = 1 / (1 + x) for x > 0, and 0 otherwise
    g = (x > 0) / (x * (x > 0) + 1)
    return g
def softmax(self, x):
    x_max = gp.max(x, axis=1)
    x = x - x_max[:, gp.newaxis]
    y = gp.exp(x)
    s = gp.sum(y, 1)
    z = y / s[:, gp.newaxis]
    return z
def forward(self, X, test=False):
    """
    Feed-forward pass through the model
    X: ('batchsize' x 'context') matrix of word indices
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw

    # Obtain word features
    # flatten() defaults to row-major order; order='F' means Fortran (column-major) order
    tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    tmp = tmp.reshape((batchsize, self.K * self.context))  # reshape() in row-major order
    words = np.zeros((batchsize, self.K, self.context))
    for i in range(batchsize):
        words[i, :, :] = tmp[i, :].reshape((self.K, self.context), order='F')
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        # dot() of 2-D matrices is equivalent to matrix multiplication
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def loss_mclr(Yh, Y):
    """Compute multinomial logistic regression loss for Yh, w.r.t. Y.

    Values in Yh should probably be network outputs, and each row in Y must
    be a +1/-1 indicator vector for the target class of a row in Yh.
    """
    obs_count = float(Y.shape[0])
    # Get boolean mask for each observation's target class
    cl_mask = (Y > 0.0)
    # Compute softmax distribution transform of Yh
    sm_sum = gp.sum(gp.exp(Yh), axis=1)
    P = gp.exp(Yh) / sm_sum[:, gp.newaxis]
    dL = (P - cl_mask) / obs_count
    logP = gp.log(P) * cl_mask
    L = -gp.sum(logP) / obs_count
    return {'L': L, 'dL': dL}
def getErrorLoss(self, a0, a2, factor=1.0):
    """ Error is measured by the negative log likelihood. """
    p = gp.exp(-a2) * (a2 ** a0) / self.factor[a0]
    l = gp.log(p)
    return -l.sum(axis=1).mean() * factor
def forward(self): """ Perform a forward step - activate the net input using logistic function """ # Perform the activation self.output.setOutput(gpu.exp(self.input.getNetInput())) self.output.setOutput(self.output.getOutput() / (gpu.garray([gpu.sum(self.output.getOutput(),1)]).transpose()))
def compute_kernel_transformation(self, x_base, x_new):
    x_base = x_base if isinstance(x_base, gnp.garray) else gnp.garray(x_base)
    x_new = x_new if isinstance(x_new, gnp.garray) else gnp.garray(x_new)
    xx = x_new.dot(x_base.T)
    xx_base = (x_base**2).sum(axis=1)
    xx_new = (x_new**2).sum(axis=1)
    return gnp.exp(-1.0 / (2 * self.sigma**2) * (-2 * xx + xx_base + xx_new[:, gnp.newaxis]))
def safe_softmax(self, Y):
    """Compute a reasonably (numerically) safe softmax."""
    Y_max = gp.max(Y, axis=1)
    Y_max = Y_max[:, gp.newaxis]
    Y_exp = gp.exp(Y - Y_max)
    Y_sum = gp.sum(Y_exp, axis=1)
    Y_sum = Y_sum[:, gp.newaxis]
    Y_sm = Y_exp / Y_sum
    return Y_sm
def log_exp_sum_1d(x):
    """
    This computes log(exp(x_1) + exp(x_2) + ... + exp(x_n)) as
    x* + log(exp(x_1 - x*) + exp(x_2 - x*) + ... + exp(x_n - x*)),
    where x* is the max over all x_i. This avoids numerical problems.
    """
    x_max = x.max()
    if isinstance(x, gnp.garray):
        return x_max + gnp.log(gnp.exp(x - x_max).sum())
    else:
        return x_max + np.log(np.exp(x - x_max).sum())
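# Quick numpy check of the identity used above:
# log(sum_i exp(x_i)) == x* + log(sum_i exp(x_i - x*)) with x* = max_i x_i.
# The values and the helper name are arbitrary; with moderate inputs both forms
# agree, but only the shifted form stays finite for large-magnitude x.
def _log_exp_sum_check():
    import numpy as np
    x = np.array([1.0, 2.0, 3.0])
    naive = np.log(np.exp(x).sum())
    x_max = x.max()
    stable = x_max + np.log(np.exp(x - x_max).sum())
    assert np.allclose(naive, stable)
    # with x = np.array([1000.0, 1001.0]) the naive form overflows to inf,
    # while the stable form still returns a finite value
    return stable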
def forward(self, X, Im, test=False):
    """
    Feed-forward pass through the model
    X: ('batchsize' x 'context') matrix of word indices
    """
    batchsize = X.shape[0]
    Im = gpu.garray(Im)
    C = self.C
    M = self.M
    bw = self.bw
    J = self.J
    bj = self.bj
    Wfx = self.Wfx
    Whf = self.Whf
    Wfv = self.Wfv

    # Forwardprop images
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    IF = gpu.dot(Im, gpu.concatenate((J, bj)))
    IF = IF * (IF > 0)

    # Obtain word features
    R = gpu.dot(Wfx, Whf)
    tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    tmp = tmp.reshape((batchsize, self.K * self.context))
    words = np.zeros((batchsize, self.K, self.context))
    for i in range(batchsize):
        words[i, :, :] = tmp[i, :].reshape((self.K, self.context), order='F')
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = acts + gpu.dot(IF, M)

    # Multiplicative interaction
    F = gpu.dot(acts, Wfx) * gpu.dot(IF, Wfv)
    F = gpu.concatenate((F, gpu.ones((batchsize, 1))), 1)

    # Compute softmax
    preds = gpu.dot(F, gpu.concatenate((Whf, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, IF, F, preds.as_numpy_array())
def costAndGrad(self, data, labels):
    # forward prop
    self.hActs[0] = data
    i = 1
    for w, b in self.stack:
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        if i <= len(self.layerSizes):
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    # softmax over the output layer (subtract the per-column max for numerical stability)
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.exp(probs)
    probs = probs / gp.sum(probs, axis=0)
    probs += (probs < 1e-8) * (1e-8 - probs)  # clip probabilities away from zero before the log

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # back prop
    self.deltas[-1] = probs - labelMat
    i = len(self.layerSizes) - 1
    for w, b in reversed(self.stack[1:]):
        grad = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * grad
        i -= 1

    # compute gradients
    for i in range(len(self.grad)):
        self.grad[i][0] = (1. / self.mbSize) * self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = (1. / self.mbSize) * gp.sum(self.deltas[i], axis=1).reshape(-1, 1)
        # add gaussian noise
        # self.grad[i][0] += .01 * gp.randn(self.grad[i][0].shape)
        # self.grad[i][1] += .01 * gp.randn(self.grad[i][1].shape)

    return cost, self.grad
def costAndGrad(self, data, labels):
    # forward prop
    self.hActs[0] = data
    i = 1
    for w, b in self.stack:
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        if i <= len(self.layerSizes):
            # ReLU: (1/2) * (x + sign(x) * x) = max(x, 0)
            self.hActs[i] = (1 / 2.) * (self.hActs[i] + gp.sign(self.hActs[i]) * self.hActs[i])
        i += 1

    # softmax over the output layer (subtract the per-column max for numerical stability)
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.exp(probs)
    probs = probs / gp.sum(probs, axis=0)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # back prop
    self.deltas[-1] = probs - labelMat
    i = len(self.layerSizes) - 1
    for w, b in reversed(self.stack[1:]):
        # ReLU derivative: sign(hActs) is 1 where the unit was active, 0 otherwise
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * gp.sign(self.hActs[i + 1])
        i -= 1

    # compute gradients
    for i in range(len(self.grad)):
        self.grad[i][0] = (1. / self.mbSize) * self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = (1. / self.mbSize) * gp.sum(self.deltas[i], axis=1).reshape(-1, 1)

    return cost, self.grad
def metropolis_flip_sample(self, vis_start, iterations, beta=1, abeta=1):
    """Flips a randomly chosen bit and accepts the change if the resulting
    free energy is lower, or with probability exp(-abeta*dE) where dE is the
    positive difference in energy. Repeats for the given number of iterations."""
    vis = vis_start.copy()
    # free energy of the starting state, at the same inverse temperature as the proposals
    fes = self.free_energy(vis, beta=beta)
    n_total_flips = 0

    for i in range(iterations):
        # flip a bit at random
        f = np.random.randint(0, vis.shape[1])
        vis_prop = vis.copy()
        vis_prop[:, f] = 1 - vis[:, f]

        # calculate new free energy
        fes_prop = self.free_energy(vis_prop, beta=beta)
        fes_diff = fes_prop - fes

        # accept if it is lower, or with negative exponential probability
        fes_smaller = fes_diff <= 0
        acc_p = fes_smaller + (1 - fes_smaller) * gp.exp(-(1 - fes_smaller) * abeta * fes_diff)
        acc_rng = gp.rand(acc_p.shape)
        acc = acc_rng <= acc_p

        # statistics
        n_flips = gp.sum(acc)
        n_total_flips += n_flips

        # compose new state
        acc_t = gp.tile(acc, (vis.shape[1], 1)).T
        vis = acc_t * vis_prop + (1 - acc_t) * vis
        fes = acc * fes_prop + (1 - acc) * fes

    # print "Total number of flips: ", n_total_flips
    return vis
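# Numpy sketch of the acceptance rule used above: a proposed flip is always
# accepted when it lowers the free energy (dF <= 0), and otherwise accepted
# with probability exp(-abeta * dF). The helper name and dF values are
# arbitrary examples.
def _metropolis_accept_prob_sketch(abeta=1.0):
    import numpy as np
    dF = np.array([-2.0, 0.0, 0.5, 3.0])        # free-energy differences of proposals
    lower = (dF <= 0).astype(float)              # 1 where the proposal is downhill
    acc_p = lower + (1.0 - lower) * np.exp(-abeta * dF * (1.0 - lower))
    return acc_p                                 # -> [1.0, 1.0, exp(-0.5), exp(-3.0)]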
def log_exp_sum(x, axis=1):
    x_max = x.max(axis=axis)
    if isinstance(x, gnp.garray):
        return (x_max + gnp.log(gnp.exp(x - x_max[:, gnp.newaxis]).sum(axis=axis))).asarray()
    else:
        return x_max + np.log(np.exp(x - x_max[:, np.newaxis]).sum(axis=axis))
def forward(self):
    self.x = self.f(self.x)
    self.s = gpu.exp(gpu.dot(self.x, self.w) + self.b)
    self.s /= gpu.sum(self.s, 1).reshape(self.q, 1)
def exp(A):
    return gp.exp(A)
def sigmoid(x):
    den = 1.0 + gp.exp(-1.0 * x)
    d = 1.0 / den
    return d
def compute_kernel_matrix(self, x):
    x = x if isinstance(x, gnp.garray) else gnp.garray(x)
    xx = x.dot(x.T)
    x_diag = safe_diag(xx)
    return gnp.exp(-1.0 / (2 * self.sigma**2) * (-2 * xx + x_diag + x_diag[:, gnp.newaxis]))
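# Numpy sketch of the identity behind the two kernel functions above:
# ||a - b||^2 = a.a - 2*a.b + b.b, so the full pairwise squared-distance matrix
# can be built from one Gram matrix and its diagonal. The helper name and the
# sizes are arbitrary example values.
def _rbf_kernel_sketch(sigma=1.0):
    import numpy as np
    rng = np.random.RandomState(0)
    x = rng.randn(5, 3)                            # 5 points in 3 dimensions
    xx = x.dot(x.T)                                # Gram matrix
    sq_norms = np.diag(xx)                         # ||x_i||^2
    sq_dists = -2 * xx + sq_norms + sq_norms[:, np.newaxis]
    K = np.exp(-sq_dists / (2 * sigma**2))         # RBF kernel matrix
    # sanity check against the direct computation
    direct = np.exp(-((x[:, np.newaxis, :] - x[np.newaxis, :, :])**2).sum(-1) / (2 * sigma**2))
    assert np.allclose(K, direct)
    return K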
def softmax_old(x):
    y = gp.max(x, axis=1)[:, gp.newaxis]
    logsumexp = y + gp.log(gp.sum(gp.exp(x - y), axis=1))[:, gp.newaxis]
    return gp.exp(x - logsumexp)
def softmax1(A):
    Z = gp.exp(A)
    return Z / gp.sum(Z, axis=1)[:, gp.newaxis]
def softmax(x):
    return gnp.exp(x) / gnp.exp(x).sum()
def sigmoid(t):
    return 1. / (1. + gnp.exp(-t))
def base_p_vis(self, vis):
    """Probability of visible units in the base rate RBM"""
    # per-unit probability: exp(b_i * v_i) / (1 + exp(b_i)), then product over units
    punit = gp.exp(vis * self.base_bias_vis) / (1 + gp.exp(self.base_bias_vis))
    return gp.prod(punit, axis=1)
def base_partition_function(self):
    """Computes the partition function of the base rate RBM"""
    part_vis = gp.prod(1 + gp.exp(self.base_bias_vis))
    part_hid = 2**self.rbm.n_hid
    return part_vis * part_hid
def activation_softmax(x):
    result = x - g.max(x, axis=1)[:, g.newaxis]
    result = g.exp(result)
    result = result / g.sum(result, axis=1)[:, g.newaxis]
    return result
def sigmoid(x):
    return 1. / (1 + gnp.exp(-x))
def softmax_grounded(b):
    z = gp.zeros((b.shape[0], 1))
    b_ = gp.concatenate((z, b), axis=1)
    y_ = gp.exp(b_)
    return y_ / y_.sum(1)[:, gp.newaxis]
def output(self, A, Z=None):
    # Note: gnumpy does not have an expand_dims function
    amax = A.max(axis=1)
    Y = gnp.exp(A - amax.reshape(amax.size, 1))
    ysum = Y.sum(axis=1)
    return Y / ysum.reshape(ysum.size, 1)
def softmax(A):
    A -= gp.max(A, axis=1)[:, gp.newaxis]
    Z = gp.exp(A)
    return Z / gp.sum(Z, axis=1)[:, gp.newaxis]
def d_exp_penalty(x, sigma):
    return 2 * (1 / sigma) * x * gp.exp(-x**2 / sigma)
def evaluate_unigram_partition(data, batch_size, num_steps, num_ensembles, eos_id,
                               fp='simple-examples/ckpt/random training order unigram_partition_small/',
                               probs_fn='test_set_probs_no_alpha.out'):
    epoch_size = ((len(data) // batch_size) - 1) // num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0

    # load the per-ensemble probability matrices from disk
    full_probs = []
    for i in range(num_ensembles):
        print(i)
        # full_probs[i] = np.loadtxt(fp + 'ensemble' + str(i+1) + '/test_set_probs_no_alpha.out', delimiter=',')
        full_probs.append(np.asarray(pd.read_csv(fp + 'ensemble' + str(i + 1) + '/' + probs_fn,
                                                 delimiter=',', header=None)))
        print(np.shape(full_probs[i]))
        # for ii in range(len(full_probs[i])):
        #     full_probs[i][ii] = full_probs[i][ii] / np.sum(full_probs[i][ii])
    print('reading in probs done')

    # map each sentence-initial word id to the ensemble members responsible for it
    id_to_model = {}
    with open(fp + 'id_to_model.out', 'rb') as f:
        csv_reader = csv.reader(f, delimiter=',', quotechar='|')
        for row in csv_reader:
            row_list = [x for x in row if (x != '[' and x != ']' and x != '' and x != ' ')]
            # print(row_list)
            # row_list = row.split(',')
            row_list = [int(i) for i in row_list]
            id_to_model[row_list[0]] = row_list[1:len(row_list)]
    print('reading in id_to_model done')
    # print(id_to_model[1344])

    # probs = tf.nn.softmax(probs)
    # print(np.sum(probs[0]))
    # print(np.sum(probs[50]))
    # print(len(probs[0]))
    next_is_start_of_sentence = True
    flaggg = True
    # sent_list = reader.get_sentence_list(data=data, eos_id=eos_id)
    for step, (x, y) in enumerate(reader.ptb_iterator(data, batch_size, num_steps)):
        if next_is_start_of_sentence:
            x = x[0, 0]
            if x in id_to_model:
                models_included = id_to_model[x]
                coef = 1
            else:
                models_included = [1, 2, 3, 4, 5, 6, 7, 8, 9]
                coef = 1
        if x == eos_id:
            # cost = -1 * gpu.log(full_probs[0][step])
            models_included = [1]
            coef = 1
            next_is_start_of_sentence = True
        else:
            next_is_start_of_sentence = False
            # coef = 0.5
            # models_included = id_to_model[x]

        # mix the predictions of the selected ensemble members
        probs = 0
        denom = 0
        for m in models_included:
            if m == 1:
                probs += full_probs[m - 1][step]
                denom += 1
            else:
                # coef = 0.5
                probs += coef * full_probs[m - 1][step]
                denom += coef
        probs = probs / float(denom)
        cost = -1 * gpu.log(probs)
        # print(step)
        # print(x)
        # print(y)
        # print(probs)
        # print(probs[0])
        # cost = -1 * gpu.log(probs[step][0, y[0, 0]])
        # print(cost)
        '''
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float64)])
        print(loss)
        cost = tf.reduce_sum(loss) / batch_size
        print(cost)
        '''
        costs += cost
        iters += num_steps

        if step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, gpu.exp(costs / iters),
                   iters * batch_size / (time.time() - start_time)))

    return gpu.exp(costs / iters)
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    y = gnp.exp(pred - pred.max(axis=1)[:, gnp.newaxis])
    y = y / y.sum(axis=1)[:, gnp.newaxis]
    return -(self.target * gnp.log(y + _SMALL_CONSTANT)).sum(), y - self.target
def sigmoid_prime(x):
    den = 1.0 + gp.exp(-1.0 * x)
    d = gp.exp(-1.0 * x) / den**2
    return d
def exp_penalty(x, sigma):
    return x.shape[1] - (gp.exp(-x**2 / sigma)).sum() / x.shape[0]
def sigmoid(z):
    return 1 / (1 + gnp.exp(-z))
def tanh(x):
    return (gnp.exp(x) - gnp.exp(-x)) / (gnp.exp(x) + gnp.exp(-x))
def sigmoid(z):
    return 1 / (1 + gpu.exp(-z))