def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False): N, D = X.shape n_batches = int(N / batch_sz) W0 = init_weights((D, self.M)) self.W = theano.shared(W0, 'W_%s' % self.id) self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id) self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id) self.params = [self.W, self.bh, self.bo] self.forward_params = [self.W, self.bh] # TODO: technically these should be reset before doing backprop self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id) self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id) self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id) self.dparams = [self.dW, self.dbh, self.dbo] self.forward_dparams = [self.dW, self.dbh] X_in = T.matrix('X_%s' % self.id) X_hat = self.forward_output(X_in) # attach it to the object so it can be used later # must be sigmoidal because the output is also a sigmoid H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh) self.hidden_op = theano.function( inputs=[X_in], outputs=H, ) # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D) cost_op = theano.function( inputs=[X_in], outputs=cost, ) updates = [ (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] + [ (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] train_op = theano.function( inputs=[X_in], updates=updates, ) costs = [] print("training autoencoder: %s" % self.id) for i in range(epochs): print("epoch:", i) X = shuffle(X) for j in range(n_batches): batch = X[j*batch_sz:(j*batch_sz + batch_sz)] train_op(batch) the_cost = cost_op(X) # technically we could also get the cost for Xtest here print("j / n_batches:", j, "/", n_batches, "cost:", the_cost) costs.append(the_cost) if show_fig: plt.plot(costs) plt.savefig('AE_costs.jpg')
def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
    N, D = X.shape
    n_batches = N // batch_sz  # integer division so range() gets an int

    W0 = init_weights((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
    self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
    self.params = [self.W, self.bh, self.bo]
    self.forward_params = [self.W, self.bh]

    # TODO: technically these should be reset before doing backprop
    self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
    self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
    self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dbh, self.dbo]
    self.forward_dparams = [self.dW, self.dbh]

    X_in = T.matrix('X_%s' % self.id)
    X_hat = self.forward_output(X_in)

    # attach it to the object so it can be used later
    # must be sigmoidal because the output is also a sigmoid
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh)
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
    cost_op = theano.function(
        inputs=[X_in],
        outputs=cost,
    )

    updates = [
        (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ] + [
        (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ]
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print("training autoencoder: %s" % self.id)
    for i in range(epochs):
        print("epoch:", i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            train_op(batch)
            the_cost = cost_op(X)  # technically we could also get the cost for Xtest here
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost)
            costs.append(the_cost)
    if show_fig:
        plt.plot(costs)
        plt.show()
def __init__(self, hidden_layer_sizes, keep_probs): self.hidden_layer_sizes = hidden_layer_sizes self.keep_probs = keep_probs #list of all parameters except first and final layer self.all_params = [] m1 = self.hidden_layer_sizes[0] self.count = 1 for m2 in self.hidden_layer_sizes[1:]: #dont add bias due to batch_normalization w_init, _ = init_weights(m1, m2) W = tf.Variable(w_init, name='W' + str(self.count)) #batch normalization parameters gamma = tf.Variable(np.ones(m2, dtype=np.float32), name='Gamma' + str(self.count)) beta = tf.Variable(np.zeros(m2, dtype=np.float32), name='Beta' + str(self.count)) running_mean = tf.Variable(np.zeros(m2, dtype=np.float32), trainable=False, name='Rn_mean' + str(self.count)) running_var = tf.Variable(np.zeros(m2, dtype=np.float32), trainable=False, name='Rn_var' + str(self.count)) self.all_params += [{ 'W': W, 'gamma': gamma, 'beta': beta, 'rn_mean': running_mean, 'rn_var': running_var }] self.count += 1 m1 = m2
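# The constructor above only allocates the batch-norm parameters; the forward
# pass lives elsewhere in the class. A hypothetical TF1-style sketch (matching
# the tf.Variable usage above) of how one entry of all_params might be
# consumed -- the function name, decay, and epsilon are assumptions, not the
# original code:
import tensorflow as tf

def bn_layer_forward(Z, layer, is_training, decay=0.9, eps=1e-4):
    a = tf.matmul(Z, layer['W'])
    if is_training:
        # normalize with batch statistics and fold them into the running stats
        batch_mean, batch_var = tf.nn.moments(a, axes=[0])
        update_mean = tf.assign(layer['rn_mean'],
                                decay * layer['rn_mean'] + (1 - decay) * batch_mean)
        update_var = tf.assign(layer['rn_var'],
                               decay * layer['rn_var'] + (1 - decay) * batch_var)
        with tf.control_dependencies([update_mean, update_var]):
            a = tf.nn.batch_normalization(a, batch_mean, batch_var,
                                          layer['beta'], layer['gamma'], eps)
    else:
        # at test time, use the accumulated running statistics
        a = tf.nn.batch_normalization(a, layer['rn_mean'], layer['rn_var'],
                                      layer['beta'], layer['gamma'], eps)
    return tf.nn.relu(a)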
def fit_to_input(self, k, learning_rate=1.0, mu=0.99, epochs=100000):
    # This is not very flexible, as you would ideally
    # like to be able to activate any node in any hidden
    # layer, not just the last layer.
    # Exercise for students: modify this function to be able
    # to activate neurons in the middle layers.
    X0 = init_weights((1, self.D))
    X = theano.shared(X0, 'X_shared')
    dX = theano.shared(np.zeros(X0.shape), 'dX_shared')
    Y = self.forward(X)

    t = np.zeros(self.hidden_layers[-1].M)
    t[k] = 1

    cost = -(t*T.log(Y[0]) + (1 - t)*(T.log(1 - Y[0]))).sum()
    updates = [
        (X, X + mu*dX - learning_rate*T.grad(cost, X)),
        (dX, mu*dX - learning_rate*T.grad(cost, X)),
    ]
    train = theano.function(
        inputs=[],
        outputs=cost,
        updates=updates,
    )

    costs = []
    for i in range(epochs):
        if i % 1000 == 0:
            print("epoch:", i)
        the_cost = train()
        costs.append(the_cost)

    plt.plot(costs)
    plt.show()
    return X.get_value()
def __init__(self, M1, M2):
    self.M1 = M1
    self.M2 = M2
    W = init_weights(M1, M2)
    b = np.zeros(M2).astype(np.float32)
    self.W = theano.shared(W, 'W')
    self.b = theano.shared(b, 'b')
    self.params = [self.W, self.b]
def __init__(self, m1, m2):
    # m1: input size & m2: output size
    W = init_weights((m1, m2))
    bi = np.zeros(m2)
    bo = np.zeros(m1)
    self.W = theano.shared(W)
    self.bi = theano.shared(bi)  # input bias
    self.bo = theano.shared(bo)  # output bias
    self.params = [self.W, self.bi, self.bo]
def __init__(self, m1, m2):
    W = init_weights((m1, m2))
    bi = np.zeros(m2, dtype=np.float32)
    bo = np.zeros(m1, dtype=np.float32)
    self.W = theano.shared(W)
    self.bi = theano.shared(bi)
    self.bo = theano.shared(bo)
    self.params = [self.W, self.bi, self.bo]
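# The two constructors above allocate a weight matrix plus separate input and
# output biases but don't show the forward ops. A minimal sketch of the
# tied-weight layer they imply; init_weights and the method names mirror how
# they are called elsewhere in this section and are reconstructions, not the
# original source:
import numpy as np
import theano
import theano.tensor as T

def init_weights(shape):
    # small random init scaled by fan size; float32 so Theano can keep it on GPU
    return (np.random.randn(*shape) / np.sqrt(sum(shape))).astype(np.float32)

class TiedAutoEncoderLayer(object):
    def __init__(self, m1, m2):
        self.W = theano.shared(init_weights((m1, m2)))
        self.bi = theano.shared(np.zeros(m2, dtype=np.float32))  # input bias
        self.bo = theano.shared(np.zeros(m1, dtype=np.float32))  # output bias
        self.params = [self.W, self.bi, self.bo]

    def forward_hidden(self, X):
        # encoder: hidden = sigmoid(X W + bi)
        return T.nnet.sigmoid(X.dot(self.W) + self.bi)

    def forward_output(self, X):
        # decoder reuses W transposed (tied weights): sigmoid(H W^T + bo)
        return T.nnet.sigmoid(self.forward_hidden(X).dot(self.W.T) + self.bo)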
def fit_to_input(self, k, learning_rate=0.00001, mu=0.99, reg=10e-10, epochs=20000):
    # This is not very flexible, as you would ideally
    # like to be able to activate any node in any hidden
    # layer, not just the last layer.
    # Exercise for students: modify this function to be able
    # to activate neurons in the middle layers.
    X0 = init_weights((1, self.D))
    X = theano.shared(X0, 'X_shared')
    dX = theano.shared(np.zeros(X0.shape), 'dX_shared')
    Y = self.forward(X)

    # t = np.zeros(self.hidden_layers[-1].M)
    # t[k] = 1
    # # choose Y[0] b/c it's shape 1xD, we want just a D-size vector, not 1xD matrix
    # cost = -(t*T.log(Y[0]) + (1 - t)*(T.log(1 - Y[0]))).sum() + reg*(X * X).sum()
    cost = -T.log(Y[0, k]) + reg * (X * X).sum()

    updates = [
        (X, X + mu * dX - learning_rate * T.grad(cost, X)),
        (dX, mu * dX - learning_rate * T.grad(cost, X)),
    ]
    train = theano.function(
        inputs=[],
        outputs=[cost, Y],
        updates=updates,
    )

    costs = []
    bestX = None
    for i in range(epochs):
        if i % 1000 == 0:
            print("epoch:", i)
        the_cost, out = train()
        if i == 0:
            print("out.shape:", out.shape)
        # if the_cost < 10:
        #     break
        # stop as soon as the cost rises or diverges; compare against the
        # previous cost *before* appending, otherwise the check is vacuous
        if costs and (the_cost > costs[-1] or np.isnan(the_cost)):
            break
        costs.append(the_cost)
        bestX = X.get_value()

    print("len(costs):", len(costs), "max:", np.max(costs), "min:", np.min(costs))
    plt.plot(costs)
    plt.show()
    return bestX
def __init__(self, expr, steps, batchsize=32, constsize=0, rand=None, update_fn=lasagne.updates.adam, lamb=10.0, binary_ops=DEFAULT_BINARY_OPS, unary_ops=DEFAULT_UNARY_OPS ): super(ProcessorNetwork, self).__init__(expr, batchsize=batchsize, rand=rand, update_fn=update_fn, lamb=lamb ) # self._one = T.constant(1.0) self._steps = steps self._constsize = constsize self._binary_ops = binary_ops self._unary_ops = unary_ops if constsize > 0: self._constants = init_weights([constsize]) self._W_read = [init_weights([3, self.total_readables(t)]) for t in range(steps)] # self._W_scale = theano.shared(np.ones([steps, 3])) self._W_select = init_weights([steps, len(binary_ops) + len(unary_ops)]) # self._params = self._W_read + [self._W_scale, self._W_select] self._params = self._maybe_constants(self._W_read + [self._W_select]) # Regularize self._regularization = T.sum( [penalize_hedging(self._W_read[t][i]) for t in range(steps) for i in range(3)] + [penalize_hedging(self._W_select[t]) for t in range(steps)] ) self._build()
def fit(self, X, lr=10e-4, mu=0.99): N = len(X) M = self.M D = self.D V = self.V #Initialize weights We = init_weights(V, D) Wx = init_weights(D, M) Wh = init_weights(M, M) bh = np.zeros(M).astype(np.float32) h0 = np.zeros(M).astype(np.float32) Wo = init_weights(M, V) bo = np.zeros(V).astype(np.float32) #Create all the theano variables and equations for training and prediction self.set(We, Wx, Wh, bh, h0, Wo, bo, np.float32(lr), np.float32(mu)) #Stochastic Gradient Descent for n in range(2000): n_total = 0 n_correct = 0 tot_cost = 0 if n % 10 == 0: lr *= 0.99 for i in range(N): line = X[i] n_total += len(line) in_seq = [0] + line out_seq = line + [1] #print(in_seq, out_seq) p, c = self.train(in_seq, out_seq) for i in range(len(p)): if p[i] == out_seq[i]: n_correct += 1 tot_cost += c print("iteration:", n, "Cost: ", tot_cost, "classification-rate:", float(n_correct) / n_total) self.save()
def __init__(self,hidden_layer_sizes,keep_probs): self.hidden_layer_sizes = hidden_layer_sizes self.keep_probs = keep_probs #initiate parameters except the first and final layer self.all_params = [] m1 = self.hidden_layer_sizes[0] for m2 in hidden_layer_sizes[1:]: w_init,b_init = init_weights(m1,m2) W = tf.Variable(w_init) b = tf.Variable(b_init) self.all_params += [(W,b)] m1=m2
def fit_to_input(self, k, learning_rate=1.0, mu=0.99, epochs=100000): # This is not very flexible, as you would ideally # like to be able to activate any node in any hidden # layer, not just the last layer. # Exercise for students: modify this function to be able # to activate neurons in the middle layers. # cast hyperperams learning_rate = np.float32(learning_rate) mu = np.float32(mu) # randomly initialize an image X0 = init_weights((1, self.D)) # make the image a shared so theano can update it X = theano.shared(X0, 'X_shared') # get the output of the neural network Y = self.forward(X) # t = np.zeros(self.hidden_layers[-1].M) # t[k] = 1 # # choose Y[0] b/c it's shape 1xD, we want just a D-size vector, not 1xD matrix # cost = -(t*T.log(Y[0]) + (1 - t)*(T.log(1 - Y[0]))).sum() # k = which output node to look at # there is only 1 image, so we select the 0th row of X cost = -T.log(Y[0,k]) updates = momentum_updates(cost, [X], mu, learning_rate) train = theano.function( inputs=[], outputs=[cost, Y], updates=updates, ) costs = [] for i in range(epochs): if i % 10000 == 0: print("epoch:", i) the_cost, out = train() if i == 0: print("out.shape:", out.shape) costs.append(the_cost) plt.plot(costs) plt.show() return X.get_value()
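# momentum_updates is imported from the course's util module and not shown in
# this excerpt; a plausible sketch, consistent with the explicit (p, dp)
# update pairs written out in the other fit() variants above:
import theano
import theano.tensor as T

def momentum_updates(cost, params, mu, learning_rate):
    # classical momentum: one velocity buffer per parameter
    updates = []
    for p in params:
        dp = theano.shared(p.get_value() * 0.0)              # velocity, same shape as p
        new_dp = mu * dp - learning_rate * T.grad(cost, p)   # velocity update
        updates.append((dp, new_dp))
        updates.append((p, p + new_dp))                      # equals p + mu*dp - lr*grad
    return updates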
def fit_to_input(self, k, learning_rate=0.00001, mu=0.99, reg=10e-10, epochs=20000):
    # This is not very flexible, as you would ideally
    # like to be able to activate any node in any hidden
    # layer, not just the last layer.
    # Exercise for students: modify this function to be able
    # to activate neurons in the middle layers.
    X0 = init_weights((1, self.D))
    X = theano.shared(X0, 'X_shared')
    dX = theano.shared(np.zeros(X0.shape), 'dX_shared')
    Y = self.forward(X)

    # t = np.zeros(self.hidden_layers[-1].M)
    # t[k] = 1
    # # choose Y[0] b/c it's shape 1xD, we want just a D-size vector, not 1xD matrix
    # cost = -(t*T.log(Y[0]) + (1 - t)*(T.log(1 - Y[0]))).sum() + reg*(X * X).sum()
    cost = -T.log(Y[0, k]) + reg*(X * X).sum()

    updates = [
        (X, X + mu*dX - learning_rate*T.grad(cost, X)),
        (dX, mu*dX - learning_rate*T.grad(cost, X)),
    ]
    train = theano.function(
        inputs=[],
        outputs=[cost, Y],
        updates=updates,
    )

    costs = []
    bestX = None
    for i in range(epochs):
        if i % 1000 == 0:
            print("epoch:", i)
        the_cost, out = train()
        if i == 0:
            print("out.shape:", out.shape)
        # if the_cost < 10:
        #     break
        # stop as soon as the cost rises or diverges; compare against the
        # previous cost *before* appending, otherwise the check is vacuous
        if costs and (the_cost > costs[-1] or np.isnan(the_cost)):
            break
        costs.append(the_cost)
        bestX = X.get_value()

    print("len(costs):", len(costs), "max:", np.max(costs), "min:", np.min(costs))
    plt.plot(costs)
    plt.show()
    return bestX
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100):
    # greedy layer-wise training of autoencoders
    pretrain_epochs = 1
    if not pretrain:
        pretrain_epochs = 0

    current_input = X
    for ae in self.hidden_layers:
        ae.fit(current_input, epochs=pretrain_epochs)
        # create current_input for the next layer
        current_input = ae.hidden_op(current_input)

    # initialize logistic regression layer
    N = len(Y)
    K = len(set(Y))
    W0 = init_weights((self.hidden_layers[-1].M, K))
    self.W = theano.shared(W0, "W_logreg")
    self.b = theano.shared(np.zeros(K), "b_logreg")

    self.params = [self.W, self.b]
    for ae in self.hidden_layers:
        self.params += ae.forward_params

    # for momentum
    self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg")
    self.db = theano.shared(np.zeros(K), "db_logreg")
    self.dparams = [self.dW, self.db]
    for ae in self.hidden_layers:
        self.dparams += ae.forward_dparams

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    # squared_magnitude = [(p*p).sum() for p in self.params]
    # reg_cost = T.sum(squared_magnitude)
    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))  #+ reg*reg_cost
    prediction = self.predict(X_in)
    cost_predict_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
    )

    updates = [
        (p, p + mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ] + [
        (dp, mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ]
    # updates = [(p, p - learning_rate*T.grad(cost, p)) for p in self.params]
    train_op = theano.function(
        inputs=[X_in, targets],
        updates=updates,
    )

    n_batches = N // batch_sz  # integer division so range() gets an int
    costs = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
            train_op(Xbatch, Ybatch)
            the_cost, the_prediction = cost_predict_op(Xtest, Ytest)
            error = error_rate(the_prediction, Ytest)
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error)
            costs.append(the_cost)
    plt.plot(costs)
    plt.show()
def __init__(self, D, M):
    W = init_weights((D, M))
    b = np.zeros(M)
    self.W = theano.shared(W)
    self.b = theano.shared(b)
    self.params = [self.W, self.b]
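# The HiddenLayer constructors above don't show the forward pass; a one-line
# sketch of what it presumably looks like. The ReLU choice is an assumption
# (the relu helper is imported from util in the last snippet of this section):
def forward(self, X):
    return relu(X.dot(self.W) + self.b)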
def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
    N, D = X.shape
    n_batches = N // batch_sz  # must be an int for range() below

    W0 = init_weights((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
    self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
    self.params = [self.W, self.bh, self.bo]
    # the deep neural network class will need to use these
    self.forward_params = [self.W, self.bh]

    # TODO: technically these should be reset before doing backprop
    # defining the changes in each variable, since we're using momentum
    self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
    self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
    self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dbh, self.dbo]
    self.forward_dparams = [self.dW, self.dbh]

    # tensor input (matrix)
    X_in = T.matrix('X_%s' % self.id)
    X_hat = self.forward_output(X_in)  # the reconstruction

    # attach it to the object so it can be used later
    # must be sigmoidal because the output is also a sigmoid
    # defining the hidden layer operation as a theano function since it will
    # be used in the deep neural network class
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh)
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # squared error cost function:
    # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N
    # cross entropy cost function:
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
    cost_op = theano.function(
        inputs=[X_in],
        outputs=cost,
    )

    # gradient descent:
    updates = [
        (p, p + mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ] + [
        (dp, mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ]
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print("training autoencoder: %s" % self.id)
    for i in range(epochs):
        print("epoch:", i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            train_op(batch)
            the_cost = cost_op(X)  # technically we could also get the cost for Xtest here
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost)
            costs.append(the_cost)
    if show_fig:
        plt.plot(costs)
        plt.show()
def aup(paras):
    total_anchor = paras.total_anchor
    #train_ratio = paras.train_ratio
    load_path_a = paras.feature_A
    load_path_b = paras.feature_B
    cuda = torch.device("cuda:0")
    dim = 56  #paras.represent_dim
    lr = paras.lr
    lr_step = paras.lr_step
    lr_prob = paras.lr_prob
    N = paras.N
    stop_P = paras.stop_P
    is_classification = paras.is_classification
    represent_epoch = paras.represent_epoch
    classification_epoch = paras.classification_epoch

    a_array_load = np.load(load_path_a)
    b_array_load = np.load(load_path_b)
    a_array_tensor = torch.Tensor(a_array_load)
    b_array_tensor = torch.Tensor(b_array_load)
    len_f = a_array_load.shape[0]
    len_t = b_array_load.shape[0]
    print(len_f, len_t)
    node_f = list(range(0, len_f))
    node_t = list(range(0, len_t))
    anchor_all = list(range(0, total_anchor))
    rd.seed(80)
    left_anchor, right_anchor = data.get_train_anchor()
    #anchor_train = rd.choice(anchor_all, int(train_ratio * total_anchor))
    #anchor_test = list(set(anchor_all) - set(anchor_train))
    anchor_test = data.get_test_anchor()

    model = SiameseNetwork(dim, len_f, len_t).to(device=cuda)
    init_weights(model)
    neta = NETA(len_f, dim).to(device=cuda)
    netb = NETB(len_t, dim).to(device=cuda)
    a_array_tensor = a_array_tensor.to(device=cuda)
    b_array_tensor = b_array_tensor.to(device=cuda)
    mse = nn.MSELoss()
    cos = nn.CosineEmbeddingLoss(margin=0)
    optimizer = optim.Adadelta(model.parameters(), lr=lr, weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step, gamma=lr_prob)

    triplet_neg = 1
    anchor_flag = 1
    anchor_train_len = len(left_anchor)
    anchor_train_a_list = left_anchor
    anchor_train_b_list = right_anchor
    input_a = []
    input_b = []
    classifier_target = torch.empty(0).to(device=cuda)
    np.random.seed(5)
    index = 0
    while index < anchor_train_len:  # number of training anchors
        a = anchor_train_a_list[index]
        b = anchor_train_b_list[index]
        input_a.append(a)
        input_b.append(b)
        an_target = torch.ones(anchor_flag).to(device=cuda)
        classifier_target = torch.cat((classifier_target, an_target), dim=0)
        an_negs_index = list(set(node_t) - {b})
        an_negs_index_sampled = list(np.random.choice(an_negs_index, triplet_neg, replace=False))
        an_as = triplet_neg * [a]
        input_a += an_as
        input_b += an_negs_index_sampled
        an_negs_index1 = list(set(node_f) - {a})
        an_negs_index_sampled1 = list(np.random.choice(an_negs_index1, triplet_neg, replace=False))
        an_as1 = triplet_neg * [b]
        input_b += an_as1
        input_a += an_negs_index_sampled1
        un_an_target = torch.zeros(triplet_neg * 2).to(device=cuda)
        classifier_target = torch.cat((classifier_target, un_an_target), dim=0)
        index += 1

    cosine_target = torch.unsqueeze(2 * classifier_target - 1, dim=1)
    classifier_target = torch.unsqueeze(classifier_target, dim=1)
    ina = a_array_load[input_a]
    inb = b_array_load[input_b]
    ina = torch.Tensor(ina).to(device=cuda)
    inb = torch.Tensor(inb).to(device=cuda)
    tensor_dataset = SiameseNetworkDataset(ina, inb, classifier_target, cosine_target)
    data_loader = DataLoader(tensor_dataset, batch_size=56, shuffle=False)

    hidden_a_for_c = None
    hidden_b_for_c = None
    for epoch in range(represent_epoch):
        model.train()
        scheduler.step()
        train_loss = 0
        loss_rec_a = 0
        loss_rec_b = 0
        loss_reg = 0
        loss_anchor = 0
        for data_batch in data_loader:
            in_a, in_b, c, cosine = data_batch
            cosine = torch.squeeze(cosine, dim=1)
            in_a = torch.unsqueeze(in_a, dim=1).to(device=cuda)
            in_b = torch.unsqueeze(in_b, dim=1).to(device=cuda)
            h_a, h_b, re_a, re_b = model(in_a, in_b)
            loss_rec_a_batch = 100 * mse(re_a, in_a)
            loss_rec_b_batch = 100 * mse(re_b, in_b)
            loss_anchor_batch = 1 * cos(h_a, h_b, cosine)
            loss_reg_batch = 0.001 * (h_a.norm() + h_b.norm())
            loss = loss_reg_batch + loss_rec_a_batch + loss_rec_b_batch + loss_anchor_batch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            loss_rec_a += loss_rec_a_batch.item()
            loss_rec_b += loss_rec_b_batch.item()
            loss_reg += loss_reg_batch.item()
            loss_anchor += loss_anchor_batch.item()

        # copy the two trained towers into the standalone encoders for evaluation
        neta_dict = neta.state_dict()
        netb_dict = netb.state_dict()
        model.cpu()
        trainmodel_dict = model.state_dict()
        trainmodel_dict_a = {k: v for k, v in trainmodel_dict.items() if k in neta_dict}
        trainmodel_dict_b = {k: v for k, v in trainmodel_dict.items() if k in netb_dict}
        neta_dict.update(trainmodel_dict_a)
        netb_dict.update(trainmodel_dict_b)
        neta.load_state_dict(neta_dict)
        netb.load_state_dict(netb_dict)
        neta.eval()
        netb.eval()
        hidden_a = neta(torch.unsqueeze(a_array_tensor, dim=1))
        hidden_b = netb(torch.unsqueeze(b_array_tensor, dim=1))

        psenode = []
        for i in range(5313, 5469):  # modify with training ratio
            psenode.append(i)
        PatN_v, MatN_v, pp1, pp5, pp10, pp15, pp20, pp25, pp30 = tes_vec(
            hidden_a, hidden_b, left_anchor, right_anchor, anchor_test, N, node_t, psenode)
        PatN_t, MatN_t, p1, p5, p10, p15, p20, p25, p30 = tes_vec(
            hidden_a, hidden_b, anchor_test, anchor_test, right_anchor, N, node_t)
        print('epoch:%d, loss:%.3f, rec_a:%.3f, rec_b:%.3f, anchor:%.3f, reg:%.3f, '
              'at%d, Val(P=%.3f, M=%.3f), Tes(P=%.3f, M=%.3f)\n,Test(%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f)'
              % (epoch, train_loss, loss_rec_a, loss_rec_b, loss_anchor, loss_reg, N,
                 PatN_v, MatN_v, PatN_t, MatN_t, p1, p5, p10, p15, p20, p25, p30))
        if is_classification and PatN_t > stop_P:
            hidden_a_for_c = hidden_a.detach()
            hidden_b_for_c = hidden_b.detach()
            break
        model.to(device=cuda)  # move the model back after the CPU-side state_dict copy

    if is_classification:
        classifier = Classifier().to(device=cuda)
        cel = nn.CrossEntropyLoss()
        hidden_a_for_c = hidden_a_for_c.cpu().numpy()
        hidden_b_for_c = hidden_b_for_c.cpu().numpy()
        ina_for_c = hidden_a_for_c[input_a]
        inb_for_c = hidden_b_for_c[input_b]
        ina_for_c = torch.Tensor(ina_for_c).to(device=cuda)
        inb_for_c = torch.Tensor(inb_for_c).to(device=cuda)
        tensor_dataset_for_c = SiameseNetworkDataset(ina_for_c, inb_for_c, classifier_target, cosine_target)
        data_loader_for_c = DataLoader(tensor_dataset_for_c, batch_size=dim, shuffle=False)
        optimizer_for_c = optim.Adadelta(classifier.parameters(), lr=lr, weight_decay=0.0001)
        scheduler_c = torch.optim.lr_scheduler.StepLR(optimizer_for_c, step_size=lr_step, gamma=lr_prob)

        # classifier
        for epoch in range(classification_epoch):
            classifier.train()
            scheduler_c.step()
            loss_c = 0
            for data_batch in data_loader_for_c:
                in_a, in_b, c, cosine = data_batch
                in_a, in_b = in_a.to(device=cuda), in_b.to(device=cuda)
                in_class = torch.cat((in_a, in_b), dim=1)
                class_out = classifier(in_class)
                c = torch.squeeze(c, dim=1)
                loss_classifier = cel(class_out, c.long())
                optimizer_for_c.zero_grad()
                loss_classifier.backward()
                optimizer_for_c.step()
                loss_c += loss_classifier.item()

            classifier.eval()
            hidden_a_for_c1 = torch.Tensor(hidden_a_for_c).to(device=cuda)
            hidden_b_for_c1 = torch.Tensor(hidden_b_for_c).to(device=cuda)
            PatN_v, MatN_v, pp1, pp5, pp10, pp15, pp20, pp25, pp30 = val_classifier(
                hidden_a_for_c1, hidden_b_for_c1, left_anchor, right_anchor, anchor_test, paras, node_t, classifier)
            PatN_t, MatN_t, p1, p5, p10, p15, p20, p25, p30 = val_classifier(
                hidden_a_for_c1, hidden_b_for_c1, anchor_test, anchor_test, right_anchor, paras, node_t, classifier)
            print('epoch %d, loss %.3f, at%d, Val(P=%.3f, M=%.3f), Tes(P=%.3f, M=%.3f)\n,Test(%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f)'
                  % (epoch, loss_c, N, PatN_v, MatN_v, PatN_t, MatN_t, p1, p5, p10, p15, p20, p25, p30))
def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False): # cast to float mu = np.float64(mu) learning_rate = np.float64(learning_rate) Linhas, Colunas = X.shape n_batches = Linhas // batch_sz W0 = init_weights((Colunas, self.M)) self.W = theano.shared(W0, 'W_%s' % self.id) self.bh = theano.shared(np.zeros(self.M, dtype=np.float64), 'bh_%s' % self.id) self.bo = theano.shared(np.zeros(Colunas, dtype=np.float64), 'bo_%s' % self.id) self.params = [self.W, self.bh, self.bo] self.forward_params = [self.W, self.bh] # TODO: technically these should be reset before doing backprop self.dW = theano.shared(np.zeros(W0.shape, dtype=np.float64), 'dW_%s' % self.id) self.dbh = theano.shared(np.zeros(self.M, dtype=np.float64), 'dbh_%s' % self.id) self.dbo = theano.shared(np.zeros(Colunas, dtype=np.float64), 'dbo_%s' % self.id) self.dparams = [self.dW, self.dbh, self.dbo] self.forward_dparams = [self.dW, self.dbh] X_in = T.matrix('X_%s' % self.id) X_hat = self.forward_output(X_in) # attach it to the object so it can be used later # must be sigmoidal because the output is also a sigmoid H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh) self.hidden_op = theano.function( inputs=[X_in], outputs=H, ) # save this for later so we can call it to # create reconstructions of input self.predict = theano.function( inputs=[X_in], outputs=X_hat, ) cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).flatten().mean() cost_op = theano.function( inputs=[X_in], outputs=cost, ) updates = momentum_updates(cost, self.params, mu, learning_rate) train_op = theano.function( inputs=[X_in], updates=updates, ) costs = [] print("training autoencoder: %s" % self.id) print("epochs to do:", epochs) for i in range(epochs): print("epoch:", i) X = shuffle(X) for j in range(n_batches): batch = X[j * batch_sz:(j * batch_sz + batch_sz)] train_op(batch) the_cost = cost_op(batch) # technically we could also get the cost for Xtest here # if j % 10 == 0: print("j / n_batches:", j, "/", n_batches, "cost:", the_cost) costs.append(the_cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, lr=0.001, mu=0.99): M = self.M V = self.V K = len(set(Y)) #K = 2 lr = np.float32(lr) mu = np.float32(mu) #Form train and test data set XTrain = X[:-50] YTrain = Y[:-50] XTest = X[-50:] YTest = Y[-50:] N = len(XTrain) print(Y) #Initial weights Wx = init_weights(V, M) Wh = init_weights(M, M) bh = np.zeros(M).astype(np.float32) h0 = np.zeros(M).astype(np.float32) Wo = init_weights(M, K) bo = np.zeros(K).astype(np.float32) #Theano Variables self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] # self.dparams = [theano.shared(np.zeros(p.get_value().shape).astype(np.float32)) for p in self.params] thX = T.ivector('X') #T size vector thY = T.iscalar( 'Y') #Output, i.e, 0 for robert frost, 1 for edgar allan #Recurrence to loop through the input sequence def recurrence(x_t, h_t_prev): h_t = T.tanh(self.Wx[x_t] + h_t_prev.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, sequences=thX, n_steps=thX.shape[0], outputs_info=[self.h0, None], ) #Prediction and cost calculation pY = y[-1, 0, :] #y is T x 1 x K pred = T.argmax(pY) cost = -T.mean(T.log(pY[thY])) updates = [(p, p - lr * T.grad(cost, p)) for p in self.params] # + [ # (d, mu*d - lr*T.grad(cost,p)) for p,d in zip(self.params, self.dparams) # ] #Training and prediction function train = theano.function(inputs=[thX, thY], updates=updates, outputs=pY) get_pred_cost = theano.function(inputs=[thX, thY], outputs=[pred, cost]) #Stochastic gradient descent for i in range(500): XTrain, YTrain = shuffle(XTrain, YTrain) lr = lr * 0.9 for n in range(N): x = XTrain[n] y = YTrain[n] p = train(x, y) #Test set n_correct = 0 tot_c = 0 for j in range(len(XTest)): p, c = get_pred_cost(XTest[j], YTest[j]) if p == YTest[j]: n_correct += 1 tot_c += c print("Iteration: ", i, "Cost: ", tot_c, "Classification rate: ", float(n_correct) / len(XTest))
def fit(self, X, Y, lr=10e-7, mu=0.99, batch_sz=100): Y = Y.astype(np.int32) X, Y = shuffle(X,Y) N, c, d, d = X.shape print(len(Y)) K = Y.shape[1] mu = np.float32(mu) lr = np.float32(lr) print("N:", N, "K:", K) #Create the convolution-pooling layers self.convpool_layers=[] mi = c outw = d outh = d for mo, fw, fh in self.convpool_layer_sizes: c = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(c) outw = (outw - fw +1)/ 2 outh = (outh - fh +1)/ 2 mi = mo #Create the hidden layers self.hidden_layers = [] m1 = int(self.convpool_layer_sizes[-1][0]*outw*outh) for m2 in self.hidden_layer_sizes: h = HiddenLayer(m1, m2) self.hidden_layers.append(h) m1 = m2 W = init_weights(m2, K) #Logistic reg layer b = np.zeros([K]).astype(np.float32) #Create theano variables thX = T.tensor4('X', dtype='float32') thY = T.fmatrix('Y') self.W = theano.shared(W, 'W_log') self.b = theano.shared(b, 'b_log') #Create parameter array for updates params = [self.W, self.b] for c in self.convpool_layers: params += c.params for h in self.hidden_layers: params += h.params #Momentum parameters dparams = [theano.shared(np.zeros(p.get_value().shape).astype(np.float32)) for p in params] #Forward pass pY = self.forward(thX) P = T.argmax(pY, axis=1) cost = -(thY * T.log(pY)).sum() #Weight updates updates = [ (p, p + mu*d - lr*T.grad(cost, p)) for p,d in zip(params, dparams) ] + [ (d, mu*d - lr*T.grad(cost, p)) for p,d in zip(params, dparams) ] #Theano function for training and predicting and calculating cost train = theano.function( inputs=[thX, thY], updates=updates, allow_input_downcast=True ) get_cost_prediction = theano.function( inputs=[thX, thY], outputs=[P, cost], allow_input_downcast=True ) #Loop for Batch grad descent no_batches = int(N/batch_sz) for i in range(500): #lr *= 0.9 for n in range(no_batches): Xbatch = X[n*batch_sz:(n*batch_sz+batch_sz)] Ybatch = Y[n*batch_sz:(n*batch_sz+batch_sz)] #print(Xbatch.shape, Ybatch.shape) train(Xbatch, Ybatch) if n%100==0: Yb = np.argmax(Ybatch, axis =1) P, c = get_cost_prediction(Xbatch, Ybatch) #print(P.shape, Ybatch.shape) er = error_rate(P, Yb) print("iteration:", i, "cost:", c, "error rate:", er)
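# ConvPoolLayer is constructed above but not defined in this excerpt; a sketch
# matching the (outw - fw + 1) / 2 size arithmetic (valid-mode convolution
# followed by 2x2 max-pooling). The initialization and the pool_2d `ws`
# keyword (Theano >= 0.9) are assumptions:
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d

class ConvPoolLayer(object):
    def __init__(self, mi, mo, fw, fh):
        # filter shape: (num output maps, num input maps, filter width, filter height)
        W = (np.random.randn(mo, mi, fw, fh) * np.sqrt(2.0 / (mi * fw * fh))).astype(np.float32)
        self.W = theano.shared(W)
        self.b = theano.shared(np.zeros(mo, dtype=np.float32))
        self.params = [self.W, self.b]

    def forward(self, X):
        conv_out = conv2d(input=X, filters=self.W)              # 'valid' border mode
        pool_out = pool_2d(conv_out, ws=(2, 2), ignore_border=True)
        return T.nnet.relu(pool_out + self.b.dimshuffle('x', 0, 'x', 'x'))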
def fit(self, X, activation=relu, lr=0.5, epochs=1, mu=0.99, batch_sz=20, print_period=100, show_fig=False): # X = X.astype(np.float32) mu = np.float32(mu) lr = np.float32(lr) # init hidden layers N, D = X.shape n_batches = N // batch_sz # HiddenLayer could do this but i dont know whats up with the ids W0 = init_weights((D, self.M)) self.W = theano.shared(W0, 'W_%s' % self.id) self.bh = theano.shared(np.zeros(self.M, dtype=np.float32), 'bh_%s' % self.id) self.bo = theano.shared(np.zeros(D, dtype=np.float32), 'bo_%s' % self.id) self.params = [self.W, self.bh, self.bo] self.forward_params = [self.W, self.bh] # shit for momentum # TODO: technically these should be reset before doing backprop self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id) self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id) self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id) self.dparams = [self.dW, self.dbh, self.dbo] self.forward_dparams = [self.dW, self.dbh] X_in = T.matrix('X_%s' % self.id) X_hat = self.forward_output(X_in) H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh) self.hidden_op = theano.function( inputs=[X_in], outputs=H, ) self.predict = theano.function( inputs=[X_in], outputs=X_hat, ) # mse # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N #mean or sum and mse as cost function # cross entropy cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).flatten().mean() cost_op = theano.function( inputs=[X_in], outputs=cost, ) # grad descent + adding momentum changes updates = momentum_updates(cost, self.params, mu, lr) train_op = theano.function( inputs=[X_in], updates=updates, ) costs = [] print("training autoencoder: %s" % self.id) print("epochs to do:", epochs) for i in range(epochs): print("epoch:", i) X = shuffle(X) for j in range(n_batches): batch = X[j * batch_sz:(j * batch_sz + batch_sz)] train_op(batch) the_cost = cost_op( batch ) # technically we could also get the cost for Xtest here if j % 10 == 0: print("j / n_batches:", j, "/", n_batches, "cost:", the_cost) costs.append(the_cost) if show_fig: plt.plot(costs) plt.show()
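# Hypothetical usage of the autoencoder class these fit() variants belong to,
# assuming the getKaggleMNIST loader imported elsewhere in this section and
# inputs already scaled to [0, 1]:
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
ae = AutoEncoder(300, 0)            # 300 hidden units, id 0
ae.fit(Xtrain, epochs=2, show_fig=True)
X_recon = ae.predict(Xtest[:1])     # reconstruction of one test image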
    for ae in self.hidden_layers:
        Z = ae.forward_hidden(Z)
    return Z

def fit_to_input(self, k, learning_rate=1.0, mu=0.99, epochs=100000):
    # This is not very flexible, as you would ideally
    # like to be able to activate any node in any hidden
    # layer, not just the last layer.
    # Exercise for students: modify this function to be able
    # to activate neurons in the middle layers.

    # cast hyperparams
    learning_rate = np.float32(learning_rate)
    mu = np.float32(mu)

    # randomly initialize an image
    X0 = init_weights((1, self.D))
    # make the image a shared so theano can update it
    X = theano.shared(X0, 'X_shared')
def fit(self, X, learning_rate=0.1, epochs=1, batch_sz=100, show_fig=False):
    N, D = X.shape
    n_batches = N // batch_sz

    W0 = init_weights((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.c = theano.shared(np.zeros(self.M), 'c_%s' % self.id)
    self.b = theano.shared(np.zeros(D), 'b_%s' % self.id)
    self.params = [self.W, self.c, self.b]
    self.forward_params = [self.W, self.c]

    # we won't use this to fit the RBM but we will use these for backpropagation later
    # TODO: technically they should be reset before doing backprop
    self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
    self.dc = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
    self.db = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dc, self.db]
    self.forward_dparams = [self.dW, self.dc]

    X_in = T.matrix('X_%s' % self.id)

    # attach it to the object so it can be used later
    # must be sigmoidal because the output is also a sigmoid
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.c)
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # we won't use this cost to do any updates
    # but we would like to see how this cost function changes
    # as we do contrastive divergence
    X_hat = self.forward_output(X_in)
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
    cost_op = theano.function(
        inputs=[X_in],
        outputs=cost,
    )

    # do one round of Gibbs sampling to obtain X_sample
    H = self.sample_h_given_v(X_in)
    X_sample = self.sample_v_given_h(H)

    # define the objective, updates, and train function
    objective = T.mean(self.free_energy(X_in)) - T.mean(self.free_energy(X_sample))

    # need to consider X_sample constant because you can't take the gradient
    # of random numbers in Theano
    updates = [(p, p - learning_rate*T.grad(objective, p, consider_constant=[X_sample])) for p in self.params]
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print("training rbm: %s" % self.id)
    for i in range(epochs):
        print("epoch:", i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            train_op(batch)
            the_cost = cost_op(X)  # technically we could also get the cost for Xtest here
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost)
            costs.append(the_cost)
    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=0.1, epochs=1, batch_sz=100, show_fig=False):
    N, D = X.shape
    n_batches = N // batch_sz  # integer division so range() gets an int

    W0 = init_weights((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.c = theano.shared(np.zeros(self.M), 'c_%s' % self.id)
    self.b = theano.shared(np.zeros(D), 'b_%s' % self.id)
    self.params = [self.W, self.c, self.b]
    self.forward_params = [self.W, self.c]

    # we won't use these to fit the RBM (momentum isn't used in this RBM),
    # but we will use them for backpropagation later
    # TODO: technically they should be reset before doing backprop
    self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
    self.dc = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
    self.db = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dc, self.db]
    self.forward_dparams = [self.dW, self.dc]

    # define our input:
    X_in = T.matrix('X_%s' % self.id)

    # define our hidden op, which is used for our layer-wise pretraining
    # attach it to the object so it can be used later
    # must be sigmoidal because the output is also a sigmoid
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.c)
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # we won't use this cost to do any updates,
    # but we would like to see how this cost function changes
    # as we do contrastive divergence
    X_hat = self.forward_output(X_in)
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
    cost_op = theano.function(
        inputs=[X_in],
        outputs=cost,
    )

    # do one round of Gibbs sampling to obtain X_sample
    H = self.sample_h_given_v(X_in)
    X_sample = self.sample_v_given_h(H)

    # define the objective (free energy of visible step 0 minus the free
    # energy of visible step 1), the updates, and the train function;
    # we're taking the mean since we're doing batch training
    objective = T.mean(self.free_energy(X_in)) - T.mean(self.free_energy(X_sample))

    # need to consider X_sample constant because you can't take the gradient
    # of random numbers in Theano
    updates = [(p, p - learning_rate*T.grad(objective, p, consider_constant=[X_sample])) for p in self.params]
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print("training rbm: %s" % self.id)
    for i in range(epochs):
        print("epoch:", i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            train_op(batch)
            the_cost = cost_op(X)  # technically we could also get the cost for Xtest here
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost)
            costs.append(the_cost)
    if show_fig:
        plt.plot(costs)
        plt.show()
losses = np.empty(decay.size) test_accs = np.empty(decay.size) for idx, decay_rate in enumerate(decay): np.random.seed( 7 ) # seed NumPy's random number generator for reproducibility of results # Initialize neural network nn = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=7, max_iter=1, warm_start=True) nn.fit(X_train, y_train) # Initialize weights nn.coefs_, nn.intercepts_ = init_weights(X_train.shape[1], list(nn.hidden_layer_sizes)) loss_next = compute_loss(X_train, y_train, nn) T = T_init loss = [] start = time.time() for i in range(num_iters): # Save current parameters coefs_prev = nn.coefs_ intercepts_prev = nn.intercepts_ loss_prev = loss_next if debug: print('Iteration # %d' % i) print('Loss = ', loss_prev)
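# compute_loss is referenced above but not defined in this excerpt; a minimal
# sketch consistent with MLPClassifier's probabilistic output (an assumed
# helper, not the original one):
from sklearn.metrics import log_loss

def compute_loss(X, y, nn):
    # mean cross-entropy of the network's current weights on (X, y)
    return log_loss(y, nn.predict_proba(X))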
def fit(self, X, Y, Xtest, Ytest, pretrain=True, train_head_only=False, learning_rate=0.1, mu=0.99, reg=0.0, epochs=1, batch_sz=100): # cast to float64 learning_rate = np.float64(learning_rate) mu = np.float64(mu) reg = np.float64(reg) # greedy layer-wise training of autoencoders pretrain_epochs = 1 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) # create current_input for the next layer current_input = ae.hidden_op(current_input) # initialize logistic regression layer Linhas = len(Y) K = len(set(Y)) W0 = init_weights((self.hidden_layers[-1].M, K)) self.W = theano.shared(W0.astype(np.float64), "W_logreg") self.b = theano.shared(np.zeros(K, dtype=np.float64), "b_logreg") self.params = [self.W, self.b] if not train_head_only: for ae in self.hidden_layers: self.params += ae.forward_params X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) squared_magnitude = [(p * p).sum() for p in self.params] reg_cost = T.sum(squared_magnitude) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) + reg * reg_cost prediction = self.predict(X_in) cost_predict_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], ) updates = momentum_updates(cost, self.params, mu, learning_rate) train_op = theano.function( inputs=[X_in, targets], updates=updates, ) n_batches = Linhas // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error) costs.append(the_cost) plt.plot(costs) plt.show()
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100): # greedy layer-wise training of autoencoders pretrain_epochs = 1 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) # create current_input for the next layer current_input = ae.hidden_op(current_input) # initialize logistic regression layer N = len(Y) K = len(set(Y)) W0 = init_weights((self.hidden_layers[-1].M, K)) self.W = theano.shared(W0, "W_logreg") self.b = theano.shared(np.zeros(K), "b_logreg") self.params = [self.W, self.b] for ae in self.hidden_layers: self.params += ae.forward_params # for momentum self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg") self.db = theano.shared(np.zeros(K), "db_logreg") self.dparams = [self.dW, self.db] for ae in self.hidden_layers: self.dparams += ae.forward_dparams X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) # squared_magnitude = [(p*p).sum() for p in self.params] # reg_cost = T.sum(squared_magnitude) cost = -T.mean( T.log(pY[T.arange(pY.shape[0]), targets]) ) #+ reg*reg_cost prediction = self.predict(X_in) cost_predict_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], ) updates = [ (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] + [ (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] # updates = [(p, p - learning_rate*T.grad(cost, p)) for p in self.params] train_op = theano.function( inputs=[X_in, targets], updates=updates, ) n_batches = N // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error) costs.append(the_cost) plt.plot(costs) plt.show()
def __init__(self, m1, m2):
    W = init_weights((m1, m2))
def nsf(paras):
    cuda = torch.device('cuda:' + str(paras.gpu_id))
    len_anchor = paras.total_anchor
    anchor_all = list(range(0, len_anchor))
    len_s = paras.len_s
    len_t = paras.len_t
    node_f1 = list(range(0, len_s))
    node_f2 = list(range(0, len_t))
    feature_s = paras.feature_s
    feature_t = paras.feature_t
    dim = paras.represent_dim
    ker_size = paras.ker_size
    coefficient = paras.coefficient
    n_epochs = paras.epoch  # renamed: the original reused `epoch` as both the total and the loop variable
    ratio = paras.train_ratio
    margin = paras.epsilon
    lr = paras.lr
    lr_step = paras.lr_step
    lr_prob = paras.lr_prob

    a_array_load = np.load(feature_s)
    a_array_tensor = torch.Tensor(a_array_load)
    b_array_load = np.load(feature_t)
    b_array_tensor = torch.Tensor(b_array_load)

    seeds = list(np.random.randint(0, 10000, 4))
    seed1 = seeds[0]
    seed2 = seeds[1]
    torch.cuda.manual_seed_all(seeds[2])
    torch.manual_seed(seeds[3])
    rd.seed(seed1)
    anchor_train = rd.choice(anchor_all, int(ratio * len_anchor))
    anchor_test = list(set(anchor_all) - set(anchor_train))

    triplet_neg = 1
    anchor_flag = 1
    anchor_train_len = len(anchor_train)
    anchor_train_a_list = anchor_train
    anchor_train_b_list = anchor_train
    input_a = []
    input_b = []
    # 1-D, so it can be concatenated with the 1-D targets built below
    classifier_target = torch.empty(0).to(device=cuda)
    np.random.seed(seed2)
    index = 0
    while index < anchor_train_len:
        a = anchor_train_a_list[index]
        b = anchor_train_b_list[index]
        input_a.append(a)
        input_b.append(b)
        an_target = torch.ones(anchor_flag).to(device=cuda)
        classifier_target = torch.cat((classifier_target, an_target), dim=0)
        an_negs_index = list(set(node_f2) - {b})
        an_negs_index_sampled = list(np.random.choice(an_negs_index, triplet_neg, replace=False))
        an_as = triplet_neg * [a]
        input_a += an_as
        input_b += an_negs_index_sampled
        an_negs_index1 = list(set(node_f1) - {a})
        an_negs_index_sampled1 = list(np.random.choice(an_negs_index1, triplet_neg, replace=False))
        an_as1 = triplet_neg * [b]
        input_b += an_as1
        input_a += an_negs_index_sampled1
        un_an_target = torch.zeros(triplet_neg * 2).to(device=cuda)
        classifier_target = torch.cat((classifier_target, un_an_target), dim=0)
        index += 1

    cosine_target = torch.unsqueeze(2 * classifier_target - 1, dim=1)
    classifier_target = torch.unsqueeze(classifier_target, dim=1)
    ina = a_array_load[input_a]
    inb = b_array_load[input_b]
    ina = torch.Tensor(ina).to(device=cuda)
    inb = torch.Tensor(inb).to(device=cuda)
    tensor_dataset = SiameseNetworkDataset(ina, inb, classifier_target, cosine_target)
    data_loader = DataLoader(tensor_dataset, batch_size=56, shuffle=False)

    P, M = 0, 0
    model = SiameseNetwork(dim, ker_size, len_s, len_t).to(device=cuda)
    init_weights(model)
    neta = NETA(dim, ker_size, len_s).to(device=cuda)
    netb = NETB(dim, ker_size, len_t).to(device=cuda)
    a_array_tensor = a_array_tensor.to(device=cuda)
    b_array_tensor = b_array_tensor.to(device=cuda)
    cos = nn.CosineEmbeddingLoss(margin=0)
    optimizer = optim.Adadelta(model.parameters(), lr=lr, weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step, gamma=lr_prob)

    for epoch in range(n_epochs):
        model.train()
        scheduler.step()
        train_loss = 0
        loss_reg = 0
        loss_anchor = 0
        for data_batch in data_loader:
            in_a, in_b, c, cosine = data_batch
            cosine = torch.squeeze(cosine, dim=1)
            in_a = torch.unsqueeze(in_a, dim=1).to(device=cuda)
            in_b = torch.unsqueeze(in_b, dim=1).to(device=cuda)
            h_a, h_b = model(in_a, in_b)
            loss_anchor_batch = 1 * cos(h_a, h_b, cosine)
            loss_reg_batch = coefficient * (h_a.norm() + h_b.norm())
            loss = loss_reg_batch + loss_anchor_batch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            loss_reg += loss_reg_batch.item()
            loss_anchor += loss_anchor_batch.item()

        # copy the two trained towers into the standalone encoders for evaluation
        neta_dict = neta.state_dict()
        netb_dict = netb.state_dict()
        model.cpu()
        trainmodel_dict = model.state_dict()
        trainmodel_dict_a = {k: v for k, v in trainmodel_dict.items() if k in neta_dict}
        trainmodel_dict_b = {k: v for k, v in trainmodel_dict.items() if k in netb_dict}
        neta_dict.update(trainmodel_dict_a)
        netb_dict.update(trainmodel_dict_b)
        neta.load_state_dict(neta_dict)
        netb.load_state_dict(netb_dict)
        neta.eval()
        netb.eval()
        hidden_a = neta(torch.unsqueeze(a_array_tensor, dim=1))
        hidden_b = netb(torch.unsqueeze(b_array_tensor, dim=1))

        # average the test metrics over the last 30 epochs
        if epoch >= n_epochs - 30:
            PatN_t, MatN_t = tes_vec(hidden_a, hidden_b, anchor_train, anchor_test, node_f2)
            P += PatN_t
            M += MatN_t
        model.to(device=cuda)  # move the model back after the CPU-side state_dict copy

    logging.info('%d %d %d %d %d %.4f %.1f %d %d %.3f %.3f' %
                 (seeds[0], seeds[1], seeds[2], seeds[3], ker_size, coefficient,
                  margin, ratio, dim, P / 30, M / 30))
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100):
    N, D = X.shape
    K = len(set(Y))

    self.hidden_layers = []
    mi = D
    for mo in self.hidden_layer_sizes:
        h = HiddenLayer(mi, mo)
        self.hidden_layers.append(h)
        mi = mo

    # initialize logistic regression layer
    W = init_weights((mo, K))
    b = np.zeros(K)
    self.W = theano.shared(W)
    self.b = theano.shared(b)

    self.params = [self.W, self.b]
    self.allWs = []
    for h in self.hidden_layers:
        self.params += h.params
        self.allWs.append(h.W)
    self.allWs.append(self.W)

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
    prediction = self.predict(X_in)
    # cost_predict_op = theano.function(
    #     inputs=[X_in, targets],
    #     outputs=[cost, prediction],
    # )

    dparams = [theano.shared(p.get_value()*0) for p in self.params]
    grads = T.grad(cost, self.params)
    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]
    train_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
        updates=updates,
    )

    n_batches = N // batch_sz  # integer division so range() gets an int
    costs = []
    lastWs = [W.get_value() for W in self.allWs]
    W_changes = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]
            c, p = train_op(Xbatch, Ybatch)
            if j % 100 == 0:
                print("j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error_rate(p, Ybatch))
            costs.append(c)

            # log changes in all Ws
            W_change = [np.abs(W.get_value() - lastW).mean() for W, lastW in zip(self.allWs, lastWs)]
            W_changes.append(W_change)
            lastWs = [W.get_value() for W in self.allWs]

    W_changes = np.array(W_changes)
    plt.subplot(2, 1, 1)
    for i in range(W_changes.shape[1]):
        plt.plot(W_changes[:, i], label='layer %s' % i)
    plt.legend()
    # plt.show()

    plt.subplot(2, 1, 2)
    plt.plot(costs)
    plt.show()
    learning_rate = np.float32(learning_rate)
    mu = np.float32(mu)

    N, D = X.shape
    K = len(set(Y))

    self.hidden_layers = []
    mi = D
    for mo in self.hidden_layer_sizes:
        h = HiddenLayer(mi, mo)
        self.hidden_layers.append(h)
        mi = mo

    # initialize logistic regression layer
    W = init_weights((mo, K))
    b = np.zeros(K, dtype=np.float32)
    self.W = theano.shared(W)
    self.b = theano.shared(b)

    self.params = [self.W, self.b]
    self.allWs = []
    for h in self.hidden_layers:
        self.params += h.params
        self.allWs.append(h.W)
    self.allWs.append(self.W)
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100):
    # greedy layer-wise training of autoencoders
    pretrain_epochs = 1
    if not pretrain:
        pretrain_epochs = 0

    current_input = X
    for ae in self.hidden_layers:
        # call fit on each autoencoder successively
        ae.fit(current_input, epochs=pretrain_epochs)
        # we then calculate the output at the hidden layer, and we set that as
        # the current_input for the next layer (the next autoencoder)
        current_input = ae.hidden_op(current_input)

    # initialize logistic regression layer
    N = len(Y)
    K = len(set(Y))
    W0 = init_weights((self.hidden_layers[-1].M, K))
    self.W = theano.shared(W0, "W_logreg")
    self.b = theano.shared(np.zeros(K), "b_logreg")

    # we have to add the other parameters from the hidden layers
    self.params = [self.W, self.b]
    for ae in self.hidden_layers:
        self.params += ae.forward_params

    # do the same for momentum
    self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg")
    self.db = theano.shared(np.zeros(K), "db_logreg")
    self.dparams = [self.dW, self.db]
    for ae in self.hidden_layers:
        self.dparams += ae.forward_dparams

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    # previously, we treated the targets as an indicator matrix and the output
    # of the neural network as a matrix of outputs; from here on out we select
    # the elements of pY at the positions where the target is 1
    # squared_magnitude = [(p*p).sum() for p in self.params]
    # reg_cost = T.sum(squared_magnitude)
    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))  #+ reg*reg_cost

    # in order to calculate the error rate, we need to calculate the predictions
    prediction = self.predict(X_in)
    cost_predict_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
    )

    updates = [
        (p, p + mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ] + [
        (dp, mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    ]
    # updates = [(p, p - learning_rate*T.grad(cost, p)) for p in self.params]
    train_op = theano.function(
        inputs=[X_in, targets],
        updates=updates,
    )

    n_batches = N // batch_sz  # integer division so range() gets an int
    costs = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
            train_op(Xbatch, Ybatch)
            the_cost, the_prediction = cost_predict_op(Xtest, Ytest)
            error = error_rate(the_prediction, Ytest)
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error)
            costs.append(the_cost)
    plt.plot(costs)
    plt.show()
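# The supervised fit() above calls self.forward / self.predict, which aren't
# shown. A sketch consistent with the forward_hidden loop that appears in a
# fragment earlier in this section; the softmax head is an assumption:
def forward(self, X):
    Z = X
    for ae in self.hidden_layers:
        Z = ae.forward_hidden(Z)   # encoder half of each pretrained autoencoder
    return T.nnet.softmax(Z.dot(self.W) + self.b)

def predict(self, X):
    return T.argmax(self.forward(X), axis=1)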
def __init__(self, D, M):
    W = init_weights((D, M))
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from util import relu, error_rate, getKaggleMNIST, init_weights


class AutoEncoder(object):
    def __init__(self, M, an_id):
        self.M = M
        self.id = an_id

    def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
        N, D = X.shape
        n_batch = N // batch_sz

        W0 = init_weights((D, self.M))
        self.W = theano.shared(W0, 'W_%s' % self.id)
        self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
        self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
        self.params = [self.W, self.bh, self.bo]
        self.forward_params = [self.W, self.bh]

        self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
        self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
        self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
        self.dparams = [self.dW, self.dbh, self.dbo]
def fit(self, X, learning_rate=0.1, epochs=1, batch_sz=100, show_fig=False): # cast to float32 learning_rate = np.float32(learning_rate) N, D = X.shape n_batches = N // batch_sz W0 = init_weights((D, self.M)) self.W = theano.shared(W0, 'W_%s' % self.id) self.c = theano.shared(np.zeros(self.M), 'c_%s' % self.id) self.b = theano.shared(np.zeros(D), 'b_%s' % self.id) self.params = [self.W, self.c, self.b] self.forward_params = [self.W, self.c] X_in = T.matrix('X_%s' % self.id) # attach it to the object so it can be used later # must be sigmoidal because the output is also a sigmoid H = T.nnet.sigmoid(X_in.dot(self.W) + self.c) self.hidden_op = theano.function( inputs=[X_in], outputs=H, ) # we won't use this cost to do any updates # but we would like to see how this cost function changes # as we do contrastive divergence X_hat = self.forward_output(X_in) cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).mean() cost_op = theano.function( inputs=[X_in], outputs=cost, ) # do one round of Gibbs sampling to obtain X_sample H = self.sample_h_given_v(X_in) X_sample = self.sample_v_given_h(H) # define the objective, updates, and train function objective = T.mean(self.free_energy(X_in)) - T.mean( self.free_energy(X_sample)) # need to consider X_sample constant because you can't take the gradient of random numbers in Theano updates = [( p, p - learning_rate * T.grad(objective, p, consider_constant=[X_sample])) for p in self.params] train_op = theano.function( inputs=[X_in], updates=updates, ) costs = [] print("training rbm: %s" % self.id) for i in range(epochs): print("epoch:", i) X = shuffle(X) for j in range(n_batches): batch = X[j * batch_sz:(j * batch_sz + batch_sz)] train_op(batch) the_cost = cost_op( X) # technically we could also get the cost for Xtest here print("j / n_batches:", j, "/", n_batches, "cost:", the_cost) costs.append(the_cost) if show_fig: plt.plot(costs) plt.show()
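# sample_h_given_v, sample_v_given_h, and free_energy are used by the RBM
# fit() methods above but defined elsewhere in the class; a standard
# binary-RBM sketch (self.rng is assumed to be a RandomStreams created in
# __init__):
from theano.tensor.shared_randomstreams import RandomStreams

def sample_h_given_v(self, v):
    p_h = T.nnet.sigmoid(v.dot(self.W) + self.c)
    return self.rng.binomial(size=p_h.shape, n=1, p=p_h, dtype='float64')

def sample_v_given_h(self, h):
    p_v = T.nnet.sigmoid(h.dot(self.W.T) + self.b)
    return self.rng.binomial(size=p_v.shape, n=1, p=p_v, dtype='float64')

def free_energy(self, v):
    # F(v) = -v.b - sum_j log(1 + exp(c_j + (v W)_j))
    return -v.dot(self.b) - T.sum(T.log(1 + T.exp(v.dot(self.W) + self.c)), axis=1)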
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100):
    N, D = X.shape
    K = len(set(Y))

    self.hidden_layers = []
    mi = D
    for mo in self.hidden_layer_sizes:
        h = HiddenLayer(mi, mo)
        self.hidden_layers.append(h)
        mi = mo

    # initialize logistic regression layer
    W = init_weights((mo, K))
    b = np.zeros(K)
    self.W = theano.shared(W)
    self.b = theano.shared(b)

    self.params = [self.W, self.b]
    self.allWs = []
    for h in self.hidden_layers:
        self.params += h.params
        self.allWs.append(h.W)
    self.allWs.append(self.W)

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
    prediction = self.predict(X_in)
    # cost_predict_op = theano.function(
    #     inputs=[X_in, targets],
    #     outputs=[cost, prediction],
    # )

    dparams = [theano.shared(p.get_value() * 0) for p in self.params]
    grads = T.grad(cost, self.params)
    updates = [
        (p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]
    train_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
        updates=updates,
    )

    n_batches = N // batch_sz  # integer division so range() gets an int
    costs = []
    lastWs = [W.get_value() for W in self.allWs]
    W_changes = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
            c, p = train_op(Xbatch, Ybatch)
            if j % 100 == 0:
                print("j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error_rate(p, Ybatch))
            costs.append(c)

            # log changes in all Ws
            W_change = [np.abs(W.get_value() - lastW).mean() for W, lastW in zip(self.allWs, lastWs)]
            W_changes.append(W_change)
            lastWs = [W.get_value() for W in self.allWs]

    W_changes = np.array(W_changes)
    plt.subplot(2, 1, 1)
    for i in range(W_changes.shape[1]):
        plt.plot(W_changes[:, i], label='layer %s' % i)
    plt.legend()
    # plt.show()

    plt.subplot(2, 1, 2)
    plt.plot(costs)
    plt.show()
basis_L1 = init_basis_hermite(sigma_L1, bases_L1, 5) basis_L2 = init_basis_hermite(sigma_L2, bases_L2, 3) basis_L3 = init_basis_hermite(sigma_L3, bases_L3, 3) alphas_L1 = init_alphas(64, 1, bases_L1) alphas_L2 = init_alphas(64, 64, bases_L2) alphas_L3 = init_alphas(64, 64, bases_L3) w_L1 = T.sum(alphas_L1[:, :, :, None, None] * basis_L1[None, None, :, :, :], axis=2) w_L2 = T.sum(alphas_L2[:, :, :, None, None] * basis_L2[None, None, :, :, :], axis=2) w_L3 = T.sum(alphas_L3[:, :, :, None, None] * basis_L3[None, None, :, :, :], axis=2) w_L4 = init_weights((3136, 10)) #------------------------- # Set up function #------------------------- noise_l1, noise_l2, noise_l3, noise_py_x = model(X, w_L1, w_L2, w_L3, w_L4, 0.2, 0.7) l1, l2, l3, py_x = model(X, w_L1, w_L2, w_L3, w_L4, 0., 0.) y_x = T.argmax(py_x, axis=1) cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y)) params = [alphas_L1, alphas_L2, alphas_L3, w_L4] updates = adadelta(cost, params, learning_rate=lr, rho=0.95, epsilon=1e-6) train = theano.function(inputs=[X, Y, lr],
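# init_alphas is not shown in this excerpt; a plausible sketch matching the
# broadcast in the weighted basis sums above, where each filter is a learned
# linear combination of fixed Hermite basis filters (the scale 0.01 is an
# assumption):
import numpy as np
import theano

def init_alphas(n_out, n_in, n_bases):
    # shape (n_out, n_in, n_bases): one mixing coefficient per
    # (output map, input map, basis filter) triple
    a = np.random.randn(n_out, n_in, n_bases).astype(np.float32) * 0.01
    return theano.shared(a)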