def process(self, x, y):
    self._x = x
    self._y = y

    p_y_given_x = T.nnet.softmax(chained_output(self.layers, x))
    results = T.argmax(p_y_given_x, axis=1)

    self.theta = [param for layer in self.layers for param in [layer.W, layer.b]]
    self.errors = T.mean(T.neq(results, y))
    # negative log-probability of the correct class for each example
    self.cost_vector = -T.log(p_y_given_x)[T.arange(y.shape[0]), y]
    self.cost = T.mean(self.cost_vector)

    return None
def __init__(self, rng, input, nhistory, feature, n_feat, n_in, n_out, N=4096,
             W=None, sparse=None, activation=None):
    self.input = input

    # initialize the projection matrix, or wrap a provided one in a shared variable
    if W is None:
        W_values = numpy.asarray(rng.uniform(
            low=-numpy.sqrt(6. / (N*n_in + n_out)),
            high=numpy.sqrt(6. / (N*n_in + n_out)),
            size=(N*n_in, n_out)), dtype=theano.config.floatX)
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4
        W = theano.shared(value=W_values, name='W', borrow=True)
    else:
        w_values = W
        W = theano.shared(value=w_values, name='W', borrow=True)
    self.W = W

    # project the current input (optionally with features appended)
    if ProjectFeat and n_feat > 0:
        if not UseDirect:
            self.input_feat = T.concatenate((self.input, feature[:, 0:n_feat]), axis=1)
            lin_output = T.dot(self.input_feat, self.W)
        else:
            self.input_feat = T.concatenate((self.input, feature[:, 0:n_feat]), axis=1)
            lin_output = self.W[T.arange(self.input_feat.shape[0])]
    else:
        if not UseDirect:
            lin_output = T.dot(self.input, self.W)
        else:
            lin_output = self.W[T.arange(self.input)]

    # append the projection of each history step
    for history in nhistory:
        if not UseDirect:
            if ProjectFeat and n_feat > 0:
                history = T.concatenate((history, feature[:, n_feat:n_feat*2]), axis=1)
            lin_output = T.concatenate((lin_output, T.dot(history, self.W)), axis=1)
        else:
            # implement features
            lin_output = T.concatenate((lin_output, self.W[T.arange(history)]), axis=1)

    if n_feat == 0:
        self.output = lin_output
    else:
        if ProjectFeat == 1:
            self.output = lin_output
        else:
            self.output = T.concatenate((lin_output, feature), axis=1)

    # parameters of the model
    self.params = [self.W]
def __init__(self, D, K, hidden_layer_sizes):
    # input size D, output size K
    # starting learning rate and other hyperparams
    lr = 10e-4
    mu = 0.7
    decay = 0.999

    # create the graph
    # K = number of actions
    self.layers = []
    M1 = D
    for M2 in hidden_layer_sizes:
        layer = HiddenLayer(M1, M2)
        self.layers.append(layer)
        M1 = M2

    # final layer
    layer = HiddenLayer(M1, K, lambda x: x, use_bias=False)
    self.layers.append(layer)

    # get all params for gradient later
    params = []
    for layer in self.layers:
        params += layer.params
    caches = [theano.shared(np.ones_like(p.get_value())*0.1) for p in params]
    velocities = [theano.shared(p.get_value()*0) for p in params]

    # inputs and targets
    X = T.matrix('X')
    actions = T.ivector('actions')
    advantages = T.vector('advantages')

    # calculate output and cost
    Z = X
    for layer in self.layers:
        Z = layer.forward(Z)
    action_scores = Z
    p_a_given_s = T.nnet.softmax(action_scores)

    # no one-hot: index the log-probabilities of the chosen actions directly
    selected_probs = T.log(p_a_given_s[T.arange(actions.shape[0]), actions])
    cost = -T.sum(advantages * selected_probs)

    # specify update rule (RMSprop cache with momentum)
    grads = T.grad(cost, params)
    g_update = [(p, p + v) for p, v in zip(params, velocities)]
    c_update = [(c, decay*c + (1 - decay)*g*g) for c, g in zip(caches, grads)]
    v_update = [(v, mu*v - lr*g / T.sqrt(c)) for v, c, g in zip(velocities, caches, grads)]
    # v_update = [(v, mu*v - lr*g) for v, g in zip(velocities, grads)]
    # c_update = []
    updates = c_update + g_update + v_update

    # compile functions
    self.train_op = theano.function(
        inputs=[X, actions, advantages],
        updates=updates,
        allow_input_downcast=True
    )
    self.predict_op = theano.function(
        inputs=[X],
        outputs=p_a_given_s,
        allow_input_downcast=True
    )
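# A small NumPy sketch (my assumption, not code from this collection) of the
# update rule compiled above: an RMSprop-style cache combined with momentum.
# Theano applies all updates in one call from the pre-update values, so the new
# parameter uses the old velocity and the new velocity uses the old cache.
import numpy as np

def rmsprop_momentum_step(p, v, c, g, lr=1e-3, mu=0.7, decay=0.999):
    p_new = p + v                              # g_update
    c_new = decay * c + (1 - decay) * g * g    # c_update
    v_new = mu * v - lr * g / np.sqrt(c)       # v_update, using the old cache
    return p_new, v_new, c_new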
def argmax_alpha_sample(self, state, pctx, context):
    alpha = self.alpha(state, pctx)
    # index of the largest attention weight in each row
    alpha_argmax = T.argmax(alpha, axis=1, keepdims=True)
    # one-hot vector marking the argmax position per row
    return T.cast(T.eq(T.arange(alpha.shape[1])[None, :], alpha_argmax), 'float32')
def negative_log_likelihood(self, y):
    # mean negative log-probability of the correct class
    return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
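# A minimal, self-contained sketch (not taken from any snippet above) of the
# indexing idiom T.log(p)[T.arange(y.shape[0]), y]: for each row it selects the
# log-probability of that row's correct class. All names here are illustrative.
import numpy as np
import theano
import theano.tensor as T

p = T.matrix('p')   # (n_examples, n_classes) class probabilities
y = T.ivector('y')  # (n_examples,) correct class indices
nll = -T.mean(T.log(p)[T.arange(y.shape[0]), y])
f = theano.function([p, y], nll)

probs = np.array([[0.7, 0.3], [0.2, 0.8]], dtype=theano.config.floatX)
labels = np.array([0, 1], dtype='int32')
print(f(probs, labels))  # mean of -log(0.7) and -log(0.8)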
    return shared_x, shared_y

test_set_x, test_set_y = shared_dataset(test_set)
valid_set_x, valid_set_y = shared_dataset(valid_set)
train_set_x, train_set_y = shared_dataset(train_set)

batch_size = 500
data = train_set_x[2 * batch_size:3 * batch_size]
label = train_set_y[2 * batch_size:3 * batch_size]
# print data, label

# zero_one_loss = T.sum(T.neq(T.argmax(p_y_given_x), y))
NLL = -1 * T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
L1 = T.sum(abs(param))
L2 = T.sum(param**2)
loss = NLL + lambda_1 * L1 + lambda_2 * L2
# loss = tn.function()

d_loss_wrt_params = T.grad(loss, params)
updates = [(params, params - learning_rate * d_loss_wrt_params)]
MSGD = tn.function([x_batch, y_batch], loss, updates=updates)

for (x_batch, y_batch) in train_batches:
    print "Current loss is %f" % MSGD(x_batch, y_batch)
    if stopping_condition_is_met:
        return params
def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3,
        batch_sz=30, epochs=100, show_fig=True):
    # step 1: cast hyperparameters to a suitable type and preprocess the input data
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Xtrain, Ytrain = X[:-1000], Y[:-1000]

    # step 2: initialize weights in convpool layers and mlp layers
    # convpool layers use padding='valid'
    N, c, width, height = Xtrain.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.conv_pool_size:
        h = ConvpoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(h)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # mlp initialization
    K = len(set(Ytrain))  # number of classes
    M1 = self.conv_pool_size[-1][0] * outw * outh
    count = 0
    self.hidden_layers = []
    for M2 in self.hidden_layer_size:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        count += 1
        M1 = M2

    # the output layer
    W, b = weights_and_bias_init(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect all parameters as a list
    self.params = [self.W, self.b]
    for h in self.convpool_layers:
        self.params += h.params
    for h in self.hidden_layers:
        self.params += h.params

    # step 3: theano expressions for cost, prediction, and updates
    # initialize momentum and RMSprop storage (only momentum is applied in the
    # updates below; cache and eps are prepared but unused)
    dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]
    cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

    thX = T.tensor4('X', dtype='float32')
    thT = T.ivector('T')
    pY = self.th_forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thT.shape[0]), thT])) + rcost
    prediction = self.th_predict(thX)

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    cost_predict_op = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

    updates = [
        (p, p + mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]
    train_op = theano.function(inputs=[thX, thT], updates=updates)

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Ytrain[j * batch_sz:(j * batch_sz + batch_sz)]

            train_op(Xbatch, Ybatch)
            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()