def ready(self):
    args = self.args
    index = self.index = T.lscalar()
    x = self.x = T.fmatrix()
    y = self.y = T.ivector()

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype("float32"))

    n_d = args.hidden_dim
    layers = self.layers = []
    for i in xrange(args.depth):
        l = Layer(
            n_in=28 * 28 if i == 0 else n_d,
            n_out=n_d,
            activation=ReLU
        )
        layers.append(l)

    output_layer = self.output_layer = Layer(
        n_in=n_d,
        n_out=10,
        activation=softmax
    )

    h = x
    for l in layers:
        h = l.forward(h)
        h = apply_dropout(h, dropout)
    self.h_final = h

    # batch * 10
    probs = self.probs = output_layer.forward(h)

    # batch
    preds = self.preds = T.argmax(probs, axis=1)
    err = self.err = T.mean(T.cast(T.neq(preds, y), dtype="float32"))

    loss = self.loss = -T.mean(T.log(probs[T.arange(y.shape[0]), y]))
    #loss = self.loss = T.mean(T.nnet.categorical_crossentropy(
    #        probs,
    #        y
    #    ))

    params = self.params = []
    for l in layers + [output_layer]:
        for p in l.params:
            params.append(p)

    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost += T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg

    self.l2_cost = l2_cost
    self.cost = loss + l2_cost
    print "cost.dtype", self.cost.dtype
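
# --- illustrative usage, not part of the original code ---
# A minimal sketch of how the graph built by ready() might be compiled into
# Theano functions.  It assumes the same module context as above (theano,
# theano.tensor as T, numpy as np), that `model` is an instance on which
# ready() has been called, and that train_x / train_y are float32 / int32
# shared variables holding the training set; batch_size and the plain-SGD
# learning rate are made-up values.
learning_rate = np.float32(0.01)
batch_size = 32
gparams = T.grad(model.cost, model.params)
updates = [(p, p - learning_rate * g) for p, g in zip(model.params, gparams)]

train_batch = theano.function(
    inputs=[model.index],
    outputs=[model.cost, model.err],
    updates=updates,
    givens={
        model.x: train_x[model.index * batch_size:(model.index + 1) * batch_size],
        model.y: train_y[model.index * batch_size:(model.index + 1) * batch_size],
    },
)

# model.dropout is a shared scalar, so it can be zeroed for evaluation
# and restored afterwards.
model.dropout.set_value(np.float32(0.0))
eval_err = theano.function(inputs=[model.x, model.y], outputs=model.err)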
def ready(self, args, train):
    # len * batch
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    self.init_state = T.matrix(dtype=theano.config.floatX)

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)

    self.n_d = args["hidden_dim"]

    embedding_layer = EmbeddingLayer(
        n_d=self.n_d,
        vocab=set(w for w in train)
    )
    self.n_V = embedding_layer.n_V

    say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

    activation = get_activation_by_name(args["activation"])

    rnn_layer = LSTM(
        n_in=self.n_d,
        n_out=self.n_d,
        activation=activation
    )

    output_layer = Layer(
        n_in=self.n_d,
        n_out=self.n_V,
        activation=T.nnet.softmax,
    )

    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())

    # len * batch * n_d
    x = apply_dropout(x_flat, self.dropout)
    x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

    # len * batch * (n_d+n_d)
    h = rnn_layer.forward_all(x, self.init_state, return_c=True)

    self.last_state = h[-1]
    h = h[:, :, self.n_d:]
    h = apply_dropout(h, self.dropout)

    self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

    idys = self.idys.ravel()
    self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
    #self.nll = T.nnet.categorical_crossentropy(
    #        self.p_y_given_x,
    #        idys
    #    )

    self.layers = [embedding_layer, rnn_layer, output_layer]
    #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
    self.params = embedding_layer.params + rnn_layer.params + output_layer.params
    self.num_params = sum(len(x.get_value(borrow=True).ravel())
                          for l in self.layers for x in l.params)
    say("# of params in total: {}\n".format(self.num_params))
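
# --- illustrative usage, not part of the original code ---
# A rough sketch of truncated BPTT with the graph above: model.nll is a
# per-token vector, and model.last_state (the final c/h slice returned by
# forward_all) is fed back as init_state for the next chunk.  `model`,
# batch_size, unroll_size and the SGD learning rate are illustrative only,
# and the chunks below are dummy all-zero index arrays.
cost = T.mean(model.nll)
gparams = T.grad(cost, model.params)
updates = [(p, p - np.float32(0.1) * g) for p, g in zip(model.params, gparams)]

train_step = theano.function(
    inputs=[model.idxs, model.idys, model.init_state],
    outputs=[cost, model.last_state],
    updates=updates,
)

batch_size, unroll_size = 16, 35
x_chunk = np.zeros((unroll_size, batch_size), dtype="int32")
y_chunk = np.zeros((unroll_size, batch_size), dtype="int32")
state = np.zeros((batch_size, model.n_d * 2), dtype=theano.config.floatX)

loss, state = train_step(x_chunk, y_chunk, state)   # `state` seeds the next chunk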
def build_model(self):
    args = self.args
    weights = self.weights

    meta_emb = self.meta_emb = self.embs[0]
    golden_embs = self.embs[1:]
    n_m_d = meta_emb.n_d

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout_rate).astype(theano.config.floatX))

    batch_ids = self.batch_ids = T.ivector('batch_d_char')
    batch_masks = self.batch_masks = T.fmatrix('batch_d_char_mask')

    layers = self.layers = [meta_emb]

    slices_embs = meta_emb.forward(batch_ids.ravel())
    slices_embs = slices_embs.reshape((batch_ids.shape[0], n_m_d))
    prev_output = apply_dropout(slices_embs, dropout, v2=True)

    self.all_loss = 0.0
    for i in range(len(weights)):
        mask, weight, golden_emb = batch_masks[i], weights[i], golden_embs[i]
        n_o_d = golden_emb.n_d

        layer = Layer(n_m_d, n_o_d, linear)
        layers.append(layer)

        mapped_output = layer.forward(prev_output)
        slices_embs = golden_emb.forward(batch_ids.ravel())
        slices_embs = slices_embs.reshape((batch_ids.shape[0], n_o_d))

        self.all_loss += weight * T.sum(
            T.sum((mapped_output - slices_embs) *
                  (mapped_output - slices_embs), axis=1) * mask
        ) / (1e-8 + T.sum(mask))

    for i, l in enumerate(layers[1:]):
        say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

    self.l2_sqr = None
    self.params = []
    for layer in layers:
        self.params += layer.params
    for p in self.params:
        if self.l2_sqr is None:
            self.l2_sqr = args.l2_reg * T.sum(p**2)
        else:
            self.l2_sqr += args.l2_reg * T.sum(p**2)
    self.all_loss += self.l2_sqr

    n_params = sum(len(x.get_value(borrow=True).ravel())
                   for x in self.params)
    say("total # parameters: {}\n".format(n_params))
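
# --- illustrative check, not part of the original code ---
# Each term added to all_loss above is a mask-weighted mean of squared L2
# distances between the linearly mapped meta-embedding and one "golden"
# embedding.  A plain-numpy version of that quantity for a single golden
# embedding (all values below are made up) would be:
mapped = np.random.randn(5, 3).astype("float32")    # mapped_output: batch x n_o_d
golden = np.random.randn(5, 3).astype("float32")    # golden slices:  batch x n_o_d
mask = np.array([1, 1, 0, 1, 0], dtype="float32")   # 1 = word present in this embedding
weight = 0.5                                        # per-embedding loss weight

sq_dist = ((mapped - golden) ** 2).sum(axis=1)      # per-word squared distance
loss_i = weight * (sq_dist * mask).sum() / (1e-8 + mask.sum())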
class HighwayLayer(object):
    def __init__(self, n_d):
        self.n_d = n_d
        self.gate = Layer(n_d, n_d, sigmoid)

    def forward(self, x, h):
        t = self.gate.forward(x)
        return h * t + x * (1 - t)

    @property
    def params(self):
        return self.gate.params

    @params.setter
    def params(self, param_list):
        self.gate.params = param_list
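
# --- illustrative example, not part of the original code ---
# HighwayLayer.forward interpolates per dimension between the transformed
# input h and the raw input x, using a sigmoid gate t computed from x.
# A plain-numpy version with a stand-in gate (the real gate applies a
# learned Layer before the sigmoid):
x = np.array([1.0, 2.0, 3.0], dtype="float32")   # raw layer input
h = np.array([0.5, -1.0, 4.0], dtype="float32")  # transformed input
t = 1.0 / (1.0 + np.exp(-x))                     # stand-in for self.gate.forward(x)
out = h * t + x * (1.0 - t)                      # t -> 1 keeps h, t -> 0 copies x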
def ready(self, args, train):
    # len * batch
    depth = args["depth"]
    self.args = args
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    self.init_state = [T.matrix(dtype=theano.config.floatX)
                       for i in xrange(depth * 2)]

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)
    rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(theano.config.floatX)
    self.rnn_dropout = theano.shared(rnn_dropout_prob)

    self.n_d = args["hidden_dim"]

    embedding_layer = EmbeddingLayer(
        n_d=self.n_d,
        vocab=set(w for w in train)
    )
    self.n_V = embedding_layer.n_V

    say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

    activation = get_activation_by_name(args["activation"])

    layers = self.layers = []
    for i in xrange(depth):
        rnn_layer = KernelNN(
            n_in=self.n_d,
            n_out=self.n_d,
            activation=activation,
            highway=args["highway"],
            dropout=self.rnn_dropout
        )
        layers.append(rnn_layer)

    output_layer = Layer(
        n_in=self.n_d,
        n_out=self.n_V,
        activation=T.nnet.softmax,
    )
    # tie the output projection to the input embeddings
    output_layer.W = embedding_layer.embeddings.T

    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())

    # len * batch * n_d
    x = apply_dropout(x_flat, self.dropout)
    #x = x_flat
    x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

    # len * batch * (n_d+n_d)
    self.last_state = []
    prev_h = x
    for i in xrange(depth):
        hidden = self.init_state[i * 2:i * 2 + 2]
        c, h = layers[i].forward_all(prev_h, hidden, return_c=True)
        self.last_state += [c[-1], h[-1]]
        prev_h = h

    prev_h = apply_dropout(prev_h, self.dropout)
    self.p_y_given_x = output_layer.forward(prev_h.reshape(x_flat.shape))

    idys = self.idys.ravel()
    self.nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idys)

    self.params = [x for l in layers for x in l.params]
    self.params += [embedding_layer.embeddings, output_layer.b]
    self.num_params = sum(len(x.get_value(borrow=True).ravel())
                          for x in self.params)
    say("# of params in total: {}\n".format(self.num_params))
    layers += [embedding_layer, output_layer]
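
# --- illustrative usage, not part of the original code ---
# ready() keeps one (c, h) pair of init-state matrices per layer, so a
# truncated-BPTT loop would carry a list of 2*depth matrices and feed the
# returned last_state back in.  `model`, batch_size, unroll_size and the
# SGD learning rate are illustrative; the chunks are dummy all-zero arrays.
cost = T.mean(model.nll)
gparams = T.grad(cost, model.params)
updates = [(p, p - np.float32(0.1) * g) for p, g in zip(model.params, gparams)]

step = theano.function(
    inputs=[model.idxs, model.idys] + model.init_state,
    outputs=[cost] + model.last_state,
    updates=updates,
)

batch_size, unroll_size = 16, 35
x_chunk = np.zeros((unroll_size, batch_size), dtype="int32")
y_chunk = np.zeros((unroll_size, batch_size), dtype="int32")
zeros = np.zeros((batch_size, model.n_d), dtype=theano.config.floatX)
state = [zeros] * (model.args["depth"] * 2)          # [c_0, h_0, c_1, h_1, ...]

out = step(x_chunk, y_chunk, *state)
cost_value, state = out[0], out[1:]                  # feed `state` into the next chunk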