def build_model(tparams, options):
    """Build the symbolic training graph of the conditioned sentence decoder.

    Declares the word indices `x`, the padding `mask` and the float
    matrices `z` and `y`, decodes the right-shifted word embeddings with
    `encoder_layer`, and computes the negative log-likelihood of the words
    whose mask equals 1, divided by the number of sentences.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'C0', 'Vhid' and
            'bhid' are read here and the mapping is passed to `encoder_layer`.
        options: mapping of settings; only 'SEED' is read here.

    Returns:
        Tuple `(use_noise, x, mask, y, z, cost)`. The returned `y` and `z`
        are the variables obtained after `dropout` was applied to them.
    """
    trng = RandomStreams(options['SEED'])
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Symbolic inputs (shape notes carried over from the original comments).
    x = tensor.matrix('x', dtype='int64')              # n_steps * n_samples
    mask = tensor.matrix('mask', dtype=config.floatX)  # zero on padding
    z = tensor.matrix('z', dtype=config.floatX)        # n_samples * n_z
    y = tensor.matrix('y', dtype=config.floatX)

    z = dropout(z, trng, use_noise)
    y = dropout(y, trng, use_noise)

    n_steps, n_samples = x.shape[0], x.shape[1]
    n_x = tparams['Wemb'].shape[1]  # word-embedding dimension

    word_vectors = tparams['Wemb'][x.flatten()]
    emb = dropout(word_vectors.reshape([n_steps, n_samples, n_x]),
                  trng, use_noise)

    # Shift the decoder input one step to the right: step 0 receives the
    # projection of z, step t receives the embedding of word t-1.
    z0 = tensor.dot(z, tparams['C0']).dimshuffle('x', 0, 1)
    emb_input = tensor.concatenate((z0, emb[:n_steps - 1]))

    # The mask is shifted the same way, with its first row repeated.
    mask0 = mask[0].dimshuffle('x', 0)
    mask_input = tensor.concatenate((mask0, mask[:n_steps - 1]))

    h_decoder = encoder_layer(tparams, emb_input, mask_input, y,
                              seq_output=True)
    h_decoder = dropout(h_decoder, trng, use_noise)

    shape = h_decoder.shape
    n_tokens = shape[0] * shape[1]
    h_flat = h_decoder.reshape((n_tokens, shape[2]))

    # Output projection multiplied with the transposed embedding matrix.
    Vhid = tensor.dot(tparams['Vhid'], tparams['Wemb'].T)
    pred = tensor.nnet.softmax(tensor.dot(h_flat, Vhid) + tparams['bhid'])

    # Probability given to each target word; keep positions with mask == 1.
    pred_word = pred[tensor.arange(n_tokens), x.reshape((n_tokens,))]
    kept = theano.tensor.eq(mask.reshape((n_tokens,)), 1.).nonzero()[0]
    pred_word = pred_word[kept]

    # Cross-entropy loss; 1e-6 keeps the logarithm finite.
    total_nll = -tensor.log(pred_word + 1e-6).sum()
    cost = total_nll / n_samples
    return use_noise, x, mask, y, z, cost
def build_model(tparams, options):
    """Build the symbolic graph of a one-hidden-layer sigmoid classifier.

    The input `z` goes through dropout, a tanh layer ('Wy1', 'by1'),
    dropout again and a sigmoid output layer ('Wy2', 'by2'). The cost is
    the element-wise binary cross-entropy against `y`, summed and divided
    by `z.shape[0]`.

    Args:
        tparams: mapping of shared parameters ('Wy1', 'by1', 'Wy2', 'by2').
        options: mapping of settings; only 'SEED' is read here.

    Returns:
        Tuple `(use_noise, z, y, cost, f_pred)`, where `f_pred` is the
        compiled function mapping `z` to the sigmoid outputs. `z` is the
        variable obtained after `dropout` was applied.
    """
    trng = RandomStreams(options['SEED'])
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    z = tensor.matrix('z', dtype=config.floatX)  # n_samples * n_z
    y = tensor.matrix('y', dtype=config.floatX)  # n_samples * n_y

    z = dropout(z, trng, use_noise)
    hidden = tensor.tanh(tensor.dot(z, tparams['Wy1']) + tparams['by1'])
    hidden = dropout(hidden, trng, use_noise)

    # n_samples * n_y
    pred = tensor.nnet.sigmoid(
        tensor.dot(hidden, tparams['Wy2']) + tparams['by2'])
    f_pred = theano.function([z], pred, name='f_pred')

    # Binary cross-entropy; 1e-6 keeps both logarithms finite.
    positive_term = -y * tensor.log(pred + 1e-6)
    negative_term = (1. - y) * tensor.log(1. - pred + 1e-6)
    cost = (positive_term - negative_term).sum() / z.shape[0]
    return use_noise, z, y, cost, f_pred
def forward_alexnet(self, inp, weights, reuse=False):
    """Run the AlexNet-style forward pass with externally supplied weights.

    Args:
        inp: input tensor fed to the first convolution.
        weights: mapping holding '<layer>_weights' / '<layer>_biases'
            entries for conv1..conv5 and fc6..fc8.
        reuse: forwarded to every `conv_block` call (per the original
            comment it concerns the normalization parameters).

    Returns:
        Tuple `(fc7, fc8)`: the 7th-layer activations (before its dropout)
        and the 8th-layer outputs, which get no activation argument.
    """
    def conv(layer_in, w_key, b_key, scope, stride, groups):
        # All convolutions share the same call shape; only these vary.
        return conv_block(layer_in, weights[w_key], weights[b_key],
                          stride_y=stride, stride_x=stride, groups=groups,
                          reuse=reuse, scope=scope)

    # Layer 1: conv -> LRN -> pool
    net = conv(inp, 'conv1_weights', 'conv1_biases', 'conv1',
               stride=4, groups=1)
    net = lrn(net, 2, 1e-05, 0.75)
    net = max_pool(net, 3, 3, 2, 2, padding='VALID')

    # Layer 2: conv (2 groups) -> LRN -> pool
    net = conv(net, 'conv2_weights', 'conv2_biases', 'conv2',
               stride=1, groups=2)
    net = lrn(net, 2, 1e-05, 0.75)
    net = max_pool(net, 3, 3, 2, 2, padding='VALID')

    # Layers 3-5: conv, conv (2 groups), conv (2 groups) -> pool
    net = conv(net, 'conv3_weights', 'conv3_biases', 'conv3',
               stride=1, groups=1)
    net = conv(net, 'conv4_weights', 'conv4_biases', 'conv4',
               stride=1, groups=2)
    net = conv(net, 'conv5_weights', 'conv5_biases', 'conv5',
               stride=1, groups=2)
    net = max_pool(net, 3, 3, 2, 2, padding='VALID')

    # Layer 6: flatten -> FC (relu) -> dropout
    net = tf.reshape(net, [-1, 6 * 6 * 256])
    net = fc(net, weights['fc6_weights'], weights['fc6_biases'],
             activation='relu')
    net = dropout(net, self.KEEP_PROB)

    # Layer 7: FC (relu); its pre-dropout output is part of the result.
    fc7 = fc(net, weights['fc7_weights'], weights['fc7_biases'],
             activation='relu')

    # Layer 8: dropout -> FC without an activation argument.
    fc8 = fc(dropout(fc7, self.KEEP_PROB),
             weights['fc8_weights'], weights['fc8_biases'])
    return fc7, fc8
def graph(self, input, is_training):
    """Build the stacked-hourglass graph followed by two upsampling heads.

    A convolutional stem feeds `self.nStacks` hourglass stages. Each stage
    produces an `out` map with `self.num_points` channels, and a projection
    back to `self.nFeats` channels is added to the running feature sum.
    The final sum is passed through two deconvolution heads.

    Args:
        input: input tensor fed to the first convolution.
        is_training: forwarded to the bottleneck, dropout and
            batch-normalization helpers.

    Returns:
        Tuple `(stacked_outputs, up1, up2)` where `stacked_outputs` stacks
        the per-stage `out` tensors along axis 1.
    """
    # NOTE(review): the nesting of the name scopes below was reconstructed
    # from the statement order - confirm against the intended graph layout.
    with tf.name_scope('model'):
        # Stem: conv -> bottleneck -> pool -> two bottlenecks.
        stem = ut.conv_layer(input, 64, 7, 2, name='conv1')
        stem = ut.bottleneck(stem, 128, stride=1, training=is_training,
                             name='res1')
        stem = ut.max_pool(stem, 2, 2, 'max_pool')
        stem = ut.bottleneck(stem, int(self.nFeats / 2), stride=1,
                             training=is_training, name='res2')
        stem = ut.bottleneck(stem, self.nFeats, stride=1,
                             training=is_training, name='res3')

        with tf.name_scope('stacks'):
            stack_out = []

            with tf.name_scope('stage_0'):
                hg = ut.hourglass(stem, self.nLow, self.nFeats, 'hourglass')
                dropped = ut.dropout(hg, self.dropout_rate, is_training,
                                     'dropout')
                ll = ut.conv_layer_bn(dropped, self.nFeats, 1, 1, is_training)
                out = ut.conv_layer(ll, self.num_points, 1, 1, name='out')
                # The first stage projects `out` back to nFeats channels.
                out_ = ut.conv_layer(out, self.nFeats, 1, 1, name='out_')
                sum_ = tf.add(stem, out_, name='merge')
                stack_out.append(out)

            for stage in range(1, self.nStacks):
                with tf.name_scope('stage_' + str(stage)):
                    hg = ut.hourglass(sum_, self.nLow, self.nFeats,
                                      'hourglass')
                    dropped = ut.dropout(hg, self.dropout_rate, is_training,
                                         'dropout')
                    ll = ut.conv_layer_bn(dropped, self.nFeats, 1, 1,
                                          is_training)
                    out = ut.conv_layer(ll, self.num_points, 1, 1,
                                        name='out')
                    # Later stages project `ll` (not `out`).
                    out_ = ut.conv_layer(ll, self.nFeats, 1, 1, name='out_')
                    sum_ = tf.add(sum_, out_, name='merge')
                    stack_out.append(out)

        with tf.name_scope('upsampling'):
            feat = ut.batch_norm(sum_, is_training)
            feat = ut.conv_layer_bn(feat, self.nFeats, 3, 1, is_training)
            up1 = ut.deconv_layer(feat, self.num_points, 1, 2, name='up_1')
            feat = ut.conv_layer_bn(up1, self.nFeats, 3, 1, is_training)
            up2 = ut.deconv_layer(feat, self.num_points, 1, 2, name='up_2')

        return tf.stack(stack_out, axis=1, name='stack_out'), up1, up2
def build_model(tparams, options):
    """Build the symbolic graph of the sentence decoder conditioned on `z`.

    Embeds the word indices `x`, decodes them with `decoder_layer` given
    `z` and `mask`, and computes the negative log-likelihood of the words
    whose mask equals 1, divided by the number of sentences.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'Vhid' and 'bhid'
            are read here and the mapping is passed to `decoder_layer`.
        options: not read inside this function.

    Returns:
        Tuple `(use_noise, x, mask, z, f_pred_prob, cost)`. `f_pred_prob`
        is the compiled function returning the kept word probabilities;
        `z` is the variable obtained after `dropout` was applied.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Symbolic inputs (shape notes carried over from the original comments).
    x = tensor.matrix('x', dtype='int64')              # n_steps * n_samples
    mask = tensor.matrix('mask', dtype=config.floatX)  # zero on padding
    z = tensor.matrix('z', dtype=config.floatX)        # n_z * n_samples
    z = dropout(z, trng, use_noise)

    n_steps, n_samples = x.shape[0], x.shape[1]
    n_x = tparams['Wemb'].shape[1]  # word-embedding dimension

    word_vectors = tparams['Wemb'][x.flatten()]
    emb = dropout(word_vectors.reshape([n_steps, n_samples, n_x]),
                  trng, use_noise)

    # Decode the sentence vector z back into the original sentence.
    h_decoder = decoder_layer(tparams, emb, z, mask=mask)
    h_decoder = dropout(h_decoder, trng, use_noise)

    shape = h_decoder.shape
    n_tokens = shape[0] * shape[1]
    h_flat = h_decoder.reshape((n_tokens, shape[2]))

    # Output projection multiplied with the transposed embedding matrix.
    Vhid = tensor.dot(tparams['Vhid'], tparams['Wemb'].T)
    pred = tensor.nnet.softmax(tensor.dot(h_flat, Vhid) + tparams['bhid'])

    # Probability given to each target word; keep positions with mask == 1.
    pred_word = pred[tensor.arange(n_tokens), x.reshape((n_tokens,))]
    kept = theano.tensor.eq(mask.reshape((n_tokens,)), 1.).nonzero()[0]
    pred_word = pred_word[kept]

    # Cross-entropy loss; 1e-6 keeps the logarithm finite.
    total_nll = -tensor.log(pred_word + 1e-6).sum()
    cost = total_nll / n_samples

    f_pred_prob = theano.function([x, mask, z], pred_word,
                                  name='f_pred_prob')
    return use_noise, x, mask, z, f_pred_prob, cost
def backpropagate(self, X, Y, cost_function, hidden_layer_dropout=0.1,
                  input_layer_dropout=0.1):
    """Run one backward pass over a batch and update `self.weights` in place.

    A traced forward pass supplies per-layer outputs and derivatives. The
    layers are then visited from last to first: the layer input goes
    through `dropout`, the weight/bias delta is assembled, the error is
    propagated to the previous layer, and the delta returned by
    `self.backpropagation_type` is added to the layer's weights.

    Args:
        X: batch of inputs, shape `(self.batch_size, self.input_dim)`.
        Y: batch of targets, shape `(self.batch_size, self.output_dim)`.
        cost_function: callable invoked as
            `cost_function(output, Y.T, derivative=True)`.
        hidden_layer_dropout: value passed to `dropout` for layers k > 0.
        input_layer_dropout: value passed to `dropout` for layer 0.

    Raises:
        AssertionError: if X, Y or a propagated error has the wrong shape.
    """
    assert X.shape == (self.batch_size, self.input_dim), \
        '[Error] X shape is wrong'
    assert Y.shape == (self.batch_size, self.output_dim), \
        '[Error] Y shape is wrong'

    outputs, derivatives = self.forward(X, trace=True)
    # Error signal at the network output.
    da = cost_function(outputs[-1], Y.T, derivative=True)

    for k in reversed(range(len(self.layers))):
        if k > 0:
            rate = hidden_layer_dropout
        else:
            rate = input_layer_dropout
        outputs[k] = dropout(outputs[k], rate)

        n_units = self.layers[k][0]
        assert da.shape == (n_units, self.batch_size), \
            '[Error] da shape is wrong'

        # Batch-averaged deltas; the bias column is appended last.
        weight_delta = np.dot(da, outputs[k].T) / float(self.batch_size)
        bias_delta = np.sum(da, axis=1) / float(self.batch_size)
        dW = np.hstack((weight_delta, bias_delta.reshape(n_units, 1)))

        if k > 0:
            # Propagate through the weights, excluding the bias column.
            back = np.dot(self.weights[k][:, :-1].T, da)
            dh = np.sum(back, axis=1) / float(self.batch_size)
            dh = dh.reshape(self.layers[k - 1][0], 1)
            da = dh * derivatives[k - 1]

        dW = self.backpropagation_type(k, dW)
        self.weights[k] += dW
def call(self, inputs):
    """Compute the language-model output and both losses for one batch.

    Args:
        inputs: indexable holding, in order, tokens, masks1, masks2,
            clf_ids and labels. Shapes as stated by the original docs:
            tokens  (number of choices * batch size, context length, 3)
            masks1  (number of choices * batch size, context length)
            masks2  (number of choices * batch size, context length)
            clf_ids (number of choices * batch size)
            labels  (number of choices * batch size)

    Returns:
        Tuple `(lm_logits, lm_loss, clf_loss)` as produced by `self.lm`
        and `self.clf`.
    """
    tokens = inputs[0]
    masks1 = inputs[1]
    masks2 = inputs[2]
    clf_ids = inputs[3]
    labels = inputs[4]

    embedding = self.embed(tokens)
    # The embedding matrix attribute is replaced by its dropout output.
    self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)
    hidden = self.transform(embedding)

    lm_logits, lm_loss = self.lm(hidden, tokens, masks1, masks2)
    clf_loss = self.clf(hidden, clf_ids, labels)
    return lm_logits, lm_loss, clf_loss
def call(self, inputs):
    """Compute language-model logits and the mean cross-entropy loss.

    Hidden states selected by `masks2` are multiplied with the first
    `n_vocab + n_special` rows of the embedding matrix, and compared
    against smoothed one-hot labels built from the token IDs selected by
    `masks1`.

    Args:
        inputs: indexable holding tokens, masks1 and masks2 in that order.
            Per the original docs, tokens is (batch size, context length,
            3) and the masks are (batch size, context length).

    Returns:
        Tuple `(logits, loss)`: logits reshaped to
        `[-1, n_vocab + n_special]` and the scalar mean loss.
    """
    tokens = inputs[0]
    masks1 = inputs[1]
    masks2 = inputs[2]
    n_out = self.n_vocab + self.n_special

    embedding = self.embed(tokens)
    # The embedding matrix attribute is replaced by its dropout output.
    self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)

    hidden = self.transform(embedding)
    hidden = tf.reshape(tf.boolean_mask(hidden, masks2), [-1, self.n_embd])

    # Channel 0 of `tokens` is used as the target IDs.
    target_ids = tf.reshape(tf.boolean_mask(tokens[:, :, 0], masks1), [-1])

    projection = tf.matmul(hidden, self.embed.we[:n_out, :],
                           transpose_b=True)
    logits = tf.reshape(projection, [-1, n_out])

    eps = 1e-100
    on_value = 1 - (self.n_vocab - 1) * eps
    labels = tf.one_hot(target_ids, n_out, on_value, eps)

    losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=labels)
    return logits, tf.reduce_mean(losses)
def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02,
                       hidden_dropout_prob=0.1):
    """Apply a pre-normalized two-layer MLP with a residual connection.

    The input is layer-normalized, expanded to `intermediate_size` with a
    gelu dense layer, projected back to the input width, passed through
    dropout, added to the un-normalized input and layer-normalized again.
    Per the original note, the original GPT would add the normalized input
    (`x_norm`) in the residual instead of `x_flat`.

    :param x_flat: rank-2 tensor; per the original docs the attention
        output of shape [batch_size*seq_length, dim].
    :param intermediate_size: width of the hidden projection (the original
        docs mention input_dim * 4 as the usual choice).
    :param initializer_range: passed to `create_initializer` for both
        dense kernels.
    :param hidden_dropout_prob: passed to `dropout` on the MLP output.
    :return: the layer-normalized residual sum.
    """
    # Only the last dimension is needed; the call also checks the rank.
    _, hidden_size = get_shape_list(x_flat, expected_rank=2)

    normed_input = layer_norm(x_flat, name='mlp_ln0')

    expanded = tf.layers.dense(
        normed_input,
        intermediate_size,
        activation=gelu,
        kernel_initializer=create_initializer(initializer_range),
        name='intermediate',
    )
    projected = tf.layers.dense(
        expanded,
        hidden_size,
        name='output',
        kernel_initializer=create_initializer(initializer_range),
    )
    projected = dropout(projected, hidden_dropout_prob)

    return layer_norm(x_flat + projected, name='mlp_ln1')
def call(self, inputs):
    """Compute next-token logits and the mean cross-entropy loss.

    The mask is shifted so that hidden states at positions 0..n_ctx-2 are
    paired with the tokens at positions 1..n_ctx-1.

    Args:
        inputs: indexable holding tokens and masks in that order; they are
            reshaped to `[-1, n_ctx, 2]` and `(-1, n_ctx)` respectively.

    Returns:
        Tuple `(logits, losses)`: logits reshaped to `[-1, n_vocab]` and
        the mean sparse softmax cross-entropy.
    """
    tokens = tf.reshape(inputs[0], [-1, self.n_ctx, 2])
    masks = tf.reshape(inputs[1], (-1, self.n_ctx))

    # Drop the first mask column, then pad on the right (selects the
    # predicting positions) or on the left (selects the target positions).
    shifted = tf.slice(masks, [0, 1], [-1, self.n_ctx - 1])
    predict_mask = tf.pad(shifted, [[0, 0], [0, 1]])
    target_mask = tf.pad(shifted, [[0, 0], [1, 0]])

    embedding = self.embed(tokens)
    # The embedding matrix attribute is replaced by its dropout output.
    self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)

    hidden = self.transform(embedding)
    hidden = tf.reshape(hidden, [-1, self.n_ctx, self.n_embd])
    hidden = tf.reshape(tf.boolean_mask(hidden, predict_mask),
                        [-1, self.n_embd])

    # Channel 0 of `tokens` is used as the target IDs.
    target_ids = tf.reshape(tf.boolean_mask(tokens[:, :, 0], target_mask),
                            [-1])

    projection = tf.matmul(hidden, self.embed.we[:self.n_vocab, :],
                           transpose_b=True)
    logits = tf.reshape(projection, [-1, self.n_vocab])

    per_token = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=target_ids)
    losses = tf.reduce_mean(per_token)
    return logits, losses
def prediction_save_cache(self, x):
    """Run the fully-connected net forward and keep every layer's cache.

    Each of the `self.n_hidden` hidden layers applies affine -> ReLU ->
    dropout; the output layer is a single affine transform.

    Input:
        x: input data; per the original docs a numpy array of shape
           (N, D) with one sample per row.

    Return:
        output: result of the final affine layer (documented as (N, M)).
        caches: dict of intermediate values for backprop, keyed
                "affine<i>", "relu<i>", "dropout<i>" and "affine_out".
    """
    caches = {}
    activation = x  # input to the next layer

    for layer in range(self.n_hidden):
        idx = str(layer)
        weight = self.params["w" + idx]
        bias = self.params["b" + idx]

        activation, affine_cache = affine(activation, weight, bias)
        caches["affine" + idx] = affine_cache

        activation, relu_cache = relu(activation)
        caches["relu" + idx] = relu_cache

        # Train-time dropout.
        activation, dropout_cache = dropout(activation, self.dropout)
        caches["dropout" + idx] = dropout_cache

    # Output layer: affine only.
    output, out_cache = affine(activation, self.params["w_out"],
                               self.params["b_out"])
    caches["affine_out"] = out_cache
    return output, caches
def _process_layers(self, weights, data, learning=True):
    """Push `data` through every layer yielded by `_generate_layers`.

    Each layer normalizes with its (mean, std), applies `relu`, optionally
    applies `dropout`, and multiplies by the layer matrix.

    Args:
        weights: passed unchanged to `self._generate_layers`.
        data: input to the first layer.
        learning: dropout is applied only when this is truthy and
            `self.dropout` is not None.

    Returns:
        The output of the last layer (the input if there are no layers).
    """
    activations = data
    for layer_matrix, mean, std in self._generate_layers(weights):
        activations = relu(normalize(activations, mean, std))
        use_dropout = learning and self.dropout is not None
        if use_dropout:
            activations = dropout(activations, self.dropout)
        activations = np.dot(activations, layer_matrix)
    return activations
def build_model(tparams, options):
    """Build the symbolic graph of the CNN sentence classifier.

    Word indices are embedded, passed through one `encoder` per configured
    filter, and the concatenated features feed a softmax layer
    ('Wy', 'by'). The cost is the mean negative log-probability of the
    labels `y`.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'Wy' and 'by' are
            read here and the mapping is passed to `encoder`.
        options: mapping of settings; 'filter_hs', 'filter_shapes' and
            'pool_sizes' are read here.

    Returns:
        Tuple `(use_noise, x, y, f_pred_prob, f_pred, cost)` where
        `f_pred_prob` and `f_pred` are compiled functions of `x` returning
        the class probabilities and the arg-max class.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Word indices. The first axis is paired with the label vector below
    # (`pred[arange(x.shape[0]), y]`), so rows index samples here.
    x = tensor.matrix('x', dtype='int32')
    # One label per sample.
    y = tensor.vector('y', dtype='int32')

    # Embed and reshape to (x.shape[0], 1, x.shape[1], embedding_dim).
    layer0_input = tparams['Wemb'][
        tensor.cast(x.flatten(), dtype='int32')
    ].reshape((x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))
    layer0_input = dropout(layer0_input, trng, use_noise)

    # One encoder per filter. `range` replaces the Python-2-only `xrange`
    # so the function also runs on Python 3.
    layer1_inputs = []
    for i in range(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_inputs.append(conv_layer)

    layer1_input = tensor.concatenate(layer1_inputs, 1)
    layer1_input = dropout(layer1_input, trng, use_noise)

    # Class probabilities.
    pred = tensor.nnet.softmax(
        tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # Cross-entropy loss; 1e-6 keeps the logarithm finite.
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()

    return use_noise, x, y, f_pred_prob, f_pred, cost
def build_model(tparams, options):
    """Build the symbolic graph of the contrastive similarity model.

    `x`, `y` and the contrast inputs `cy` share a two-layer tanh network
    ('W1', 'b1', 'W2', 'b2'), followed by dropout and `l2norm`. The cost
    is `log(1 + sum(exp(-gamma * (x.y - x.cy))))`, where the dot products
    are taken row-wise after tiling `x` and `y` `options['ncon']` times.

    Args:
        tparams: mapping of shared parameters ('W1', 'b1', 'W2', 'b2').
        options: mapping of settings; 'ncon' and 'gamma' are read here.

    Returns:
        Tuple `(use_noise, [x, y, cy], cost)`.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Shape notes carried over from the original comments.
    x = tensor.matrix('x', dtype='int32')    # n_samples * n_chars
    y = tensor.matrix('y', dtype='int32')    # n_samples * n_chars
    cy = tensor.matrix('cy', dtype='int32')  # (ncon*n_samples) * n_chars

    def two_layer_tanh(inp):
        # Shared feature extractor: two stacked tanh layers.
        first = tensor.tanh(tensor.dot(inp, tparams['W1']) + tparams['b1'])
        return tensor.tanh(tensor.dot(first, tparams['W2']) + tparams['b2'])

    feats_x = two_layer_tanh(x)
    feats_y = two_layer_tanh(y)
    feats_cy = two_layer_tanh(cy)

    # Dropout is applied in the order x, y, cy.
    feats_x = dropout(feats_x, trng, use_noise)
    feats_y = dropout(feats_y, trng, use_noise)
    feats_cy = dropout(feats_cy, trng, use_noise)

    feats_x = l2norm(feats_x)
    feats_y = l2norm(feats_y)
    feats_cy = l2norm(feats_cy)

    # Tile by the number of contrast terms so the rows line up with cy.
    n_contrast = options['ncon']
    feats_x = tensor.tile(feats_x, (n_contrast, 1))
    feats_y = tensor.tile(feats_y, (n_contrast, 1))

    positive_score = (feats_x * feats_y).sum(axis=1)
    contrast_score = (feats_x * feats_cy).sum(axis=1)
    margin = positive_score - contrast_score
    cost = tensor.log(1 + tensor.sum(tensor.exp(-options['gamma'] * margin)))

    return use_noise, [x, y, cy], cost
def _attn(self, q, k, v):
    """Compute masked softmax attention of `q` over `k`, applied to `v`.

    Args:
        q: query tensor, multiplied with `k` as `tf.matmul(q, k)`.
        k: key tensor, expected in a layout that makes that product valid.
        v: value tensor; its last dimension sets the scaling factor.

    Returns:
        The attention-weighted values `tf.matmul(weights, v)`.
    """
    scores = tf.matmul(q, k)
    if self.scale:
        # Scale by 1/sqrt(size of the last dimension of v).
        n_state = shape_list(v)[-1]
        scores = scores * tf.rsqrt(tf.cast(n_state, tf.float32))

    scores = self.mask_attn_weights(scores)
    weights = tf.nn.softmax(scores)
    weights = dropout(weights, self.attn_pdrop, self.train)
    return tf.matmul(weights, v)
def clf(self, hidden, clf_ids, labels):
    """Compute the mean two-way classification loss.

    Args:
        hidden: hidden states from which classifier positions are gathered.
        clf_ids: indices passed to `tf.gather_nd` to select those positions.
        labels: integer labels, one-hot encoded with depth 2.

    Returns:
        Scalar mean softmax cross-entropy.
    """
    gathered = tf.gather_nd(hidden, clf_ids)
    clf_hidden = tf.reshape(gathered, [-1, self.n_embd])

    logits = self.classifier(clf_hidden)
    logits = dropout(logits, self.clf_pdrop, self.train)
    logits = tf.reshape(logits, [-1, 2])

    eps = 1e-100
    one_hot_labels = tf.one_hot(labels, 2, 1 - eps, eps)

    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=one_hot_labels)
    return tf.reduce_mean(per_example)
def build_multi_dynamic_brnn(args, maxTimeSteps, inputX, cell_fn, seqLengths,
                             time_major=True):
    """Stack `args.num_layer` bidirectional dynamic RNN layers.

    In every layer the forward and backward outputs are summed and passed
    through `dropout`; the result feeds the next layer. The last layer's
    output is split into `maxTimeSteps` pieces.

    Args:
        args: settings object; reads num_layer, num_hidden, activation,
            keep_prob, mode and batch_size.
        maxTimeSteps: number of pieces the final output is split into.
        inputX: input to the first layer; per the original comment shaped
            (maxTimeSteps, args.batch_size, args.num_feature).
        cell_fn: callable building an RNN cell from
            `(num_hidden, activation=...)`.
        seqLengths: passed as `sequence_length` to the RNN.
        time_major: accepted but not read; the RNN is always called with
            `time_major=True`.

    Returns:
        List of tensors, each reshaped to [args.batch_size, args.num_hidden].
    """
    last_layer = args.num_layer - 1
    layer_input = inputX

    for layer in range(args.num_layer):
        scope = 'DBRNN_' + str(layer + 1)
        forward_cell = cell_fn(args.num_hidden, activation=args.activation)
        backward_cell = cell_fn(args.num_hidden, activation=args.activation)

        outputs, output_states = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=layer_input,
            dtype=tf.float32,
            sequence_length=seqLengths,
            time_major=True,
            scope=scope)

        # Per-direction outputs and final states (states are not used).
        output_fw, output_bw = outputs
        _state_fw, _state_bw = output_states

        # Concatenate both directions on the last axis, then split that
        # axis into (2, size/2) so the two directions can be summed.
        output_fb = tf.concat([output_fw, output_bw], 2)
        shape = output_fb.get_shape().as_list()
        output_fb = tf.reshape(
            output_fb, [shape[0], shape[1], 2, int(shape[2] / 2)])
        hidden = tf.reduce_sum(output_fb, 2)
        hidden = dropout(hidden, args.keep_prob, (args.mode == 'train'))

        if layer != last_layer:
            layer_input = hidden
        else:
            # Flatten to [-1, num_hidden], then cut into maxTimeSteps
            # equal pieces along axis 0.
            outputXrs = tf.reshape(hidden, [-1, args.num_hidden])
            output_list = tf.split(outputXrs, maxTimeSteps, 0)
            fbHrs = [
                tf.reshape(piece, [args.batch_size, args.num_hidden])
                for piece in output_list
            ]
    return fbHrs
def build_model(tparams, options):
    """Build the symbolic graph of the bidirectional sentence classifier.

    The embedded sentence is encoded once in the given order and once
    reversed; both encodings are concatenated, passed through dropout and
    fed to a softmax layer ('Wy', 'by'). The cost is the mean negative
    log-probability of the labels `y`.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'Wy' and 'by' are
            read here and the mapping is passed to `encoder`.
        options: not read inside this function.

    Returns:
        Tuple `(use_noise, x, mask, y, f_pred_prob, f_pred, cost)` where
        `f_pred_prob` and `f_pred` are compiled functions of `(x, mask)`.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Shape notes carried over from the original comments.
    x = tensor.matrix('x', dtype='int32')              # n_steps * n_samples
    mask = tensor.matrix('mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int32')              # (n_samples,)

    n_steps, n_samples = x.shape[0], x.shape[1]
    n_x = tparams['Wemb'].shape[1]  # word-embedding dimension

    word_vectors = tparams['Wemb'][x.flatten()]
    emb = dropout(word_vectors.reshape([n_steps, n_samples, n_x]),
                  trng, use_noise)

    # Encode the sequence in both directions (each n_samples * n_h).
    h_encoder = encoder(tparams, emb, mask=mask, prefix='lstm_encoder')
    h_encoder_rev = encoder(tparams, emb[::-1], mask=mask[::-1],
                            prefix='lstm_encoder_rev')

    # n_samples * (2*n_h)
    z = tensor.concatenate((h_encoder, h_encoder_rev), axis=1)
    z = dropout(z, trng, use_noise)

    # Class probabilities, n_samples * n_y.
    pred = tensor.nnet.softmax(tensor.dot(z, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # Cross-entropy loss; 1e-6 keeps the logarithm finite.
    label_prob = pred[tensor.arange(n_samples), y]
    cost = -tensor.log(label_prob + 1e-6).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
def call(self, inputs):
    """Apply multi-head attention followed by the output projection.

    `inputs` is projected by `self.conv1d_c`, split into three equal parts
    along axis 2 (query, key, value), attended per head, merged, projected
    by `self.conv1d_a` and passed through dropout.

    Args:
        inputs: tensor fed to `self.conv1d_c`.

    Returns:
        The projected attention output after dropout.
    """
    projected = self.conv1d_c(inputs)
    query, key, value = tf.split(projected, 3, 2)

    query = self.split_heads(query, self.n_head)
    key = self.split_heads(key, self.n_head, k=True)
    value = self.split_heads(value, self.n_head)

    attended = self.merge_heads(self._attn(query, key, value))
    attended = self.conv1d_a(attended)
    return dropout(attended, self.resid_pdrop, self.train)
def forward_fc(self, inp, weights, reuse=False, is_training=False):
    """Run the three-layer fully-connected head.

    Each layer is dense -> batch normalization -> ReLU; dropout follows
    the first two ReLUs.

    Args:
        inp: input tensor, reshaped to `[-1, 512]`.
        weights: mapping with 'dense<i>_weights' / 'dense<i>_biases' for
            i in 1..3.
        reuse: not read inside this function.
        is_training: passed as `training` to every batch normalization.

    Returns:
        `(dense2, bn3)` when `self.loss_func` equals
        `self.additive_angular_margin_softmax`, `(dense2, relu3)` when it
        equals `self.softmax`, and None otherwise.
    """
    def batch_norm(layer_in, name):
        # Same normalization settings for every layer; only the name varies.
        return tf.layers.batch_normalization(
            layer_in, momentum=0.99, training=is_training, name=name,
            reuse=tf.AUTO_REUSE)

    flat = tf.reshape(inp, [-1, 512])

    dense1 = fc(flat, weights['dense1_weights'], weights['dense1_biases'],
                activation=None)
    hidden1 = dropout(tf.nn.relu(batch_norm(dense1, 'bn1')), self.KEEP_PROB)

    dense2 = fc(hidden1, weights['dense2_weights'], weights['dense2_biases'],
                activation=None)
    hidden2 = dropout(tf.nn.relu(batch_norm(dense2, 'bn2')), self.KEEP_PROB)

    dense3 = fc(hidden2, weights['dense3_weights'], weights['dense3_biases'],
                activation=None)
    bn3 = batch_norm(dense3, 'bn3')
    relu3 = tf.nn.relu(bn3)

    if self.loss_func == self.additive_angular_margin_softmax:
        # The angular-margin loss receives the linear (pre-ReLU) output.
        return dense2, bn3
    elif self.loss_func == self.softmax:
        return dense2, relu3
def build_model(tparams, options):
    """Build the symbolic graph of the two-layer sequence model.

    The embedded words pass through two `decoder_layer` calls (prefixes
    'decoder_h1' and 'decoder_h2') with dropout around them, then a
    softmax layer ('Vhid', 'bhid'). The cost is the negative
    log-probability of the targets `y`, averaged over steps and samples.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'Vhid' and 'bhid'
            are read here and the mapping is passed to `decoder_layer`.
        options: not read inside this function.

    Returns:
        Tuple `(use_noise, x, y, f_pred_prob, cost)` where `f_pred_prob`
        is a compiled function of `(x, y)` returning the probability
        given to every target.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')  # n_steps * n_samples
    y = tensor.matrix('y', dtype='int64')

    n_steps, n_samples = x.shape[0], x.shape[1]
    n_x = tparams['Wemb'].shape[1]

    word_vectors = tparams['Wemb'][x.flatten()]
    emb = dropout(word_vectors.reshape([n_steps, n_samples, n_x]),
                  trng, use_noise)

    hidden = decoder_layer(tparams, emb, prefix='decoder_h1')
    hidden = dropout(hidden, trng, use_noise)
    hidden = decoder_layer(tparams, hidden, prefix='decoder_h2')
    hidden = dropout(hidden, trng, use_noise)

    # n_steps * n_samples * n_h, flattened over the first two axes.
    shape = hidden.shape
    n_tokens = shape[0] * shape[1]
    hidden_flat = hidden.reshape((n_tokens, shape[2]))

    pred = tensor.nnet.softmax(
        tensor.dot(hidden_flat, tparams['Vhid']) + tparams['bhid'])

    # Probability given to each target index.
    y_pred = pred[tensor.arange(n_tokens), y.reshape((n_tokens,))]

    f_pred_prob = theano.function([x, y], y_pred, name='f_pred_prob')

    # 1e-6 keeps the logarithm finite.
    total_nll = -tensor.log(y_pred + 1e-6).sum()
    cost = total_nll / n_steps / n_samples

    return use_noise, x, y, f_pred_prob, cost
def forward_fc(self, inp, weights, reuse=False, is_training=False):
    """Run the three-layer fully-connected head.

    The first two layers are dense -> batch normalization -> ReLU ->
    dropout; the third is dense -> batch normalization.

    Args:
        inp: input tensor, reshaped to `[-1, 512]`.
        weights: mapping with 'dense<i>_weights' / 'dense<i>_biases' for
            i in 1..3.
        reuse: not read inside this function.
        is_training: passed as `training` to every batch normalization.

    Returns:
        Tuple `(dense1, bn3)`: the first dense output and the normalized
        output of the third layer.
    """
    def batch_norm(layer_in, name):
        # Same normalization settings for every layer; only the name varies.
        return tf.layers.batch_normalization(
            layer_in, momentum=0.99, training=is_training, name=name,
            reuse=tf.AUTO_REUSE)

    flat = tf.reshape(inp, [-1, 512])

    dense1 = fc(flat, weights['dense1_weights'], weights['dense1_biases'],
                activation=None)
    hidden1 = dropout(tf.nn.relu(batch_norm(dense1, 'bn1')), self.KEEP_PROB)

    dense2 = fc(hidden1, weights['dense2_weights'], weights['dense2_biases'],
                activation=None)
    hidden2 = dropout(tf.nn.relu(batch_norm(dense2, 'bn2')), self.KEEP_PROB)

    dense3 = fc(hidden2, weights['dense3_weights'], weights['dense3_biases'],
                activation=None)
    bn3 = batch_norm(dense3, 'bn3')

    return dense1, bn3
def build_model(tparams, options):
    """Build the symbolic graph of the CNN sentence classifier.

    Word indices are embedded, passed through one `encoder` per configured
    filter, and the concatenated features feed a softmax layer
    ('Wy', 'by'). The cost is the mean negative log-probability of the
    labels `y`.

    Args:
        tparams: mapping of shared parameters; 'Wemb', 'Wy' and 'by' are
            read here and the mapping is passed to `encoder`.
        options: mapping of settings; 'filter_hs', 'filter_shapes' and
            'pool_sizes' are read here.

    Returns:
        Tuple `(use_noise, x, y, f_pred_prob, f_pred, cost)` where
        `f_pred_prob` and `f_pred` are compiled functions of `x` returning
        the class probabilities and the arg-max class.
    """
    trng = RandomStreams(SEED)
    # Shared scalar handed to every dropout() call.
    use_noise = theano.shared(numpy_floatX(0.))

    # Word indices. The first axis is paired with the label vector below
    # (`pred[arange(x.shape[0]), y]`), so rows index samples here.
    x = tensor.matrix('x', dtype='int32')
    # One label per sample.
    y = tensor.vector('y', dtype='int32')

    # Embed and reshape to (x.shape[0], 1, x.shape[1], embedding_dim).
    layer0_input = tparams['Wemb'][
        tensor.cast(x.flatten(), dtype='int32')
    ].reshape((x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))
    layer0_input = dropout(layer0_input, trng, use_noise)

    # One encoder per filter. `range` replaces the Python-2-only `xrange`
    # so the function also runs on Python 3.
    layer1_inputs = []
    for i in range(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_inputs.append(conv_layer)

    layer1_input = tensor.concatenate(layer1_inputs, 1)
    layer1_input = dropout(layer1_input, trng, use_noise)

    # Class probabilities.
    pred = tensor.nnet.softmax(
        tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # Cross-entropy loss; 1e-6 keeps the logarithm finite.
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()

    return use_noise, x, y, f_pred_prob, f_pred, cost
def build_multi_dynamic_brnn(args, maxTimeSteps, inputX, cell_fn, seqLengths,
                             time_major=True):
    """Stack `args.num_layer` bidirectional dynamic RNN layers.

    In every layer the forward and backward outputs are summed and passed
    through `dropout`; the result feeds the next layer. The last layer's
    output is split into `maxTimeSteps` pieces.

    Args:
        args: settings object; reads num_layer, num_hidden, activation,
            keep_prob, mode and batch_size.
        maxTimeSteps: number of pieces the final output is split into.
        inputX: input to the first layer.
        cell_fn: callable building an RNN cell from
            `(num_hidden, activation=...)`.
        seqLengths: passed as `sequence_length` to the RNN.
        time_major: accepted but not read; the RNN is always called with
            `time_major=True`.

    Returns:
        List of tensors, each reshaped to [args.batch_size, args.num_hidden].
    """
    last_layer = args.num_layer - 1
    layer_input = inputX

    for layer in range(args.num_layer):
        scope = 'DBRNN_' + str(layer + 1)
        forward_cell = cell_fn(args.num_hidden, activation=args.activation)
        backward_cell = cell_fn(args.num_hidden, activation=args.activation)

        outputs, output_states = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=layer_input,
            dtype=tf.float32,
            sequence_length=seqLengths,
            time_major=True,
            scope=scope)

        # Per-direction outputs and final states (states are not used).
        output_fw, output_bw = outputs
        _state_fw, _state_bw = output_states

        # Concatenate both directions on the last axis, then split that
        # axis into (2, size/2) so the two directions can be summed.
        output_fb = tf.concat([output_fw, output_bw], 2)
        shape = output_fb.get_shape().as_list()
        output_fb = tf.reshape(
            output_fb, [shape[0], shape[1], 2, int(shape[2] / 2)])
        hidden = tf.reduce_sum(output_fb, 2)
        hidden = dropout(hidden, args.keep_prob, (args.mode == 'train'))

        if layer != last_layer:
            layer_input = hidden
        else:
            # Flatten to [-1, num_hidden], then cut into maxTimeSteps
            # equal pieces along axis 0.
            outputXrs = tf.reshape(hidden, [-1, args.num_hidden])
            output_list = tf.split(outputXrs, maxTimeSteps, 0)
            fbHrs = [
                tf.reshape(piece, [args.batch_size, args.num_hidden])
                for piece in output_list
            ]
    return fbHrs
def attention_func(input_tensor, attention_mask, hidden_size,
                   hidden_dropout_prob, num_attention_heads,
                   attention_head_size, attention_probs_dropout_prob,
                   initializer_range, batch_size, seq_length):
    """Apply self-attention, an output projection and a residual layer norm.

    Args:
        input_tensor: tensor used both as attention source and target, and
            as the residual added before the layer norm.
        attention_mask: forwarded to `attention_layer`.
        hidden_size: width of the output projection.
        hidden_dropout_prob: passed to `utils.dropout` on the projection.
        num_attention_heads: forwarded to `attention_layer`.
        attention_head_size: forwarded as `size_per_head`.
        attention_probs_dropout_prob: forwarded to `attention_layer`.
        initializer_range: used for the attention and projection kernels.
        batch_size: forwarded to `attention_layer`.
        seq_length: forwarded as both `from_seq_length` and
            `to_seq_length`.

    Returns:
        Tuple `(attention_output, scope)` where `scope` is the
        "attention" variable scope.
    """
    # NOTE(review): the nesting of the variable scopes below was
    # reconstructed from the statement order - confirm it matches the
    # variable names expected by existing checkpoints.
    heads = []
    with tf.variable_scope("attention") as scope:
        with tf.variable_scope("self"):
            self_head = attention_layer(
                from_tensor=input_tensor,
                to_tensor=input_tensor,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            heads.append(self_head)

        if len(heads) != 1:
            # With additional sequences, their heads would be concatenated
            # to the self-attention head before the projection.
            attention_output = tf.concat(heads, axis=-1)
        else:
            attention_output = heads[0]

        # Linear projection to `hidden_size`, then the residual connection
        # with the layer input.
        with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
                attention_output,
                hidden_size,
                kernel_initializer=utils.create_initializer(
                    initializer_range))
            attention_output = utils.dropout(attention_output,
                                             hidden_dropout_prob)
            attention_output = utils.layer_norm(
                attention_output + input_tensor)

    return attention_output, scope
def feedforward_func(input_tensor, intermediate_size, initializer_range,
                     hidden_size, hidden_dropout_prob,
                     intermediate_act_fn=utils.gelu):
    """Apply the position-wise feed-forward block with a residual layer norm.

    Args:
        input_tensor: block input, also used as the residual.
        intermediate_size: width of the hidden dense layer.
        initializer_range: used for both dense kernels.
        hidden_size: width of the down-projection.
        hidden_dropout_prob: passed to `utils.dropout` on the projection.
        intermediate_act_fn: activation of the hidden dense layer only.

    Returns:
        Tuple `(layer_output, scope)` where `scope` is the "feedforward"
        variable scope.
    """
    with tf.variable_scope("feedforward") as scope:
        # Up-projection; this is the only layer with an activation.
        expanded = tf.layers.dense(
            input_tensor,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_dense")

        # Down-projection back to `hidden_size`, then the residual.
        projected = tf.layers.dense(
            expanded,
            hidden_size,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_output")
        projected = utils.dropout(projected, hidden_dropout_prob)
        layer_output = utils.layer_norm(projected + input_tensor)

    return layer_output, scope
def net_1(input, is_train):
    """Build a small conv net: two conv/pool stages and two dense layers.

    Args:
        input: input tensor fed to the first convolution.
        is_train: passed to `prob_close` to choose the dropout keep
            probability.

    Returns:
        Output of the final 10-unit layer, built with `relu=False`.
    """
    # Stage 1: 5x5 conv with 32 filters, 2x2 pooling with stride 2.
    net = conv(input, filter_h=5, filter_w=5, num_filters=32, stride_y=1,
               stride_x=1, name='conv1')
    net = max_pool(net, filter_h=2, filter_w=2, stride_y=2, stride_x=2,
                   name='pool1')

    # Stage 2: same layout, arguments given positionally.
    net = conv(net, 5, 5, 64, 1, 1, 'conv2')
    net = max_pool(net, 2, 2, 2, 2, 'pool2')

    # Classifier head.
    net = flatten_3d(net, name='flattening')
    net = fc(net, out_neurons=1000, name='fc3')
    net = dropout(net, keep_prob=prob_close(is_train, 0.5), name='dropout3')
    return fc(net, out_neurons=10, name='fc4', relu=False)
def train_functions(model, datasets, batch_size, learning_rate,
                    annealing_learning_rate, l1_learning_rate,
                    l2_learning_rate, dropout_rate=None, noise_rate=None):
    """Compile the training, testing and validation functions of a model.

    `train` performs one gradient step on the mini-batch selected by an
    index and multiplies the learning rate by `annealing_learning_rate`.
    `test` and `validate` both evaluate `error_function` on an explicit
    `(input, y)` batch.

    :type model: object
    :param model: model exposing `input`, `params`, `L1` and `L2`

    :type datasets: mapping
    :param datasets: its 'train_set' entry holds the pair
                     (train_set_x, train_set_y)

    :type batch_size: int
    :param batch_size: size of a training mini-batch

    :type learning_rate: float
    :param learning_rate: initial learning rate

    :type annealing_learning_rate: float
    :param annealing_learning_rate: factor applied to the learning rate
                                    at every training step

    :type l1_learning_rate: float
    :param l1_learning_rate: weight of the L1 norm added to the cost

    :type l2_learning_rate: float
    :param l2_learning_rate: weight of the L2 norm added to the cost

    :param dropout_rate: when not None, the training input goes through
                         `utils.dropout` with this noise level (rescaled)

    :param noise_rate: when not None, the training input goes through
                       `utils.add_gaussian` with this noise level

    :return: tuple (train_model, test_model, validate_model)
    """
    train_set_x, train_set_y = datasets['train_set']

    y = T.matrix('y')
    index = T.lscalar()

    # Functions reporting the model's error on an explicit batch.
    test_model = theano.function(
        inputs=[model.input, y],
        outputs=error_function(model, y)
    )
    validate_model = theano.function(
        inputs=[model.input, y],
        outputs=error_function(model, y)
    )

    # Training objective: model cost plus the L1 and L2 penalties.
    loss_function = (
        cost_function(model, y)
        + l1_learning_rate * model.L1
        + l2_learning_rate * model.L2
    )
    gparams = [T.grad(loss_function, param) for param in model.params]

    # The learning rate lives in a shared variable so it can be annealed
    # as part of the updates.
    state_learning_rate = theano.shared(
        numpy.asarray(learning_rate, dtype=theano.config.floatX),
        borrow=True)

    updates = [
        (state_learning_rate, annealing_learning_rate * state_learning_rate)
    ]
    # Plain gradient-descent update for every parameter.
    updates.extend(
        (param, param - state_learning_rate * gparam)
        for param, gparam in zip(model.params, gparams)
    )

    # Rows of the training set belonging to mini-batch `index`.
    batch = slice(index * batch_size, (index + 1) * batch_size)

    model_input = train_set_x[batch]
    if noise_rate is not None:
        model_input = utils.add_gaussian(input=model_input,
                                         noise_level=noise_rate)
    if dropout_rate is not None:
        model_input = utils.dropout(input=model_input,
                                    noise_level=dropout_rate, rescale=True)

    train_model = theano.function(
        inputs=[index],
        outputs=loss_function,
        updates=updates,
        givens={
            model.input: model_input,
            y: train_set_y[batch]
        }
    )

    return train_model, test_model, validate_model
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Multi-headed attention from `from_tensor` to `to_tensor`.

    Follows "Attention is all you Need". When `from_tensor` and `to_tensor`
    are the same tensor this is self-attention. Every timestep of
    `from_tensor` attends over `to_tensor` and yields a fixed-width vector.

    `from_tensor` is projected to queries and `to_tensor` to keys and
    values. Queries and keys are dot-producted, scaled by
    1/sqrt(size_per_head) and softmaxed into attention probabilities, which
    then weight the values. The heads are handled through reshapes and
    transposes rather than as separate tensors.

    Shape letters used in the comments below:
      B = batch size (number of sequences)
      F = `from_tensor` sequence length
      T = `to_tensor` sequence length
      N = `num_attention_heads`
      H = `size_per_head`

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width] (or its 2D reshaping).
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]
        (or its 2D reshaping).
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length] holding 1s and 0s. Positions with 0
        get -10000.0 added to their score before the softmax, which
        effectively removes them; positions with 1 are unchanged.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability
        applied to the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      do_return_2d_tensor: bool. Selects the 2D output layout (see Returns).
      batch_size: (optional) int. Required when the inputs are 2D; ignored
        (overwritten from the tensor shape) when they are 3D.
      from_seq_length: (optional) Same rule as `batch_size`, for the
        sequence length of `from_tensor`.
      to_seq_length: (optional) Same rule as `batch_size`, for the sequence
        length of `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head], or
      [batch_size * from_seq_length, num_attention_heads * size_per_head]
      when `do_return_2d_tensor` is true.

    Raises:
      ValueError: The input ranks differ, or the inputs are rank 2 and one
        of `batch_size`, `from_seq_length`, `to_seq_length` is missing.
    """
    from_shape = utils.get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = utils.get_shape_list(to_tensor, expected_rank=[2, 3])

    rank = len(from_shape)
    if rank != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if rank == 3:
        # 3D inputs carry their own dimensions; these override the arguments.
        batch_size, from_seq_length = from_shape[0], from_shape[1]
        to_seq_length = to_shape[1]
    elif rank == 2:
        required_dims = (batch_size, from_seq_length, to_seq_length)
        if any(dim is None for dim in required_dims):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    hidden_size = num_attention_heads * size_per_head

    def project(inputs_2d, activation, name):
        # Dense projection to N*H features under the given variable name.
        return tf.layers.dense(
            inputs_2d,
            hidden_size,
            activation=activation,
            name=name,
            kernel_initializer=utils.create_initializer(initializer_range))

    def split_heads(flat_tensor, seq_length):
        # [B*S, N*H] -> [B, S, N, H] -> [B, N, S, H]
        per_head = tf.reshape(
            flat_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        return tf.transpose(per_head, [0, 2, 1, 3])

    from_2d = utils.reshape_to_matrix(from_tensor)
    to_2d = utils.reshape_to_matrix(to_tensor)

    queries = project(from_2d, query_act, "query")  # [B*F, N*H]
    keys = project(to_2d, key_act, "key")           # [B*T, N*H]
    values = project(to_2d, value_act, "value")     # [B*T, N*H]

    queries = split_heads(queries, from_seq_length)  # [B, N, F, H]
    keys = split_heads(keys, to_seq_length)          # [B, N, T, H]

    # Raw attention scores: scaled query-key dot products, [B, N, F, T].
    scores = tf.matmul(queries, keys, transpose_b=True)
    scores = tf.multiply(scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [B, F, T] -> [B, 1, F, T] so the mask broadcasts over the heads.
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # 0.0 where attention is allowed, -10000.0 where it is masked.
        # Adding this before the softmax effectively removes the masked
        # positions.
        penalty = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        scores = scores + penalty

    # Attention probabilities, [B, N, F, T].
    probs = tf.nn.softmax(scores)

    # This drops entire tokens to attend to, which may look unusual but is
    # taken from the original Transformer paper.
    probs = utils.dropout(probs, attention_probs_dropout_prob)

    values = split_heads(values, to_seq_length)  # [B, N, T, H]

    context = tf.matmul(probs, values)              # [B, N, F, H]
    context = tf.transpose(context, [0, 2, 1, 3])   # [B, F, N, H]

    if do_return_2d_tensor:
        output_shape = [batch_size * from_seq_length, hidden_size]  # [B*F, N*H]
    else:
        output_shape = [batch_size, from_seq_length, hidden_size]  # [B, F, N*H]

    return tf.reshape(context, output_shape)
def call(self, inputs):
    """Apply `conv_fc`, the activation, `conv_proj`, then dropout.

    The dropout call receives `self.resid_pdrop` and `self.train`; what
    those control is defined by the `dropout` helper, which is not visible
    from here.
    """
    projected = self.conv_proj(self.act(self.conv_fc(inputs)))
    return dropout(projected, self.resid_pdrop, self.train)
def build_model(tparams, options):
    """Build the symbolic contrastive cost over bidirectionally encoded text.

    Three int32 token matrices are declared, each with a floatX mask:
    `x`, `y` (n_steps * n_samples) and the contrastive strings `cy`
    (n_steps * (ncon*n_samples)). Each is embedded with `tparams['Wemb']`,
    encoded forward ('encoder_f') and on the time-reversed sequence
    ('encoder_b'), concatenated, passed through dropout and `l2norm`.

    The cost is::

        log(1 + sum(exp(-gamma * (<x, y> - <x, cy>))))

    with `x` and `y` tiled `options['ncon']` times along the first axis and
    the inner products taken row by row.

    Reads `options['ncon']` and `options['gamma']`; the random stream is
    seeded from the module-level `SEED`.

    Returns `(use_noise, [x, x_mask, y, y_mask, cy, cy_mask], cost)`.
    """
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    emb_dim = tparams['Wemb'].shape[1]

    def embed(tokens, n_columns):
        # Look up embeddings: n_steps * n_columns * emb_dim.
        return tparams['Wemb'][tokens.flatten()].reshape(
            [tokens.shape[0], n_columns, emb_dim])

    def encode_both_directions(emb, mask):
        # Forward pass plus a pass over the reversed sequence, joined on
        # the feature axis: n_columns * (2*n_h).
        forward = encoder(tparams, emb, mask=mask, prefix='encoder_f')
        backward = encoder(tparams, emb[::-1], mask=mask[::-1],
                           prefix='encoder_b')
        return tensor.concatenate((forward, backward), axis=1)

    # description string: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    x_mask = tensor.matrix('x_mask', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int32')
    y_mask = tensor.matrix('y_mask', dtype=config.floatX)

    # The sample count is taken from `x` and reused for `y`.
    n_samples = x.shape[1]

    x_emb = embed(x, n_samples)
    y_emb = embed(y, n_samples)

    sent_x = encode_both_directions(x_emb, x_mask)
    sent_y = encode_both_directions(y_emb, y_mask)

    sent_x = dropout(sent_x, trng, use_noise)
    sent_y = dropout(sent_y, trng, use_noise)

    sent_x = l2norm(sent_x)
    sent_y = l2norm(sent_y)

    # contrastive strings
    # description string: n_steps * (ncon*n_samples)
    cy = tensor.matrix('cy', dtype='int32')
    cy_mask = tensor.matrix('cy_mask', dtype=config.floatX)

    cy_emb = embed(cy, cy.shape[1])

    # (ncon*n_samples) * (2*n_h)
    sent_cy = encode_both_directions(cy_emb, cy_mask)
    sent_cy = dropout(sent_cy, trng, use_noise)
    sent_cy = l2norm(sent_cy)

    # Tile by number of contrast terms: (ncon*n_samples) * (2*n_h)
    repeats = (options['ncon'], 1)
    sent_x = tensor.tile(sent_x, repeats)
    sent_y = tensor.tile(sent_y, repeats)

    matched_score = (sent_x * sent_y).sum(axis=1)
    contrast_score = (sent_x * sent_cy).sum(axis=1)
    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] * (matched_score - contrast_score))))

    return use_noise, [x, x_mask, y, y_mask, cy, cy_mask], cost
def build_model(tparams, options):
    """Build the symbolic contrastive cost over CNN-encoded token matrices.

    Three int32 token matrices `x`, `y` and `cy` are declared. Each one is
    embedded with `tparams['Wemb']`, run through one `encoder` per entry of
    `options['filter_hs']` (parameter prefix `cnn_encoder_<i>` via `_p`),
    and the per-filter outputs are concatenated along axis 1, passed
    through dropout and then `l2norm`.

    The cost is::

        log(1 + sum(exp(-gamma * (<x, y> - <x, cy>))))

    with `x` and `y` tiled `options['ncon']` times along the first axis and
    the inner products taken row by row.

    Reads `options['filter_hs']`, `options['filter_shapes']`,
    `options['pool_sizes']`, `options['ncon']` and `options['gamma']`; the
    random stream is seeded from the module-level `SEED`.

    Returns `(use_noise, [x, y, cy], cost)`.
    """
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int32')
    y = tensor.matrix('y', dtype='int32')
    cy = tensor.matrix('cy', dtype='int32')

    def encode(tokens):
        # Embed, convolve with every configured filter, concatenate, drop out.
        # The embedding is laid out as
        # (tokens.shape[0], 1, tokens.shape[1], emb_dim);
        # presumably (n_samples, 1 channel, n_steps, emb_dim) -- TODO confirm.
        embedded = tparams['Wemb'][
            tensor.cast(tokens.flatten(), dtype='int32')
        ].reshape(
            (tokens.shape[0], 1, tokens.shape[1], tparams['Wemb'].shape[1]))

        # `range` (not the Python-2-only `xrange`) keeps this runnable on
        # both Python 2 and Python 3. Indexing by position keeps the loop
        # length tied to `filter_hs`, exactly as before.
        per_filter = [
            encoder(tparams, embedded,
                    filter_shape=options['filter_shapes'][i],
                    pool_size=options['pool_sizes'][i],
                    prefix=_p('cnn_encoder', i))
            for i in range(len(options['filter_hs']))
        ]
        features = tensor.concatenate(per_filter, 1)
        return dropout(features, trng, use_noise)

    # Encoding order (x, y, cy) fixes the order in which dropout draws from
    # the random stream; keep it.
    feat_x = encode(x)
    feat_y = encode(y)
    feat_cy = encode(cy)

    feat_x = l2norm(feat_x)
    feat_y = l2norm(feat_y)
    feat_cy = l2norm(feat_cy)

    # Tile by number of contrast terms so that x and y line up row by row
    # with cy (assumed to hold ncon*n_samples rows -- confirm with caller).
    repeats = (options['ncon'], 1)
    feat_x = tensor.tile(feat_x, repeats)
    feat_y = tensor.tile(feat_y, repeats)

    matched_score = (feat_x * feat_y).sum(axis=1)
    contrast_score = (feat_x * feat_cy).sum(axis=1)
    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] * (matched_score - contrast_score))))

    return use_noise, [x, y, cy], cost