def generater(self, z, y=None):
    if y is not None:
        yb = tf.reshape(y, [-1, 1, 1, self.y_dim])
        z = tf.concat(1, [z, y])

        h0 = tf.nn.relu(bn0(linear(z, self.gfc_dim)))
        h0 = tf.concat(1, [h0, y])

        h1 = tf.nn.relu(bn1(linear(h0, self.gf_dim * 2 * 7 * 7)))
        h1 = tf.reshape(h1, [-1, 7, 7, self.gf_dim * 2])
        h1 = conv_cond_concat(h1, yb)

        h2 = tf.nn.relu(bn2(deconv2d(h1, self.gf_dim, name='h2')))
        h2 = conv_cond_concat(h2, yb)

        return tf.nn.sigmoid(deconv2d(h2, self.c_dim, name='h3'))
    else:
        h0 = tf.nn.relu(bn0(linear(z, self.gf_dim * 8 * 4 * 4)))
        h0 = tf.reshape(h0, [-1, 4, 4, self.gf_dim * 8])

        h1 = deconv2d(h0, self.gf_dim * 4, name='h1')
        h1 = tf.nn.relu(bn1(h1))

        h2 = deconv2d(h1, self.gf_dim * 2, name='h2')
        h2 = tf.nn.relu(bn2(h2))

        h3 = deconv2d(h2, self.gf_dim * 1, name='h3')
        h3 = tf.nn.relu(bn3(h3))

        h4 = deconv2d(h3, 3, name='h4')
        return tf.nn.tanh(h4)
def alphgo(_x, _weights, _biases, _dropout):
    _x = tf.reshape(_x, [-1, 19, 19, 1])  # 19x19 board as a single-channel image

    # convolution layer 1
    conv1 = tf.nn.relu(conv2d(_x, _weights["conv1"]) + _biases["conv1"])
    pool1 = max_pool(conv1, k=2)
    norm1 = norm(pool1, lsize=4)
    norm1 = tf.nn.dropout(norm1, _dropout)
    # log conv1 activations as an image summary (old API needs a tag)
    tf.image_summary("conv1", conv1)

    # convolution layer 2
    conv2 = tf.nn.relu(conv2d(norm1, _weights["conv2"]) + _biases["conv2"])
    pool2 = max_pool(conv2, k=2)
    norm2 = norm(pool2, lsize=4)
    norm2 = tf.nn.dropout(norm2, _dropout)

    # convolution layer 3
    conv3 = tf.nn.relu(conv2d(norm2, _weights["conv3"]) + _biases["conv3"])
    pool3 = max_pool(conv3, k=2)
    norm3 = norm(pool3, lsize=4)
    norm3 = tf.nn.dropout(norm3, _dropout)

    # fully connected layers
    dense1 = tf.reshape(norm3, [-1, 4 * 4 * 1024])
    dense1 = tf.nn.relu(tf.matmul(dense1, _weights["d1"]) + _biases["d1"])
    dense2 = tf.nn.relu(tf.matmul(dense1, _weights["d2"]) + _biases["d2"])
    out = tf.matmul(dense2, _weights["out"]) + _biases["out"]
    return out
def act_mrelu(net, mrelu):
    """Check this works."""
    net2 = mrelu["mult"] * (mrelu["addi"] + net)
    net2 = -tf.nn.relu(net2)
    out_1 = tf.math.reduce_sum(net2)
    out = net - tf.nn.relu(out_1)
    return out
def tf_ReLU_lin_grad(input_tensor):
    # ReLU forward pass with a straight-through (identity) backward pass.
    y = tf.nn.relu(input_tensor)

    def grad(dy):
        return tf.identity(dy)

    return y, grad
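# Minimal usage sketch (an assumption, not part of the original snippet):
# in TF 1.x graph mode with tf.custom_gradient available (TF >= 1.7), wrapping
# the function registers the identity backward pass, so the gradient is 1
# everywhere, including where the ReLU output is zero.
relu_lin_grad = tf.custom_gradient(tf_ReLU_lin_grad)

x = tf.constant([-2.0, 0.5, 3.0])
y = relu_lin_grad(x)
dy_dx = tf.gradients(y, x)[0]

with tf.Session() as sess:
    print(sess.run(dy_dx))  # [1. 1. 1.] instead of the usual [0. 1. 1.]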
def apply_nonlin(self, x):
    if self.nonlin_type == 'lrelu':
        # tf.nn.relu has no `leak` argument; a leaky ReLU is a separate op
        return tf.nn.leaky_relu(x, alpha=0.01)
    elif self.nonlin_type == 'tanh':
        return tf.tanh(x)
    else:
        raise NotImplementedError(self.nonlin_type)
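# If the target TensorFlow version predates tf.nn.leaky_relu (added in 1.4),
# an equivalent 'lrelu' can be expressed with two ReLUs; a minimal sketch:
def lrelu(x, leak=0.01):
    # max(x, leak * x) == relu(x) - leak * relu(-x)
    return tf.nn.relu(x) - leak * tf.nn.relu(-x)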
def dynamicRNN(x, seqlen):
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)

    # Permuting batch_size and n_steps
    # x = tf.transpose(x, [1, 0, 2])
    # Reshaping to (n_steps*batch_size, n_input)
    # x = tf.reshape(x, [-1, maximum_words_in_sentences])
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.split(1, maximum_words_in_sentences, x)
    x = [tf.squeeze(x_, [1]) for x_ in x]

    # Define an LSTM cell, with dropout on its outputs
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1)
    tm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=0.25)
    # cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * 2)

    # calculation. TODO: implement 2-layer
    outputs, states = tf.nn.rnn(tm_cell, x, dtype=tf.float32, sequence_length=seqlen)

    # batch_size_ = outputs[0].get_shape()[0]
    # batch_size_ = tf.cast(batch_size_, int)
    # outputs = tf.reduce_mean(outputs, 0)

    # Project each example's step outputs and apply ReLU
    outputs = tf.split(1, batch_size, outputs)
    outputs = [tf.reshape(output, [-1, n_hidden]) for output in outputs]
    # print outputs[0].get_shape()
    for i in range(batch_size):
        outputs[i] = tf.nn.xw_plus_b(outputs[i], weights['out'], biases['out'], name="linear")
        outputs[i] = tf.nn.relu(outputs[i])
    outputs = tf.pack(outputs)  # the [1] axis with length batch_size becomes axis [0] now
    outputs = tf.reduce_mean(outputs, 1)  # change to 1 accordingly
    return outputs
def __init__(self, max_words, num_classes, vocab_size, embedding_size, num_hidden):
    # input, output, dropout placeholders
    self.text = tf.placeholder(tf.int32, [None, max_words], name="input_text")
    self.extra = tf.placeholder(tf.int32, [None, max_words], name="input_extra")
    self.output = tf.placeholder(tf.float32, [None, num_classes], name="output_y")
    self.sequence_lengths = tf.placeholder(tf.int32, [None], name="sequence_lengths")
    self.dropout_prob = tf.placeholder(tf.float32, name="dropout_probability")

    # Word embedding layer
    with tf.device("/cpu:0"), tf.name_scope("word_embedding"):
        embedding_matrix = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),  # random numbers between -1 and 1
            name="embedding_matrix")
        self.lookup = tf.nn.embedding_lookup(embedding_matrix, self.text)

    # GRU
    with tf.name_scope("GRU"):
        output, state = rnn.dynamic_rnn(
            rnn_cell.GRUCell(num_hidden),
            self.lookup,
            dtype=tf.float32,
            sequence_length=self.sequence_lengths)
        output = tf.transpose(output, [1, 0, 2])
        self.gru = tf.gather(output, int(output.get_shape()[0]) - 1)

    # Add dropout
    with tf.name_scope("dropout"):
        self.dropout = tf.nn.dropout(self.gru, self.dropout_prob)

    # add in extra data and relu layer
    with tf.name_scope("extra_data"):
        # extra is int32, so cast it before concatenating with the float GRU output;
        # the combined vector has width num_hidden + max_words
        combined = tf.concat(1, [self.dropout, tf.cast(self.extra, tf.float32)])
        weights_e = tf.Variable(
            tf.truncated_normal([num_hidden + max_words, num_hidden], stddev=0.1),
            name="weights_extra")
        biases_e = tf.Variable(tf.constant(0.1, shape=[num_hidden]), name="biases_extra")
        processed = tf.nn.relu(tf.matmul(combined, weights_e) + biases_e)

    # Final output
    with tf.name_scope("output"):
        weights = tf.Variable(tf.truncated_normal([num_hidden, num_classes], stddev=0.1), name="weights")
        biases = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="biases")
        unscaled = tf.matmul(processed, weights) + biases
        self.scores = tf.nn.softmax(unscaled, name="scores")
        self.predictions = tf.argmax(self.scores, dimension=1, name="predictions")

    # calculate loss
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits(unscaled, self.output)
        self.loss = tf.reduce_mean(losses)

    # calculate accuracy
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.output, 1))
        self.accuracy = 100 * tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
def get_updates(self, loss, params):
    # K refers to the Keras backend (e.g. `from keras import backend as K`).
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1.
    beta_1 = self.beta_1
    beta_2 = self.beta_2
    beta_1_t = K.pow(beta_1, t)
    beta_2_t = K.pow(beta_2, t)

    # RAdam variance-rectification term
    rho_inf = 2. / (1. - beta_2) - 1.
    rho_t = rho_inf - 2. * t * beta_2_t / (1. - beta_2_t)
    r_t = K.sqrt(
        K.relu(rho_t - 4.) * (rho_t - 2.) * rho_inf /
        (K.relu(rho_inf - 4.) * (rho_inf - 2.) * rho_t))
    flag = K.cast(rho_t > 4., K.floatx())

    ms = [K.zeros(K.int_shape(p)) for p in params]
    vs = [K.zeros(K.int_shape(p)) for p in params]

    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = beta_1 * m + (1. - beta_1) * g
        v_t = beta_2 * v + (1. - beta_2) * K.square(g)
        m_hat_t = m_t / (1. - beta_1_t)
        v_hat_t = K.sqrt(v_t / (1. - beta_2_t))

        new_p = p - lr * (r_t / (v_hat_t + self.epsilon) + flag - 1.) * m_hat_t

        if getattr(p, "constraint", None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
    return self.updates
def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
    with tf.variable_scope('l1'):
        # [1, n_features] * [n_features, n_l1]
        w1 = tf.get_variable(
            'w1', [self.n_features, n_l1],
            initializer=w_initializer, collections=c_names)
        b1 = tf.get_variable(
            'b1', [1, n_l1],
            initializer=b_initializer, collections=c_names)
        l1 = tf.nn.relu(tf.matmul(s, w1) + b1)

    with tf.variable_scope('l2'):
        w2 = tf.get_variable(
            'w2', [n_l1, self.n_actions],
            initializer=w_initializer, collections=c_names)
        b2 = tf.get_variable(
            'b2', [1, self.n_actions],
            initializer=b_initializer, collections=c_names)
        out = tf.matmul(l1, w2) + b2
    return out
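# A minimal usage sketch (an assumption for illustration: the helper is nested
# inside a class providing self.n_features / self.n_actions via closure, and
# the scope names and layer size below are not from the original).
s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
w_initializer = tf.random_normal_initializer(0., 0.3)
b_initializer = tf.constant_initializer(0.1)
with tf.variable_scope('eval_net'):
    c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
    q_eval = build_layers(s, c_names, n_l1=10,
                          w_initializer=w_initializer, b_initializer=b_initializer)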
def build(self, rgb, train_mode=False):
    '''
    Build VGG16.
    :param rgb: input, a 224x224 RGB image
    :param train_mode: flag; when training, dropout is enabled
    '''
    self.conv1_1 = self.conv_layer(rgb, "conv1_1", 3, 64)
    self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2", 64, 64)
    self.pool1 = self.max_pool(self.conv1_2, "pool1")

    self.conv2_1 = self.conv_layer(self.pool1, "conv2_1", 64, 128)
    self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2", 128, 128)
    self.pool2 = self.max_pool(self.conv2_2, "pool2")

    self.conv3_1 = self.conv_layer(self.pool2, "conv3_1", 128, 256)
    self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2", 256, 256)
    self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3", 256, 256)
    self.pool3 = self.max_pool(self.conv3_3, "pool3")

    self.conv4_1 = self.conv_layer(self.pool3, "conv4_1", 256, 512)
    self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2", 512, 512)
    self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3", 512, 512)
    self.pool4 = self.max_pool(self.conv4_3, "pool4")

    self.conv5_1 = self.conv_layer(self.pool4, "conv5_1", 512, 512)
    self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2", 512, 512)
    self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3", 512, 512)
    self.pool5 = self.max_pool(self.conv5_3, "pool5")

    self.fc6 = self.fc_layer(self.pool5, "fc6", 25088, 4096)  # 25088 = ((224 // (2**5))**2) * 512
    self.relu6 = tf.nn.relu(self.fc6)
    if train_mode:
        self.relu6 = tf.nn.dropout(self.relu6, self.dropout)

    self.fc7 = self.fc_layer(self.relu6, "fc7", 4096, 4096)
    self.relu7 = tf.nn.relu(self.fc7)
    if train_mode:
        self.relu7 = tf.nn.dropout(self.relu7, self.dropout)

    self.fc8 = self.fc_layer(self.relu7, "fc8", 4096, 1000)
    self.prob = tf.nn.softmax(self.fc8, name="prob")
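# Minimal usage sketch (Vgg16 here is a hypothetical wrapper class assumed to
# provide conv_layer / fc_layer / max_pool and a dropout keep-probability, as
# the method body implies; it is not defined in this snippet).
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
vgg = Vgg16()
vgg.build(images, train_mode=True)
# vgg.prob now holds the 1000-way softmax over classes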
def forward_prop(x, params):
    w1 = params["w1"]
    w2 = params["w2"]

    # conv -> relu -> 8x8 max pool (tf.nn ops take upper-case padding strings)
    z1 = tf.nn.conv2d(x, w1, [1, 1, 1, 1], padding="SAME")
    a1 = tf.nn.relu(z1)
    p1 = tf.nn.max_pool(a1, ksize=[1, 8, 8, 1], strides=[1, 8, 8, 1], padding="SAME")

    # conv -> relu -> 4x4 max pool
    z2 = tf.nn.conv2d(p1, w2, [1, 1, 1, 1], padding="SAME")
    a2 = tf.nn.relu(z2)
    p2 = tf.nn.max_pool(a2, ksize=[1, 4, 4, 1], strides=[1, 4, 4, 1], padding="SAME")

    # flatten and project to 6 output units (no activation; softmax is applied in the loss)
    p2 = tf.contrib.layers.flatten(p2)
    z3 = tf.contrib.layers.fully_connected(p2, num_outputs=6, activation_fn=None)
    return z3
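# A minimal usage sketch (the 64x64x3 input and the filter shapes below are
# assumptions for illustration; the original parameter initialisation is not
# shown in this snippet).
x = tf.placeholder(tf.float32, [None, 64, 64, 3])
y = tf.placeholder(tf.float32, [None, 6])
params = {
    "w1": tf.get_variable("w1", [4, 4, 3, 8], initializer=tf.contrib.layers.xavier_initializer()),
    "w2": tf.get_variable("w2", [2, 2, 8, 16], initializer=tf.contrib.layers.xavier_initializer()),
}
logits = forward_prop(x, params)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))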
def build_loss(self, seqs_repr, data_ops):
    """Convert per-location real-valued predictions to a loss."""
    # targets
    tstart = self.batch_buffer // self.target_pool
    tend = (self.batch_length - self.batch_buffer) // self.target_pool
    targets = data_ops['label']
    targets = tf.identity(targets[:, tstart:tend, :], name='targets_op')

    # work-around for specifying my own predictions
    self.preds_adhoc = tf.placeholder(
        tf.float32, shape=seqs_repr.shape, name='preds-adhoc')

    # choose link
    if self.link in ['identity', 'linear']:
        self.preds_op = tf.identity(seqs_repr, name='preds')

    elif self.link == 'relu':
        self.preds_op = tf.nn.relu(seqs_repr, name='preds')

    elif self.link == 'exp':
        self.preds_op = tf.exp(tf.clip_by_value(seqs_repr, -50, 50), name='preds')

    elif self.link == 'exp_linear':
        self.preds_op = tf.where(
            seqs_repr > 0, seqs_repr + 1,
            tf.exp(tf.clip_by_value(seqs_repr, -50, 50)),
            name='preds')

    elif self.link == 'softplus':
        self.preds_op = tf.nn.softplus(seqs_repr, name='preds')

    elif self.link == 'softmax':
        # performed in the loss function, but saving probabilities
        self.preds_prob = tf.nn.softmax(seqs_repr, name='preds')

    else:
        print('Unknown link function %s' % self.link, file=sys.stderr)
        exit(1)

    # clip
    if self.target_clip is not None:
        self.preds_op = tf.clip_by_value(self.preds_op, 0, self.target_clip)
        targets = tf.clip_by_value(targets, 0, self.target_clip)

    # sqrt
    if self.target_sqrt:
        self.preds_op = tf.sqrt(self.preds_op)
        targets = tf.sqrt(targets)

    loss_op = None
    loss_adhoc = None

    # choose loss
    if self.loss == 'gaussian':
        loss_op = tf.squared_difference(self.preds_op, targets)
        loss_adhoc = tf.squared_difference(self.preds_adhoc, targets)

    elif self.loss == 'poisson':
        loss_op = tf.nn.log_poisson_loss(
            targets, tf.log(self.preds_op), compute_full_loss=True)
        loss_adhoc = tf.nn.log_poisson_loss(
            targets, tf.log(self.preds_adhoc), compute_full_loss=True)

    elif self.loss == 'negative_binomial':
        # define overdispersion alphas
        self.alphas = tf.get_variable(
            'alphas',
            shape=[self.num_targets],
            initializer=tf.constant_initializer(-5),
            dtype=tf.float32)
        self.alphas = tf.nn.softplus(tf.clip_by_value(self.alphas, -50, 50))
        tf.summary.histogram('alphas', self.alphas)
        for ti in np.linspace(0, self.num_targets - 1, 10).astype('int'):
            tf.summary.scalar('alpha_t%d' % ti, self.alphas[ti])

        # compute w/ inverse
        k = 1. / self.alphas

        # expand k
        k_expand = tf.tile(k, [self.batch_size * seq_length])
        k_expand = tf.reshape(
            k_expand, (self.batch_size, seq_length, self.num_targets))

        # expand lgamma(k)
        lgk_expand = tf.tile(tf.lgamma(k), [self.batch_size * seq_length])
        lgk_expand = tf.reshape(
            lgk_expand, (self.batch_size, seq_length, self.num_targets))

        # construct loss
        loss1 = targets * tf.log(self.preds_op / (self.preds_op + k_expand))
        loss2 = k_expand * tf.log(k_expand / (self.preds_op + k_expand))
        loss3 = tf.lgamma(targets + k_expand) - lgk_expand
        loss_op = -(loss1 + loss2 + loss3)

        # adhoc
        loss1 = targets * tf.log(self.preds_adhoc / (self.preds_adhoc + k_expand))
        loss2 = k_expand * tf.log(k_expand / (self.preds_adhoc + k_expand))
        loss_adhoc = -(loss1 + loss2 + loss3)

    elif self.loss == 'negative_binomial_hilbe':
        # define overdispersion alphas
        self.alphas = tf.get_variable(
            'alphas',
            shape=[self.num_targets],
            initializer=tf.constant_initializer(-5),
            dtype=tf.float32)
        self.alphas = tf.exp(tf.clip_by_value(self.alphas, -50, 50))

        # expand
        alphas_expand = tf.tile(self.alphas, [self.batch_size * seq_length])
        alphas_expand = tf.reshape(
            alphas_expand, (self.batch_size, seq_length, self.num_targets))

        # construct loss
        loss1 = targets * tf.log(self.preds_op)
        loss2 = (alphas_expand * targets + 1) / alphas_expand
        loss3 = tf.log(alphas_expand * self.preds_op + 1)
        loss_op = -loss1 + loss2 * loss3

        # adhoc
        loss1 = targets * tf.log(self.preds_adhoc)
        loss3 = tf.log(alphas_expand * self.preds_adhoc + 1)
        loss_adhoc = -loss1 + loss2 * loss3

    elif self.loss == 'gamma':
        # jchan document
        loss_op = targets / self.preds_op + tf.log(self.preds_op)
        loss_adhoc = targets / self.preds_adhoc + tf.log(self.preds_adhoc)

    elif self.loss == 'cross_entropy':
        loss_op = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=(targets - 1), logits=self.preds_op)
        loss_adhoc = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=(targets - 1), logits=self.preds_adhoc)

    else:
        print('Cannot identify loss function %s' % self.loss)
        exit(1)

    # set NaN's to zero
    # loss_op = tf.boolean_mask(loss_op, tf.logical_not(self.targets_na[:,tstart:tend]))

    # reduce losses by batch and position
    loss_op = tf.reduce_mean(loss_op, axis=[0, 1], name='target_loss')
    loss_op = tf.check_numerics(loss_op, 'Invalid loss', name='loss_check')
    loss_adhoc = tf.reduce_mean(loss_adhoc, axis=[0, 1], name='target_loss_adhoc')
    tf.summary.histogram('target_loss', loss_op)
    for ti in np.linspace(0, self.num_targets - 1, 10).astype('int'):
        tf.summary.scalar('loss_t%d' % ti, loss_op[ti])
    self.target_losses = loss_op
    self.target_losses_adhoc = loss_adhoc

    # define target sigmas
    """
    self.target_sigmas = tf.get_variable('target_sigmas',
        shape=[self.num_targets],
        initializer=tf.constant_initializer(2),
        dtype=tf.float32)
    self.target_sigmas = tf.nn.softplus(tf.clip_by_value(self.target_sigmas, -50, 50))
    tf.summary.histogram('target_sigmas', self.target_sigmas)
    for ti in np.linspace(0, self.num_targets - 1, 10).astype('int'):
        tf.summary.scalar('sigma_t%d' % ti, self.target_sigmas[ti])
    # self.target_sigmas = tf.ones(self.num_targets) / 2.
    """

    # dot losses target sigmas
    # loss_op = loss_op / (2*self.target_sigmas)
    # loss_adhoc = loss_adhoc / (2*self.target_sigmas)

    # fully reduce
    loss_op = tf.reduce_mean(loss_op, name='loss')
    loss_adhoc = tf.reduce_mean(loss_adhoc, name='loss_adhoc')

    # add extraneous terms
    loss_op += self.weights_regularizers  # + tf.reduce_mean(tf.log(self.target_sigmas))
    loss_adhoc += self.weights_regularizers  # + tf.reduce_mean(tf.log(self.target_sigmas))

    # track
    tf.summary.scalar('loss', loss_op)
    self.targets_op = targets
    return loss_op, loss_adhoc
def get_train_examples(self, data_dir): """See base class.""" examples = [] train_df = [] with ZipFile('sampleDir.zip', 'r') as zipObj: zipObj.extractall() train_df = pd.read_json("simplified-nq-train.jsonl", orient = 'records', lines = True) print('Our dataset have {} rows and {} columns'.format(df.shape[0], df.shape[1])) gc.collect() for i_main, row in train.iterrows(): document_text = row['document_text'].split() question_text = row['question_text'] for candidate_no, long_answer_candidate in enumerate(row['long_answer_candidates']): target_conv3 = [0] * FLAGS.cont_len target_conv6 = [0] * FLAGS.cont_len target_present = [0] * FLAGS.cont_len q_mask = [1] * FLAGS.ques_len c_mask = [1] * FLAGS.cont_len long_ans_start_tok = long_answer_candidate['start_token'] long_ans_end_tok = long_answer_candidate['end_token'] long_cand_length = long_ans_end_tok - long_ans_start_tok if long_cand_length > FLAGS.cont_len: long_sentence = " ".join(document_text[long_ans_start_tok:long_ans_start_tok + FLAGS.cont_len) else: long_sentence = " ".join(document_text[long_ans_start_tok:long_ans_end_tok) for i in range(long_cand_length+1,FLAG.cont_len): c_mask[i] = 0 if long_ans_start_tok == row['annotations'][0]['long_answer']['start_token'] and \ len(row['annotations'][0]['short_answers']) > 0: #print("this is correct long answer") short_answer_start_token = row['annotations'][0]['short_answers'][0]['start_token'] short_answer_end_token = row['annotations'][0]['short_answers'][0]['end_token'] short_start_idx = short_answer_start_token-long_ans_start_tok short_end_idx = short_answer_end_token-long_ans_start_tok if short_end_idx < cont_len: target_start[short_start_idx] = 1 target_end[short_end_idx] = 1 for i in range(short_start_idx,short_end_idx): target_present[i] = 1 else: smth = "short answer beyond maximum len" ques_length = len(question_text.split()) if ques_length < FLAGS.ques_len: for i in range(ques_length+1,FLAGS.ques_len): q_mask[i] = 0 guid = "train-%d" % (i_main) text_a = tokenization.convert_to_unicode(long_sentence) text_b = tokenization.convert_to_unicode(question_text) target_start = tokenization.convert_to_unicode(target_start) target_end = tokenization.convert_to_unicode(target_end) target_present = tokenization.convert_to_unicode(target_present) q_mask = tokenization.convert_to_unicode(q_mask) c_len = tokenization.convert_to_unicode(c_mask) examples.append(InputExample(guid=guid, text_a=text_a,\ text_b=text_b, target_start=target_start, target_end=target_end, target_present=target_present, q_mask=q_mask, c_mask=c_mask)) return examples def get_labels(self): """See base class.""" return ["target_conv3", "target_conv6", "target_present"] def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): return InputFeatures( input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, target_conv3 = [0]*FLAGS.cont_len, target_conv6 = [0]*FLAGS.cont_len, target_present = [0]*FLAGS.cont_len, q_mask = [0]*FLAGS.ques_len c_mask = [0]*FLAGS.cont_len is_real_example=False) label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) #We need exact length to later build the BIDAF tokens_a = tokens_a[0:FLAGS.cont_len] tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) #We need exact length to later build the BIDAF tokens_b = 
tokens_b[0:FLAGS.cont_len] if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length #The following 3 lines are redundant; just for convention label_conv3 = example.target_conv3 label_conv6 = example.target_conv6 label_present = example.target_present c_mask = example.c_mask q_mask = example.q_mask if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label_start: (id = %d)" % (label_conv3)) tf.logging.info("label_end: (id = %d)" % (label_conv6)) tf.logging.info("label_present: (id = %d)" % (label_present)) feature = InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, target_conv3 = label_conv3, target_conv6 = label_conv6, target_present = label_present, c_mask = c_mask, q_mask = q_mask, is_real_example=True) return feature def file_based_convert_examples_to_features( examples, label_list, max_seq_length, tokenizer, output_file): """Convert a set of `InputExample`s to a TFRecord file.""" writer = tf.python_io.TFRecordWriter(output_file) for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer) def create_int_feature(values): f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) return f features = collections.OrderedDict() features["input_ids"] = create_int_feature(feature.input_ids) features["input_mask"] = create_int_feature(feature.input_mask) features["segment_ids"] = create_int_feature(feature.segment_ids) features["target_start_ids"] = create_int_feature([feature.target_conv3]) features["target_end_ids"] = create_int_feature([feature.target_conv6]) features["target_present_ids"] = create_int_feature([feature.target_present]) features["q_mask"] = create_int_feature([feature.q_mask]) features["c_mask"] = create_int_festure([feature.c_mask]) features["is_real_example"] = create_int_feature( [int(feature.is_real_example)]) tf_example = tf.train.Example(features=tf.train.Features(feature=features)) writer.write(tf_example.SerializeToString()) writer.close() def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder): """Creates an `input_fn` closure to be passed to TPUEstimator.""" name_to_features = { "input_ids": tf.FixedLenFeature([seq_length], tf.int64), "input_mask": tf.FixedLenFeature([seq_length], tf.int64), "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), "target_start_ids": tf.FixedLenFeature([], tf.int64), "target_end_ids": tf.FixedLenFeature([], tf.int64), "target_present_ids": tf.FixedLenFeature([], tf.int64), "c_mask": tf.FixedLenFeature([], tf.int64), "q_mask": tf.FixedLenFeature([], tf.int64), "is_real_example": tf.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" example = tf.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. 
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example def input_fn(params): """The actual input function.""" batch_size = params["batch_size"] # For training, we want a lot of parallel reading and shuffling. # For eval, we want no shuffling and parallel reading doesn't matter. d = tf.data.TFRecordDataset(input_file) if is_training: d = d.repeat() d = d.shuffle(buffer_size=100) d = d.apply( tf.contrib.data.map_and_batch( lambda record: _decode_record(record, name_to_features), batch_size=batch_size, drop_remainder=drop_remainder)) return d return input_fn def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" # This is a simple heuristic which will always truncate the longer sequence # one token at a time. This makes more sense than truncating an equal percent # of tokens from each, since if one sequence is very short then each token # that's truncated likely contains more information than a longer sequence. while True: total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break if len(tokens_a) > len(tokens_b): tokens_a.pop() else: tokens_b.pop() def masked_softmax(logits, mask, dim): """ Takes masked softmax over given dimension of logits. Discards padded entries with e^(-inf). Inputs: logits: Numpy array. We want to take softmax over dimension dim. mask: Numpy array of same shape as logits. Has 1s where there's real data in logits, 0 where there's padding dim: int. dimension over which to take softmax Returns: masked_logits: Numpy array same shape as logits. This is the same as logits, but with 1e30 subtracted (i.e. very large negative number) in the padding locations. prob_dist: Numpy array same shape as logits. The result of taking softmax over masked_logits in given dimension. Should be 0 in padding locations. Should sum to 1 over given dimension. """ exp_mask = (1 - tf.cast(mask, 'float')) * (-1e30) # -large where there's padding, 0 elsewhere masked_logits = tf.add(logits, exp_mask) # where there's padding, set logits to -large prob_dist = tf.nn.softmax(masked_logits, dim) return masked_logits, prob_dist def cnn_output_width(input_width, kernel_size, padding_amount, strides): return (input_width - kernel_size + 2*padding_amount) / strides + 1 def deconv_output_shape(input_batch_size, input_size_w, output_channel_size, padding): output_size_h = 1 stride = 2 filter_size_w = 2 if padding == 'VALID': output_size_w = (input_size_w - 1)*stride + filter_size_w elif padding == 'SAME': output_size_w = (input_size_w - 1)*stride + 1 else: raise ValueError("unknown padding") output_shape = tf.stack([input_batch_size, output_size_h, output_size_w, output_channel_size]) return output_shape def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, target_conv3, target_conv6, target_present, q_mask, c_mask, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. 
output_layer = model.get_sequence_output() output_layer_shape = modeling.get_shape_list(output_layer, expected_rank=3) batch_size = output_layer[0] seq_length = output_layer[1] hidden_size = output_layer[2] hidden_size = output_layer.shape[-1].value SW_weights = tf.get_variable( "similarity_weights", [1, 3*hidden_size], initializer=tf.contrib.layers.xavier_initializer()) c = output_layer[:,1:FLAGS.cont_len+1,:] #do not count the [CLS] q = output_layer[:,FLAGS.cont_len+2:-2,:] #do not count the [SEP] and [SEP] # Hidden size = 2h by convention c_expand = tf.expand_dims(c,2) #[B,N,1,2h] q_expand = tf.expand_dims(q,1) #[B,1,M,2h] c_pointWise_q = c_expand * q_expand #[B,N,M,2h] c_input = tf.tile(c_expand, [1, 1, tf.shape(q)[1], 1]) #fill in to get same dims q_input = tf.tile(q_expand, [1, tf.shape(c)[1], 1, 1]) concat_input = tf.concat([c_input, q_input, c_pointWise_q], -1) # [B,N,M,6h] similarity=tf.reduce_sum(concat_input * self.S_W, axis=3) #[B,N,M] # q_mask shape [B,M] # c_mask shape [B,N] similarity_mask = tf.expand_dims(q_mask, 1) # [B, 1, M] similarity_mask = tf.tile(similarity_mask, [1,tf.shape(c)[1],1]) # [B, N, M] _, c2q_dist = masked_softmax(similarity, similarity_mask, 2) # shape (B, N, M). take softmax over q c2q = tf.matmul(c2q_dist, q) # shape (B, N, 2h) S_max = tf.reduce_max(similarity, axis=2) # shape (B, N) ; reminder N = cont_len _, c_dash_dist = masked_softmax(S_max, c_mask, 1) # distribution of shape (B, N) c_dash_dist_expand = tf.expand_dims(c_dash_dist, 1) # shape (B, 1, N) c_dash = tf.matmul(c_dash_dist_expand, c) # shape (B, 1, 2h) c_c2q = c * c2q # shape (B, N, 2h) c_dash = tf.tile(c_dash, [1,tf.shape(c)[1],1]) # [B, N, 2h] c_c_dash = c * c_dash # shape (B, N, 2h) output = tf.concat([c2q, c_c2q, c_c_dash], axis=2) # (B, N, 2h * 3) output = tf.nn.dropout(output, 0.9) blended_reps = tf.concat([c, output], axis=2) # (B, N, 8h) ### ADD MODELING LAYER .. but first add some more data pooled_output = model.get_pooled_output() # Shape (B, 2h) pooled_exp = tf.expand_dims(pooled_output, 1) # shape (B, 1, 2h) pooled_tile = tf.tile(pooled_tile, [1, FLAGS.cont_len, 1]) # shape (B, cont_len, 2h) model_input = tf.concat([blended_reps, pooled_tile], 2) # shape (B, cont_len, 10h) # we will go two different routes. targets_conv will come from convolution layers and target_present from lstm.. 
# the following is route 1: fw_cell = tf.nn.rnn_cell.BasicLSTMCell(256) bw_cell = tf.nn.rnn_cell.BasicLSTMCell(256) rnn_outputs, rnn_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, inputs=model_input, sequence_length=FLAGS.cont_len, dtype=tf.float64) rnn_outputs = tf.concat(rnn_outputs, 2) # Shape (B, cont_len, 256*2) rnn_outputs = tf.relu(rnn_outputs) # Now copying from run_nq.py rnn_output_weights = tf.get_variable( "rnn_output_w", [1, 256], initializer=tf.truncated_normal_initializer(stddev=0.02)) rnn_outout_bias = tf.get_variable( "rnn_output_b", [1], initializer=tf.zeros_initializer()) rnn_outputs = tf.reshape(rnn_outputs, [batch_size*FLAGS.cont_len, hidden_size]) # shape [B*N, 2h] rnn_logits = tf.matmul(rnn_outputs, rnn_output_weights, transpose_b=True) # shape [B*N, 1] rnn_logits = tf.nn.bias_add(rnn_logits, rnn_output_bias) # shape [B*N, 1] rnn_logits = tf.reshape(rnn_logits, [batch_size, FLAGS.cont_len, 1]) #shape [B, N, 1] rnn_logits = tf.squeeze(rnn_logits, axis=2) #shape [B, N] rnn_preds = tf.sigmoid(rnn_logits) rnn_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets_present, logits=rnn_logits) # Now Route 2: Convolutions # Expand dims to make it a 3D for the convolution: conv_input = tf.expand_dims(model_input, axis=1) # Change the shape to [B, 1, cont_len, 5*emb_size] #U-NET downladder filters filter1 = tf.get_variable("conv1_filter", shape=[1, 3, hidden_size*5, 64]) # [h, w, in_size, out_size] filter2 = tf.get_variable("conv2_filter", shape=[1, 3, 64, 64]) filter3 = tf.get_variable("conv3_filter", shape=[1, 3, 64, 128]) filter4 = tf.get_variable("conv4_filter", shape=[1, 3, 128, 128]) filter5 = tf.get_variable("conv5_filter", shape=[1, 3, 128, 256]) filter6 = tf.get_variable("conv6_filter", shape=[1, 3, 256, 256]) #U-NET upladder filters up6_filter = tf.get_variable("up6_filter", shape=[1, 2, 256, 256]) filter7 = tf.get_variable("conv3_filter", shape=[1, 3, 256, 256]) up7_filter = tf.get_variable("up6_filter", shape=[1, 2, 384, 384]) filter8 = tf.get_variable("conv3_filter", shape=[1, 3, 448, 448]) filter9 = tf.get_variable("conv3_filter", shape=[1, 3, 448, 1]) # Output shapes based on default cont_len 350 conv1 = tf.nn.conv2d(conv_input, filter=filter1, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 348, 64] conv1 = tf.nn.relu(conv1) conv2 = tf.nn.conv2d(conv2, filter=filter2, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 346, 64] conv2 = tf.nn.relu(conv2) maxp2 = tf.nn.max_pool(conv2, ksize=[1, 1, 2, 1], strides=[1, 1, 1, 1], padding='VALID') # shape [B, 1, 178, 64] conv3 = tf.nn.conv2d(maxp2, filter=filter3, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 176, 128] conv3 = tf.nn.relu(conv3) conv4 = tf.nn.conv2d(conv4, filter=filter4, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 174, 128] conv4 = tf.nn.relu(conv4) maxp4 = tf.nn.max_pool(conv4, ksize=[1, 1, 2, 1], strides=[1, 1, 1, 1], padding='VALID') # shape [B, 1, 87, 128] conv5 = tf.nn.conv2d(maxp4, filter=filter5, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 85, 256] conv5 = tf.nn.relu(conv5) conv6 = tf.nn.conv2d(conv6, filter=filter4, strides=[1, 1, 1, 1], padding="VALID") # shape [B, 1, 83, 256] conv6 = tf.nn.relu(conv6) up6_output_shape = deconv_output_shape(conv6.shape[0], conv6.shape[2], conv6.shape[3], "VALID") conv6_up = tf.nn.conv2d_transpose(conv6, filters = up6_filter, output_shape = up6_output_shape, strides = [1, 1, 1, 1], padding = "VALID") # shape [B, 1, 166, 256] # Convolve until shape is equal to conv4 (174). 
Use padding = SAME to increase width. padding = [[0,0],[0,0],[3,3],[0,0]] conv6_padded = tf.pad(conv6,paddings,"CONSTANT") # shape [B, 1, 172, 256] conv7 = tf.nn.conv2d(conv6_padded, filter=filter7, strides=[1, 1, 1, 1], padding="SAME") # shape [B, 1, 174, 256] conv7 = tf.nn.relu(conv7) conc_4n7 = tf.concat([conv4, conv7], -1) # [B, 1 , 174, 384] up7_output_shape = deconv_output_shape(conc_4n7.shape[0], conc_4n7.shape[2], conc_4n7.shape[3], "VALID") conv7_up = tf.nn.conv2d_transpose(conc_4n7, filters = up7_filter, output_shape = up7_output_shape, strides = [1, 1, 1, 1], padding = "VALID") # shape [B, 1, 348, 384] conc_7n1 = tf.concat([conv7_up, conv1], -1) # [B, 1 , 348, 448] conv8 = tf.nn.conv2d(conc_7n1, filter=filter8, strides=[1, 1, 1, 1], padding="SAME") # shape [B, 1, 350, 1] conv_logits = tf.squeeze(conv8, axis = 3) # shape [B, 1, 350] conv_logits = tf.squeeze(conv_logits, axis = 1) # shape [B, cont_len] conv_preds = tf.nn.sigmoid(conv_logits) conv_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_present, logits=conv_logits) with tf.variable_scope("loss"): total_loss = rnn_loss + conv_loss return (total_loss, rnn_preds, conv_preds) def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): """Returns `model_fn` closure for TPUEstimator.""" # This is the most confusing one. Note that “labels” are not passed on by the model_fn_builder. # They are actually passed on inside tpu_estimator when it calls the model_fn. We don’t see how. # Apparently we need to treat labels as per example, not per batch (to be confirmed). def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] target_start_ids = features["target_start_ids"] target_end_ids = features["target_end_ids"] target_present_ids = features["target_present_ids"] q_mask = features["q_mask"] c_mask = features["c_mask"] is_real_example = None if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf.estimator.ModeKeys.TRAIN) (total_loss, probabilities) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, target_start_ids, target_end_ids, target_present_ids, q_mask, c_mask, num_labels, use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = 
tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": probabilities}, scaffold_fn=scaffold_fn) else: raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec return model_fn # This function is not used by this file but is still used by the Colab and # people who depend on it. def input_fn_builder(features, seq_length, is_training, drop_remainder): """Creates an `input_fn` closure to be passed to TPUEstimator.""" all_input_ids = [] all_input_mask = [] all_segment_ids = [] all_target_start_ids = [] all_target_end_ids = [] all_target_present_ids = [] all_q_mask = [] all_c_mask = [] for feature in features: all_input_ids.append(feature.input_ids) all_input_mask.append(feature.input_mask) all_segment_ids.append(feature.segment_ids) all_target_start_ids.append(feature.target_start) all_target_end_ids.append(feature.target_end) all_target_present_ids.append(feature.target_present) all_q_mask.append(feature.q_mask) all_c_mask.append(feature.c_mask) def input_fn(params): """The actual input function.""" batch_size = params["batch_size"] num_examples = len(features) # This is for demo purposes and does NOT scale to large data sets. We do # not use Dataset.from_generator() because that uses tf.py_func which is # not TPU compatible. The right way to load data is with TFRecordReader. d = tf.data.Dataset.from_tensor_slices({ "input_ids": tf.constant( all_input_ids, shape=[num_examples, seq_length], dtype=tf.int32), "input_mask": tf.constant( all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32), "segment_ids": tf.constant( all_segment_ids, shape=[num_examples, seq_length], dtype=tf.int32), "target_start_ids": tf.constant( all_target_start_ids, shape=[num_examples, seq_length], dtype=tf.int32), "segment_ids": tf.constant( all_target_end_ids, shape=[num_examples,seq_length], dtype=tf.int32), "segment_ids": tf.constant( all_target_present_ids, shape=[num_examples, seq_length], dtype=tf.int32), }) if is_training: d = d.repeat() d = d.shuffle(buffer_size=100) d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) return d return input_fn # This function is not used by this file but is still used by the Colab and # people who depend on it. 
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): """Convert a set of `InputExample`s to a list of `InputFeatures`.""" features = [] for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer) features.append(feature) return features def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "": KeplerProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples if __name__ == "__main__": flags.mark_flag_as_required("data_dir") flags.mark_flag_as_required("task_name") flags.mark_flag_as_required("vocab_file") flags.mark_flag_as_required("bert_config_file") flags.mark_flag_as_required("output_dir") tf.app.run()
import numpy as np
import tensorflow as tf

N = 20
D = 3
D1 = 4
D2 = 5
X_train = np.random.rand(N, D)
X_train = 2 * X_train + 3

x = tf.placeholder(tf.float64, shape=[None, D], name='x-input')

mu1 = 0.0
mu2 = mu1
muC = mu2
std1 = 0.1
std2 = std1
stdC = std2
const1 = 0.1
const2 = const1

W1 = tf.Variable(tf.truncated_normal(shape=[D, D1], mean=mu1, stddev=std1, dtype=tf.float64))
b1 = tf.Variable(tf.constant(const1, shape=[D1], dtype=tf.float64))
# W2 maps the D1-wide hidden layer to D2 outputs, matching b2
W2 = tf.Variable(tf.truncated_normal(shape=[D1, D2], mean=mu2, stddev=std2, dtype=tf.float64))
b2 = tf.Variable(tf.constant(const2, shape=[D2], dtype=tf.float64))
C = tf.Variable(tf.truncated_normal(shape=[D, D1], mean=muC, stddev=stdC, dtype=tf.float64))

z1 = tf.matmul(x, W1) + b1
a1 = tf.nn.relu(z1)
z2 = tf.matmul(a1, W2) + b2
a2 = tf.nn.relu(z2)

with tf.Session() as sess:
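    # The original snippet stops at the `with` line; the continuation below is
    # a minimal sketch (an assumption): initialise the variables and run the
    # two-layer forward pass on the training data.
    sess.run(tf.global_variables_initializer())
    a2_val = sess.run(a2, feed_dict={x: X_train})
    print(a2_val.shape)  # (20, 5)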
def relu_kernel(self, x):
    # positive part of the offsets between x and the dictionary entries
    return tf.nn.relu(tf.expand_dims(x, axis=self.unsqueeze_dim) - self.dict)
def activation(x):
    return tf.nn.relu(x)
def build_predict( self, inputs, reverse_preds=None, embed_penultimate=False, target_subset=None, save_reprs=False, ): """Construct per-location real-valued predictions.""" assert inputs is not None print("Targets pooled by %d to length %d" % (self.hp.target_pool, self.hp.seq_length // self.hp.target_pool)) if self.hp.augment_mutation > 0: # sample mutation binary mask across sequences mut_mask_probs = self.hp.augment_mutation * np.ones( (self.hp.seq_length, 1)) mut_mask_dist = tfp.distributions.Bernoulli(probs=mut_mask_probs, dtype=tf.float32) mut_mask = mut_mask_dist.sample(tf.shape(inputs)[0]) # sample random nucleotide for mutations mut_1hot_probs = 0.25 * np.ones((self.hp.seq_length, 4)) mut_1hot_dist = tfp.distributions.OneHotCategorical( probs=mut_1hot_probs, dtype=tf.float32) mut_1hot = mut_1hot_dist.sample(tf.shape(inputs)[0]) # modify sequence inputs_mut = inputs - mut_mask * inputs + mut_mask * mut_1hot inputs = tf.cond(self.is_training, lambda: inputs_mut, lambda: inputs) ################################################### # convolution layers ################################################### filter_weights = [] layer_reprs = [inputs] seqs_repr = inputs for layer_index in range(self.hp.cnn_layers): with tf.variable_scope("cnn%d" % layer_index, reuse=tf.AUTO_REUSE): # convolution block args_for_block = self._make_conv_block_args( layer_index, layer_reprs) seqs_repr = layers.conv_block(seqs_repr=seqs_repr, **args_for_block) # save representation layer_reprs.append(seqs_repr) if save_reprs: self.layer_reprs = layer_reprs # final nonlinearity if self.hp.nonlinearity == "relu": seqs_repr = tf.nn.relu(seqs_repr) elif self.hp.nonlinearity == "gelu": seqs_repr = tf.nn.sigmoid(1.702 * seqs_repr) * seqs_repr else: print('Unrecognized nonlinearity "%s"' % self.hp.nonlinearity, file=sys.stderr) exit(1) ################################################### # slice out side buffer ################################################### # update batch buffer to reflect pooling seq_length = seqs_repr.shape[1].value pool_preds = self.hp.seq_length // seq_length assert self.hp.batch_buffer % pool_preds == 0, ( "batch_buffer %d not divisible" " by the CNN pooling %d") % (self.hp.batch_buffer, pool_preds) batch_buffer_pool = self.hp.batch_buffer // pool_preds # slice out buffer seq_length = seqs_repr.shape[1] seqs_repr = seqs_repr[:, batch_buffer_pool:seq_length - batch_buffer_pool, :] seq_length = seqs_repr.shape[1] ################################################### # final layer ################################################### if embed_penultimate: final_repr = seqs_repr else: with tf.variable_scope("final", reuse=tf.AUTO_REUSE): final_filters = self.hp.sum_targets * self.hp.target_classes final_repr = tf.layers.dense( inputs=seqs_repr, units=final_filters, activation=None, kernel_initializer=tf.variance_scaling_initializer( scale=2.0, mode="fan_in"), kernel_regularizer=tf.contrib.layers.l1_regularizer( self.hp.final_l1_scale), ) print("Convolution w/ %d %dx1 filters to final targets" % (final_filters, seqs_repr.shape[2])) if target_subset is not None: # get convolution parameters filters_full = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, "final/dense/kernel")[0] bias_full = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, "final/dense/bias")[0] # subset to specific targets filters_subset = tf.gather(filters_full, target_subset, axis=1) bias_subset = tf.gather(bias_full, target_subset, axis=0) # substitute a new limited convolution final_repr = tf.tensordot(seqs_repr, filters_subset, 1) 
final_repr = tf.nn.bias_add(final_repr, bias_subset) # update # targets self.hp.sum_targets = len(target_subset) # expand length back out if self.hp.target_classes > 1: final_repr = tf.reshape( final_repr, (-1, seq_length, self.hp.sum_targets, self.hp.target_classes), ) # transform for reverse complement if reverse_preds is not None: final_repr = tf.cond( reverse_preds, lambda: tf.reverse(final_repr, axis=[1]), lambda: final_repr, ) ################################################### # link function ################################################### if embed_penultimate: predictions = final_repr else: # work-around for specifying my own predictions # self.preds_adhoc = tf.placeholder( # tf.float32, shape=final_repr.shape, name='preds-adhoc') # float 32 exponential clip max exp_max = 50 # choose link if self.hp.link in ["identity", "linear"]: predictions = tf.identity(final_repr, name="preds") elif self.hp.link == "relu": predictions = tf.relu(final_repr, name="preds") elif self.hp.link == "exp": final_repr_clip = tf.clip_by_value(final_repr, -exp_max, exp_max) predictions = tf.exp(final_repr_clip, name="preds") elif self.hp.link == "exp_linear": predictions = tf.where( final_repr > 0, final_repr + 1, tf.exp(tf.clip_by_value(final_repr, -exp_max, exp_max)), name="preds", ) elif self.hp.link == "softplus": final_repr_clip = tf.clip_by_value(final_repr, -exp_max, 10000) predictions = tf.nn.softplus(final_repr_clip, name="preds") else: print("Unknown link function %s" % self.hp.link, file=sys.stderr) exit(1) # clip if self.hp.target_clip is not None: predictions = tf.clip_by_value(predictions, 0, self.hp.target_clip) # sqrt if self.hp.target_sqrt: predictions = tf.sqrt(predictions) return predictions
def build_model(self):
    self.user = tf.placeholder(shape=[None, ], dtype=tf.int32)
    self.item = tf.placeholder(shape=[None, ], dtype=tf.int32)
    self.text = tf.placeholder(shape=[None, self.review_length], dtype=tf.int32)
    self.rating = tf.placeholder(shape=[None, ], dtype=tf.float32)
    # boolean switch with a default value; selected with tf.cond below since it is a tensor
    self.phrase = tf.placeholder_with_default(False, shape=[], name='phrase')

    with tf.name_scope('embedding/word_embedding'):
        word_embedding = tf.Variable(
            tf.random_uniform(shape=[self.vocab_size, self.factor_num], minval=-0.1, maxval=0.1))
        context = tf.nn.embedding_lookup(word_embedding, self.text)
    with tf.name_scope('embedding/user_embedding'):
        user_embedding = tf.Variable(
            tf.random_uniform(shape=[self.user_num, self.factor_num], minval=-0.1, maxval=0.1))
        uvec = tf.nn.embedding_lookup(user_embedding, self.user)
    with tf.name_scope('embedding/item_embedding'):
        # indexed by item ids, so the table size is the number of items
        # (assuming the class defines self.item_num)
        item_embedding = tf.Variable(
            tf.random_uniform(shape=[self.item_num, self.factor_num], minval=-0.1, maxval=0.1))
        ivec = tf.nn.embedding_lookup(item_embedding, self.item)

    # convolutional layers
    context = tf.expand_dims(context, axis=-1)  # None * review_length * factor_num * 1
    pools = []
    for size in self.filter_size:
        # conv2d filters are [height, width, in_channels, out_channels]
        filter_kernal = [size, self.factor_num, 1, self.filter_num]
        with tf.name_scope('conv_{}'.format(size)):
            filter_weights = tf.Variable(tf.random_normal(shape=filter_kernal, stddev=0.1))
            filter_biases = tf.Variable(tf.random_normal(shape=[self.filter_num], stddev=0.1))
            conv = tf.nn.conv2d(context, filter_weights, strides=[1, 1, 1, 1], padding='VALID')
            conv = tf.nn.bias_add(conv, filter_biases)
            pool_kernal = [1, self.review_length - size + 1, 1, 1]
            pool = tf.nn.max_pool(conv, ksize=pool_kernal, strides=[1, 1, 1, 1], padding='VALID')
            pools.append(pool)
    num_feature_total = self.filter_num * len(self.filter_size)
    pooled_total = tf.concat(pools, 3)
    pooled_total = tf.reshape(pooled_total, [-1, num_feature_total])

    # gates
    with tf.name_scope('gate/user_gate'):
        Wxcr = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wxur = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        Wxch = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wxuh = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        bxr = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        bxh = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        Wxcz = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wxuz = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        bxz = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        # tf.add_n takes a single list of tensors
        xr = tf.add_n([tf.matmul(pooled_total, Wxcr), tf.matmul(uvec, Wxur), bxr])
        xz = tf.add_n([tf.matmul(pooled_total, Wxcz), tf.matmul(uvec, Wxuz), bxz])
        uvec_hat = tf.tanh(tf.add_n([tf.matmul(pooled_total, Wxch),
                                     tf.multiply(xr, tf.matmul(uvec, Wxuh)), bxh]))
        # GRU-style update: blend the candidate with the original user embedding
        uvec_final = tf.multiply(xz, uvec_hat) + tf.multiply((1 - xz), uvec)

    with tf.name_scope('gate/item_gate'):
        Wycr = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wyir = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        Wych = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wyuh = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        byr = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        byh = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        Wycz = tf.Variable(tf.random_normal(shape=[num_feature_total, self.factor_num]))
        Wyiz = tf.Variable(tf.random_normal(shape=[self.factor_num, self.factor_num]))
        byz = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        # the item gate reads the item embedding
        yr = tf.add_n([tf.matmul(pooled_total, Wycr), tf.matmul(ivec, Wyir), byr])
        yz = tf.add_n([tf.matmul(pooled_total, Wycz), tf.matmul(ivec, Wyiz), byz])
        ivec_hat = tf.tanh(tf.add_n([tf.matmul(pooled_total, Wych),
                                     tf.multiply(yr, tf.matmul(ivec, Wyuh)), byh]))
        ivec_final = tf.multiply(yz, ivec_hat) + tf.multiply((1 - yz), ivec)

    # `phrase` is a tensor, so pick the branch with tf.cond rather than a Python `if`
    final = tf.cond(self.phrase,
                    lambda: tf.concat([uvec, ivec], axis=1),
                    lambda: tf.concat([uvec_final, ivec_final], axis=1))

    with tf.name_scope('full_connected'):
        W1 = tf.Variable(tf.random_normal(shape=[2 * self.factor_num, self.factor_num]))
        b1 = tf.Variable(tf.constant(0.0, shape=[self.factor_num]))
        W2 = tf.Variable(tf.random_normal(shape=[self.factor_num, 1]))
        b2 = tf.Variable(tf.constant(0.0, shape=[1]))
        f1 = tf.nn.relu(tf.add(tf.matmul(final, W1), b1))
        f2 = tf.nn.relu(tf.add(tf.matmul(f1, W2), b2))

    self.mse = tf.reduce_mean(tf.square(tf.subtract(tf.reduce_sum(f2, axis=1), self.rating)))
    self.mae = tf.reduce_mean(tf.abs(tf.subtract(tf.reduce_sum(f2, axis=1), self.rating)))
    self.opt = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.mse)

    init = tf.global_variables_initializer()
    self.sess.run(init)
def maxpool_layer_dual_objective(kernel_shape, strides, with_relu, mu_in, lam_out, lb, ub, nominal=None): """Calculates the contribution to the dual objective of an N-D max pool layer. Maximises (over y in [lb, ub]):: mu_l^T y - lam_l^T h_l(y) where `h` is the specified max pool operation. If `nominal` is not `None`, then inputs and maxima are interpreted relative to nominal inputs and outputs respectively, so we actually maximise:: mu_l^T y - lam_l^T (h_l(nominal+y) - h_l(nominal))`. This formulation only supports maxpools that cover the input space without gaps. Overlaps are permitted, although they will give rise to an overestimate of the dual objective rather than a tight value. Args: kernel_shape: Integer list of `[kernel_height, kernel_width]`, or `None` to aggregate over the layer`s entire spatial extent. strides: Integer list of `[vertical_stride, horizontal_stride]`. with_relu: Whether to apply `tf.nn.relu` to the maxpool. mu_in: (N+3)D tensor of shape (num_classes, batch_size, input_height, input_width, layer_channels) containing Lagrange multipliers for the neurons' linear calculations. lam_out: (N+3)D tensor of shape (num_classes, batch_size, output_height, output_width, layer_channels) containing Lagrange multipliers for the neurons' maxpool calculations. lb: (N+2)D tensor of shape (batch_size, input_height, input_width, layer_channels) containing lower bounds of the neurons' pre-maxpool values. ub: (N+2)D tensor of shape (batch_size, input_height, input_width, layer_channels) containing upper bounds of the neurons' pre-maxpool values. nominal: (N+2)D tensor of shape (batch_size, input_height, input_width, layer_channels) containing nominal input values. Inputs bounds are interpreted relative to these nominal values. Defaults to zero. Returns: 2D tensor of shape (num_classes, batch_size) containing dual objective contribution for each example. Raises: ValueError: if the pools overlap or have gaps. """ if nominal is not None: # TODO(stanforth) investigate a more numerically stable implementation res = maxpool_layer_dual_objective(kernel_shape, strides, with_relu, mu_in, lam_out, nominal + lb, nominal + ub) # Infer the nominal outputs. if kernel_shape is None: nominal_out = tf.reduce_max(nominal, axis=list( range(1, nominal.shape.ndims - 1))) else: nominal_out = tf.nn.max_pool(nominal, ksize=kernel_shape, padding='VALID', strides=([1] + strides + [1])) if with_relu: nominal_out = tf.relu(nominal_out) res -= tf.reduce_sum(mu_in * nominal, axis=list(range(2, mu_in.shape.ndims))) res += tf.reduce_sum(lam_out * nominal_out, axis=list(range(2, lam_out.shape.ndims))) return res # Search for maximum by branching over inputs (kernel elements). # Broadcast the tensors to match what `fn` will operate with, i.e. shape # (num_classes, batch_size, output_height, output_width, # kernel_height * kernel_width, layer_channels). num_classes = mu_in.shape[0].value batch_size = tf.shape(mu_in)[1] input_shape = mu_in.shape[2:].as_list() layer_channels = mu_in.shape[-1].value output_spatial_shape = lam_out.shape[2:-1].as_list() nd = lam_out.shape.ndims - 3 if kernel_shape is None: # Maxpool will be across the entire layer (in each channel). kernel_size = _prod(input_shape[:-1]) lb_bc = lb ub_bc = ub mu_bc = mu_in else: for i in range(len(kernel_shape)): if kernel_shape[i] < strides[i]: raise ValueError( 'The pools must tile the entire input space without gaps.') padding = 'VALID' # Determine the fan-out of each input, where the pools overlap. 
# Builds a tensor of shape (1, 1, input_height, input_width, 1) of the form # [[1,1,2,1,1], [1,1,2,1,1], [2,2,4,2,2], [1,1,2,1,1], [1,1,2,1,1]] # (illustrated here with 3x3 kernel with stride 2 on a 5x5 input). overlap = common.conv_reduce_sum(tf.ones( dtype=mu_in.dtype, shape=([1, 1] + output_spatial_shape + [1] + kernel_shape + [1])), input_shape, padding=padding, strides=strides) # Share mu values equally amongst pools where they overlap. mu_in /= overlap # Broadcast the bounds and mu vars where the kernel applications overlap. kernel_size = _prod(kernel_shape) lb_bc = common.conv_broadcast(lb, kernel_shape, padding=padding, strides=strides) ub_bc = common.conv_broadcast(ub, kernel_shape, padding=padding, strides=strides) # Temporarily combine the (num_classes, batch_size) dimensions # while applying the broadcast to mu. mu_bc = tf.reshape(mu_in, shape=([num_classes * batch_size] + mu_in.shape[2:].as_list())) mu_bc = common.conv_broadcast(mu_bc, kernel_shape, padding=padding, strides=strides) # conv_broadcast has returned tensors of shape # (N, output_height, output_width, 1, kernel_height, kernel_width, C). lb_bc = tf.reshape(lb_bc, shape=([1, batch_size] + output_spatial_shape + [kernel_size, layer_channels])) ub_bc = tf.reshape(ub_bc, shape=([1, batch_size] + output_spatial_shape + [kernel_size, layer_channels])) mu_bc = tf.reshape( mu_bc, shape=([num_classes, batch_size] + output_spatial_shape + [kernel_size, layer_channels])) lb_bc += tf.zeros_like(mu_bc) ub_bc += tf.zeros_like(mu_bc) # Use the same lambda for each input. lam_bc = tf.expand_dims(lam_out, axis=(nd + 2)) # All xx_bc tensors are shaped as (class, N, H, W, i, C) # where i ranges over inputs (kernel elements). # To calculate for input (kernel element) i, we need to sum over inputs j. # Set up xx_i, xx_j tensors shaped as (class, N, H, W, i, j, C) # where i,j both range over inputs (kernel elements). # y_i = tf.expand_dims(y, nd+3) (will create inside `fn`) mu_j = tf.expand_dims(mu_bc, nd + 2) lb_j = tf.expand_dims(lb_bc, nd + 2) ub_j = tf.expand_dims(ub_bc, nd + 2) # Only consider j != i. mask = 1.0 - tf.expand_dims(tf.eye(kernel_size), -1) def fn(y): """Optimal dual objective, conditional on the value of the maxpool. For each input (kernel element) i, for the given y_i, maximises (over z_j in [lb_j, min{y_i, ub_j}] and constraining z_i=y_i):: mu^T z - lam y_i This will be infeasible if y_i < lb_j for some j, (also if y_i < 0 in the case of relu+maxpool), so maxpool cannot be attained at input i. The returned tensor is unspecified for such elements. Args: y: (N+4)D tensor of shape (num_classes, batch_size, output_height, output_width, kernel_height * kernel_width, layer_channels) containing, for each input (kernel element) i, a value of maxpool assumed to be attained at input i. Returns: Tensor of same shape as `y` containing, for each input (kernel element) i, the optimal value of the dual objective, conditional the maxpool being equal to `y` with the maximum attained at input i. """ y_i = tf.expand_dims(y, nd + 3) # Maximise sum_{j!=i} mu_j y_j where y_j <= y_i for all j!=i. obj = max_linear(mask * mu_j, lb_j, tf.minimum(ub_j, y_i), axis=(nd + 3)) return obj + (mu_bc - lam_bc) * y lb_max = tf.reduce_max(lb_bc, axis=(nd + 2), keepdims=True) if with_relu: lb_max = tf.maximum(lb_max, 0.) _, attained = common.concave_max_binsearch(fn, tf.zeros_like(lb_bc) + lb_max, ub_bc) # Filter out any infeasible choices of i. 
attained = tf.where( lb_max <= ub_bc, attained, tf.zeros_like(attained) + tf.reduce_min(attained, axis=(nd + 2), keepdims=True)) # Maximise over which input (kernel element) maximises the maxpool. per_neuron_objective = tf.reduce_max(attained, axis=(nd + 2)) if with_relu: # The relu+maxpool may additionally be 'maximised' by zero. # Calculate optimal dual objective, conditional on all y_i <= 0. # Maximise (over z_j in [lb_j, min{0, ub_j}]):: # mu^T z - lam 0 attained_zero = max_linear(mu_bc, lb_bc, tf.minimum(ub_bc, 0.), axis=(nd + 2)) # Filter out any infeasible cases. per_neuron_objective = tf.where( tf.squeeze(lb_max, axis=(nd + 2)) <= 0., tf.maximum(per_neuron_objective, attained_zero), per_neuron_objective) return tf.reduce_sum(per_neuron_objective, axis=list(range(2, per_neuron_objective.shape.ndims)))
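Purely as an illustration of the quantity `maxpool_layer_dual_objective` bounds, the sketch below does a brute-force grid search over a single two-element pool (no ReLU); the multipliers and bounds are made-up numbers, and the real function handles batched N-D tensors with overlapping kernels.

import numpy as np

mu = np.array([0.7, -0.3])   # Lagrange multipliers mu_l for the two inputs
lam = 1.2                    # Lagrange multiplier lam_l for the pool output
lb = np.array([-1.0, -0.5])  # lower bounds on the inputs
ub = np.array([1.0, 2.0])    # upper bounds on the inputs

# Maximise mu^T y - lam * max(y) over y in [lb, ub] by brute force.
grid = [np.linspace(l, u, 201) for l, u in zip(lb, ub)]
best = -np.inf
for y0 in grid[0]:
    for y1 in grid[1]:
        best = max(best, mu[0] * y0 + mu[1] * y1 - lam * max(y0, y1))
print('brute-force dual objective contribution:', best)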
import tensorflow as tf

# placeholder for input to the computation
x = tf.placeholder(dtype=tf.float32, shape=[None, 784], name="x")
# bias variable for the affine weight transformation
b = tf.Variable(tf.zeros(100))
# weight variable for the affine weight transformation, randomly initialized
W = tf.Variable(tf.random_uniform([784, 100], dtype=tf.float32))
# activation as a function of the affine transformation of the input
a = tf.nn.relu(tf.matmul(x, W) + b)
# cost computed as a function of the activation
# and the target optimization task
C = [...]

# Start a session to run the computational graph
session = tf.InteractiveSession()
# Initialize all variables; in this example the weight matrix
# and bias depend on an initialization
session.run(tf.global_variables_initializer())
for i in range(epochs):
    result = session.run(C, feed_dict={x: data[batch_indices]})
    print(i, result)
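The cost `C` is intentionally left abstract above. Purely as a hypothetical example of what could fill that slot (the `target` placeholder and the learning rate are invented for illustration, not part of the original):

# Hypothetical completion only: a squared-error cost against a target placeholder,
# with plain gradient descent as the optimization task.
target = tf.placeholder(dtype=tf.float32, shape=[None, 100], name="target")
C = tf.reduce_mean(tf.square(a - target))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(C)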
import tensorflow as tf
import numpy as np

tf.set_random_seed(777)

x_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
y_data = np.array([[0], [1], [1], [0]], dtype=np.float32)

# x, y, w, b, hypothesis, cost, train
X = tf.placeholder(tf.float32, shape=[None, 2])
y = tf.placeholder(tf.float32, shape=[None, 1])

# Random initialization: all-zero weights cannot break symmetry,
# so the network would never learn XOR.
w1 = tf.Variable(tf.random_normal([2, 100]), name='weight1')
b1 = tf.Variable(tf.zeros([100]), name='bias1')
layer1 = tf.nn.relu(tf.matmul(X, w1) + b1)

w2 = tf.Variable(tf.random_normal([100, 50]), name='weight2')
b2 = tf.Variable(tf.zeros([50]), name='bias2')
layer2 = tf.sigmoid(tf.matmul(layer1, w2) + b2)

w3 = tf.Variable(tf.random_normal([50, 1]), name='weight3')
b3 = tf.Variable(tf.zeros([1]), name='bias3')
hypothesis = tf.sigmoid(tf.matmul(layer2, w3) + b3)

# tf.matmul(x, w): with x of shape (5, 3) and w of shape (3, 1),
# the product has shape (5, 1).
# binary cross-entropy cost
cost = -tf.reduce_mean(y * tf.log(hypothesis) +
                       (1 - y) * tf.log(1 - hypothesis))
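A sketch of the usual continuation for this XOR setup (not the original code): a gradient-descent train op, a 0/1 accuracy check, and a short training loop over the four examples.

# Sketch only: training and evaluation for the network defined above.
train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, y), dtype=tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10001):
        sess.run(train, feed_dict={X: x_data, y: y_data})
    print(sess.run([hypothesis, accuracy], feed_dict={X: x_data, y: y_data}))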
def inference(self, input_images, sentences, embedding_dictionary, trainable, initialized_vgg_parameter_file=None): with tf.variable_scope(self.name): # extract image context feature tensor_trainable = tf.constant(trainable) vgg19 = VGG19('vgg19') vgg19_predict, vgg19_context = vgg19.inference( input_images, tensor_trainable, initialized_vgg_parameter_file) vgg19_context_shape = vgg19_context.get_shape().as_list() batch_size = vgg19_context_shape[0] vgg19_context_num = vgg19_context_shape[1] * vgg19_context_shape[2] vgg19_context_dim = vgg19_context_shape[3] vgg19_context_reshape = tf.reshape( vgg19_context, [-1, vgg19_context_num, vgg19_context_dim]) vgg19_context_reshape_mean = tf.reduce_mean( vgg19_context_reshape, 1) init_memory = vgg19_context_reshape_mean for i in xrange(_MLP_LAYER_NUMBER_): init_memory = _construct_full_connection_layer( init_memory, _RNN_HIDDEN_NUMER_, name='init_memory_fc' + str(i)) init_memory = tf.contrib.layers.batch_norm( init_memory, decay=self.momentum, updates_collections=None, epsilon=self.epsilon, scale=True, is_training=trainable, scope='init_memory_bn' + str(i)) init_lstm_output = vgg19_context_reshape_mean for i in xrange(_MLP_LAYER_NUMBER_): init_lstm_output = _construct_full_connection_layer( init_lstm_output, _RNN_HIDDEN_NUMER_, name='init_hidden_state_fc' + str(i)) init_lstm_output = tf.contrib.layers.batch_norm( init_lstm_output, decay=self.momentum, updates_collections=None, epsilon=self.epsilon, scale=True, is_training=trainable, scope='init_lstm_output_bn' + str(i)) lstm_state = tf.contrib.rnn.LSTMStateTuple(init_memory, init_lstm_output) lstm = tf.contrib.rnn.LSTMCell( _RNN_HIDDEN_NUMER_, initializer=tf.random_normal_initializer(stddev=0.03)) vgg19_context_flat = tf.reshape(vgg19_context_reshape, [-1, vgg19_context_dim]) max_sentence_length = sentences.get_shape().as_list()[-1] print max_sentence_length print 'aaaaaaaaa' dim_embed = embedding_dictionary.get_shape().as_list()[-1] word_number = embedding_dictionary.get_shape().as_list()[0] tensor_output = [] tensor_output_prob = [] for i in xrange(max_sentence_length): # attention mechanism context_encode1 = _construct_full_connection_layer( vgg19_context_flat, vgg19_context_dim, name='att_fc11') context_encode1 = tf.nn.relu(context_encode1) context_encode1 = tf.contrib.layers.batch_norm( context_encode1, decay=self.momentum, updates_collections=None, epsilon=self.epsilon, scale=True, is_training=trainable, scope='att_bn11' + str(i)) context_encode2 = _construct_full_connection_layer( lstm_state, vgg19_context_dim, name='att_fc21') context_encode2 = tf.nn.relu(context_encode2) context_encode2 = tf.contrib.layers.batch_norm( context_encode2, decay=self.momentum, updates_collections=None, epsilon=self.epsilon, scale=True, is_training=trainable, scope='att_bn21' + str(i)) context_encode2 = tf.tile(tf.expand_dims(context_encode2, 1), [1, vgg19_context_num, 1]) context_encode2 = tf.reshape(context_encode2, [-1, vgg19_context_dim]) context_encode = tf.relu(context_encode1 + context_encode2) context_encode = tf.cond( tensor_trainable, lambda: tf.nn.dropout(context_encode, 0.5), lambda: context_encode) attention = _construct_full_connection_layer(context_encode, 1, name='att_1') attention = tf.nn.relu(attention) attention = tf.reshape(attention, [-1, vgg19_context_num]) attention = tf.nn.softmax(attention) if i == 0: word_emb = tf.zeros([batch_size, dim_embed]) weighted_context = tf.identity(vgg19_context_reshape_mean) else: word_emb = tf.cond( is_train, lambda: tf.nn.embedding_lookup( 
embedding_dictionary, sentences[:, i - 1]),
                    lambda: word_emb)
                weighted_context = tf.reduce_sum(
                    vgg19_context_reshape * tf.expand_dims(attention, 2), 1)
                lstm_output, lstm_state = lstm(
                    tf.concat(1, [weighted_context, word_emb]), lstm_state)
                feature_concate = tf.concat(
                    1, [lstm_output, weighted_context, word_emb])
                output0 = _construct_full_connection_layer(feature_concate,
                                                           _LAST_FC_DIMENSION_,
                                                           name='output_fc1')
                output0 = tf.nn.tanh(output0)
                output0 = tf.cond(tensor_trainable,
                                  lambda: tf.nn.dropout(output0, 0.5),
                                  lambda: output0)
                output = _construct_full_connection_layer(output0, word_number)
                prob = tf.nn.softmax(output)
                tensor_output.append(output)
                tensor_output_prob.append(prob)
                max_prob_word = tf.argmax(output, 1)
                # At inference time, feed back the embedding of the most probable word.
                word_emb = tf.cond(
                    tensor_trainable, lambda: word_emb,
                    lambda: tf.nn.embedding_lookup(embedding_dictionary, max_prob_word))
                tf.get_variable_scope().reuse_variables()
            tensor_output = tf.pack(tensor_output, axis=1)
            tensor_output_prob = tf.pack(tensor_output_prob, axis=1)
            return tensor_output, tensor_output_prob
def conv3d(x, w, b, strides=[1, 1, 1, 1, 1], padding="SAME"):
    # Use the strides/padding arguments instead of hard-coded values, and
    # feed the convolution output (not the raw input) into the bias add.
    x = tf.nn.conv3d(x, w, strides=strides, padding=padding)
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)
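A hypothetical usage of the helper above (assuming TF 1.x), just to show the expected tensor ranks; the volume size and filter count are arbitrary.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 16, 16, 16, 1])          # batch of 16^3 volumes, 1 channel
w = tf.Variable(tf.truncated_normal([3, 3, 3, 1, 8], stddev=0.1))  # 3x3x3 kernel, 8 output filters
b = tf.Variable(tf.zeros([8]))
out = conv3d(x, w, b)  # -> shape (None, 16, 16, 16, 8) with "SAME" padding and stride 1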
inputs = tf.placeholder(tf.int64, [None, s_limit_len], name="inputs")
labels = tf.placeholder(tf.int64, [None, n_class], name="labels")
keep_prob = tf.placeholder(tf.float32)

# The first argument of tf.Variable must be an initial value, not a dtype.
embedding_W = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0),
                          name="embedding_w")
embedding_layer = tf.nn.embedding_lookup(embedding_W, inputs, name="embedding_layer")
# Add a channel dimension so the lookup result is a 4-D input for conv2d.
embedding_layer = tf.expand_dims(embedding_layer, -1)

# conv1
conv1_w = tf.Variable(tf.truncated_normal([1, embedding_size, 1, filter_nums[1]]))
conv1_b = tf.Variable(tf.constant(0.1))
conv1 = tf.nn.relu(
    tf.nn.conv2d(embedding_layer, conv1_w, [1, 1, 1, 1], padding="VALID") + conv1_b)

# conv3
conv3_1w = tf.Variable(tf.truncated_normal([1, embedding_size, 1, 2]))
conv3_1b = tf.Variable(tf.constant(0.1))
conv3_1 = tf.nn.relu(
    tf.nn.conv2d(embedding_layer, conv3_1w, [1, 1, 1, 1], padding="VALID") + conv3_1b)
conv3_3w = tf.Variable(tf.truncated_normal([3, embedding_size, 2, 4]))
conv3_3b = tf.Variable(tf.constant(0.1))
conv3 = tf.nn.relu(
    tf.nn.conv2d(conv3_1, conv3_3w, [1, 1, 1, 1], padding="VALID") + conv3_3b)

# conv5
conv5_3w = tf.Variable(tf.truncated_normal([3, embedding_size, 2, 4]))
conv5_3b = tf.Variable(tf.constant(0.1))
conv5_3 = tf.nn.relu(
def build_predict(self, inputs, reverse_preds=None, embed_penultimate=False, target_subset=None, save_reprs=False): """Construct per-location real-valued predictions.""" assert inputs is not None print('Targets pooled by %d to length %d' % (self.hp.target_pool, self.hp.seq_length // self.hp.target_pool)) ################################################### # convolution layers ################################################### filter_weights = [] layer_reprs = [inputs] seqs_repr = inputs for layer_index in range(self.hp.cnn_layers - 1): with tf.variable_scope('cnn%d' % layer_index, reuse=tf.AUTO_REUSE): # convolution block #seqs_repr = tf.Print(seqs_repr, [tf.shape(seqs_repr)], "{}".format(layer_index)) args_for_block = self._make_conv_block_args( layer_index, layer_reprs) seqs_repr = layers.conv_block(seqs_repr=seqs_repr, **args_for_block) # save representation layer_reprs.append(seqs_repr) if self.hp.multi_head_attention > 0: for i in range(self.hp.multi_head_attention): with tf.variable_scope('multi_head%d' % i, reuse=tf.AUTO_REUSE): seqs_repr = layers.multi_head_attention_block( seqs_repr, is_training=self.is_training, n_query_layers=self.hp.attention_n_query_layers, num_heads=self.hp.attention_num_heads, num_units=self.hp.attention_num_units, decay_variable=self.hp.attention_decay_variable, decay_constant=self.hp.attention_decay_constant, dropout=self.hp.attention_dropout, query_dropout=self.hp.attention_query_dropout, l2_scale=self.hp.attention_l2_scale) elif self.hp.dense_attention > 0: seqs_repr = layers.dense_attention_block( seqs_repr, self.hp.dense_attention, self.is_training, self.hp.attention_decay_variable, self.hp.attention_decay_constant, self.hp.attention_dropout, self.hp.attention_query_dropout, self.hp.attention_l2_scale) elif self.hp.exp: if self.hp.exp_decay_variable > 0: seqs_repr = layers.exp_block_variable( seqs_repr, self.is_training, self.hp.exp_decay_variable) else: seqs_repr = layers.exp_block(seqs_repr, self.is_training, self.hp.exp_decay_constant) layer_reprs.append(seqs_repr) # Final Conv with tf.variable_scope('cnn_final%d' % (self.hp.cnn_layers - 1), reuse=tf.AUTO_REUSE): # convolution block #seqs_repr = tf.Print(seqs_repr, [tf.shape(seqs_repr)], "{}".format(layer_index)) args_for_block = self._make_conv_block_args( self.hp.cnn_layers - 1, layer_reprs) seqs_repr = layers.conv_block(seqs_repr=seqs_repr, **args_for_block) # save representation layer_reprs.append(seqs_repr) if save_reprs: self.layer_reprs = layer_reprs # final nonlinearity seqs_repr = tf.nn.relu(seqs_repr) ################################################### # slice out side buffer ################################################### # update batch buffer to reflect pooling seq_length = seqs_repr.shape[1].value pool_preds = self.hp.seq_length // seq_length assert self.hp.batch_buffer % pool_preds == 0, ( 'batch_buffer %d not divisible' ' by the CNN pooling %d') % (self.hp.batch_buffer, pool_preds) batch_buffer_pool = self.hp.batch_buffer // pool_preds # slice out buffer seq_length = seqs_repr.shape[1] seqs_repr = seqs_repr[:, batch_buffer_pool:seq_length - batch_buffer_pool, :] seq_length = seqs_repr.shape[1] ################################################### # final layer ################################################### if embed_penultimate: final_repr = seqs_repr else: with tf.variable_scope('final', reuse=tf.AUTO_REUSE): final_filters = self.hp.num_targets * self.hp.target_classes final_repr = tf.layers.dense( inputs=seqs_repr, units=final_filters, activation=None, 
kernel_initializer=tf.variance_scaling_initializer(
                        scale=2.0, mode='fan_in'),
                    kernel_regularizer=tf.contrib.layers.l1_regularizer(
                        self.hp.final_l1_scale))
                print('Convolution w/ %d %dx1 filters to final targets' %
                      (final_filters, seqs_repr.shape[2]))

                if target_subset is not None:
                    # get convolution parameters
                    filters_full = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES, 'final/dense/kernel')[0]
                    bias_full = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES, 'final/dense/bias')[0]

                    # subset to specific targets
                    filters_subset = tf.gather(filters_full, target_subset, axis=1)
                    bias_subset = tf.gather(bias_full, target_subset, axis=0)

                    # substitute a new limited convolution
                    final_repr = tf.tensordot(seqs_repr, filters_subset, 1)
                    final_repr = tf.nn.bias_add(final_repr, bias_subset)

            # expand length back out
            if self.hp.target_classes > 1:
                final_repr = tf.reshape(final_repr,
                                        (-1, seq_length, self.hp.num_targets,
                                         self.hp.target_classes))

        # transform for reverse complement
        if reverse_preds is not None:
            final_repr = tf.cond(reverse_preds,
                                 lambda: tf.reverse(final_repr, axis=[1]),
                                 lambda: final_repr)

        ###################################################
        # link function
        ###################################################
        if embed_penultimate:
            predictions = final_repr
        else:
            # work-around for specifying my own predictions
            # self.preds_adhoc = tf.placeholder(
            #     tf.float32, shape=final_repr.shape, name='preds-adhoc')

            # float32 exponential clip max
            exp_max = 50

            # choose link
            if self.hp.link in ['identity', 'linear']:
                predictions = tf.identity(final_repr, name='preds')

            elif self.hp.link == 'relu':
                predictions = tf.nn.relu(final_repr, name='preds')

            elif self.hp.link == 'exp':
                final_repr_clip = tf.clip_by_value(final_repr, -exp_max, exp_max)
                predictions = tf.exp(final_repr_clip, name='preds')

            elif self.hp.link == 'exp_linear':
                predictions = tf.where(
                    final_repr > 0, final_repr + 1,
                    tf.exp(tf.clip_by_value(final_repr, -exp_max, exp_max)),
                    name='preds')

            elif self.hp.link == 'softplus':
                final_repr_clip = tf.clip_by_value(final_repr, -exp_max, 10000)
                predictions = tf.nn.softplus(final_repr_clip, name='preds')

            else:
                print('Unknown link function %s' % self.hp.link, file=sys.stderr)
                exit(1)

        # clip
        if self.hp.target_clip is not None:
            predictions = tf.clip_by_value(predictions, 0, self.hp.target_clip)

        # sqrt
        if self.hp.target_sqrt:
            predictions = tf.sqrt(predictions)

        return predictions
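The `exp_linear` link above pieces together exp(x) for non-positive inputs and x + 1 for positive ones, so predictions stay positive and the two branches meet at 1 at x = 0. A small NumPy check of that behaviour, for illustration only (not part of the model code):

import numpy as np

def exp_linear(x, exp_max=50.0):
    # exp(x) for x <= 0, x + 1 otherwise; both branches equal 1 at x = 0.
    x = np.asarray(x, dtype=np.float64)
    return np.where(x > 0, x + 1.0, np.exp(np.clip(x, -exp_max, exp_max)))

print(exp_linear([-2.0, 0.0, 3.0]))  # [0.1353..., 1.0, 4.0]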
import tensorflow as tf
import numpy as np

images = tf.placeholder(tf.float32, [None, 256, 256, 1])

# Conv 1
filters1_1 = tf.Variable(tf.truncated_normal([3, 3, 1, 64]))
bias1_1 = tf.Variable(tf.constant(0.1, shape=[256, 256, 64]))
conv1_1 = tf.nn.relu(
    tf.nn.conv2d(images, filters1_1, strides=[1, 1, 1, 1], padding='SAME') + bias1_1)

# The input now has 64 channels, so the filter's in_channels must match; conv2d
# strides are [batch, height, width, channels], so 2x spatial downsampling is [1, 2, 2, 1].
filters1_2 = tf.Variable(tf.truncated_normal([3, 3, 64, 64]))
bias1_2 = tf.Variable(tf.constant(0.1, shape=[128, 128, 64]))
conv1_2 = tf.nn.relu(
    tf.nn.conv2d(conv1_1, filters1_2, strides=[1, 2, 2, 1], padding='SAME') + bias1_2)

# Conv 2
filters2_1 = tf.Variable(tf.truncated_normal([3, 3, 64, 128]))
bias2_1 = tf.Variable(tf.constant(0.1, shape=[128, 128, 128]))
conv2_1 = tf.nn.relu(
    tf.nn.conv2d(conv1_2, filters2_1, strides=[1, 1, 1, 1], padding='SAME') + bias2_1)

filters2_2 = tf.Variable(tf.truncated_normal([3, 3, 128, 128]))
bias2_2 = tf.Variable(tf.constant(0.1, shape=[64, 64, 128]))
conv2_2 = tf.nn.relu(
    tf.nn.conv2d(conv2_1, filters2_2, strides=[1, 2, 2, 1], padding='SAME') + bias2_2)
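A quick static shape check for the stack above (a sketch; no session run is needed since the shapes are known at graph-construction time):

# Print the static shapes of each layer defined above.
for name, t in [('conv1_1', conv1_1), ('conv1_2', conv1_2),
                ('conv2_1', conv2_1), ('conv2_2', conv2_2)]:
    print(name, t.get_shape().as_list())
# Expected: conv1_1 -> [None, 256, 256, 64], conv1_2 -> [None, 128, 128, 64],
#           conv2_1 -> [None, 128, 128, 128], conv2_2 -> [None, 64, 64, 128]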
def build_loss(self, seqs_repr, data_ops, target_subset=None):
    """Convert per-location real-valued predictions to a loss."""
    # targets
    tstart = self.batch_buffer // self.target_pool
    tend = (self.seq_length - self.batch_buffer) // self.target_pool
    self.target_length = tend - tstart

    targets = data_ops['label']
    targets = tf.identity(targets[:, tstart:tend, :], name='targets_op')

    if target_subset is not None:
        targets = tf.gather(targets, target_subset, axis=2)

    # work-around for specifying my own predictions
    self.preds_adhoc = tf.placeholder(tf.float32, shape=seqs_repr.shape,
                                      name='preds-adhoc')

    # choose link
    if self.link in ['identity', 'linear']:
        self.preds_op = tf.identity(seqs_repr, name='preds')
    elif self.link == 'relu':
        self.preds_op = tf.nn.relu(seqs_repr, name='preds')
    elif self.link == 'exp':
        self.preds_op = tf.exp(tf.clip_by_value(seqs_repr, -50, 50), name='preds')
    elif self.link == 'exp_linear':
        self.preds_op = tf.where(seqs_repr > 0, seqs_repr + 1,
                                 tf.exp(tf.clip_by_value(seqs_repr, -50, 50)),
                                 name='preds')
    elif self.link == 'softplus':
        self.preds_op = tf.nn.softplus(tf.clip_by_value(seqs_repr, -50, 50),
                                       name='preds')
    elif self.link == 'softmax':
        # performed in the loss function, but saving probabilities
        self.preds_prob = tf.nn.softmax(seqs_repr, name='preds')
    else:
        print('Unknown link function %s' % self.link, file=sys.stderr)
        exit(1)

    # clip
    if self.target_clip is not None:
        self.preds_op = tf.clip_by_value(self.preds_op, 0, self.target_clip)
        targets = tf.clip_by_value(targets, 0, self.target_clip)

    # sqrt
    if self.target_sqrt:
        self.preds_op = tf.sqrt(self.preds_op)
        targets = tf.sqrt(targets)

    loss_op = None
    loss_adhoc = None
    loss_name = self.loss

    # choose loss
    if loss_name == 'gaussian':
        loss_op = tf.squared_difference(self.preds_op, targets)
        loss_adhoc = tf.squared_difference(self.preds_adhoc, targets)
    elif loss_name == 'poisson':
        loss_op = tf.nn.log_poisson_loss(targets, tf.log(self.preds_op),
                                         compute_full_loss=True)
        loss_adhoc = tf.nn.log_poisson_loss(targets, tf.log(self.preds_adhoc),
                                            compute_full_loss=True)
    elif loss_name == 'gamma':
        # jchan document
        loss_op = targets / self.preds_op + tf.log(self.preds_op)
        loss_adhoc = targets / self.preds_adhoc + tf.log(self.preds_adhoc)
    elif loss_name == 'cross_entropy':
        loss_op = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=(targets - 1), logits=self.preds_op)
        loss_adhoc = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=(targets - 1), logits=self.preds_adhoc)
    else:
        raise ValueError('Cannot identify loss function %s' % loss_name)

    # reduce losses by batch and position
    loss_op = tf.reduce_mean(loss_op, axis=[0, 1], name='target_loss')
    loss_op = tf.check_numerics(loss_op, 'Invalid loss', name='loss_check')
    loss_adhoc = tf.reduce_mean(loss_adhoc, axis=[0, 1], name='target_loss_adhoc')
    tf.summary.histogram('target_loss', loss_op)
    for ti in np.linspace(0, self.num_targets - 1, 10).astype('int'):
        tf.summary.scalar('loss_t%d' % ti, loss_op[ti])
    self.target_losses = loss_op
    self.target_losses_adhoc = loss_adhoc

    # fully reduce
    loss_op = tf.reduce_mean(loss_op, name='loss')
    loss_adhoc = tf.reduce_mean(loss_adhoc, name='loss_adhoc')

    # add extraneous terms
    loss_op += self.weights_regularizers
    loss_adhoc += self.weights_regularizers

    # track
    tf.summary.scalar('loss', loss_op)
    self.targets_op = targets
    return loss_op, loss_adhoc
def relu_activation(W, x, b):
    # Affine transform of the input rows followed by a ReLU nonlinearity.
    return tf.nn.relu(tf.matmul(x, W) + b)
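A hypothetical call to the helper above (assuming TF 1.x), showing the shape convention it expects: examples as rows of `x`, with `W` and `b` sized to match. The batch size and layer widths are arbitrary.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 10])       # batch of inputs with 10 features
W = tf.Variable(tf.random_normal([10, 5]))       # maps 10 features to 5 ReLU units
b = tf.Variable(tf.zeros([5]))
h = relu_activation(W, x, b)                     # -> shape (None, 5)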