def _factorized_reduction(self, x, out_filters, stride, is_training):
    """Reduces the shape of x without information loss due to striding."""
    assert out_filters % 2 == 0, (
        "Need even number of filters when using this factorized reduction.")
    if stride == 1:
        with tf.variable_scope("path_conv"):
            x = _conv_opt(x, 1, out_filters)
            x = batch_norm(x, is_training)
            return x

    actual_data_format = "channels_first"  # only NCW ("channels_first") is supported

    # Skip path 1
    path1 = tf.layers.max_pooling1d(x, 1, stride, "VALID",
                                    data_format=actual_data_format)
    with tf.variable_scope("path1_conv"):
        path1 = _conv_opt(path1, 1, out_filters // 2)
    print("after conv:", path1.shape)

    # Skip path 2
    # First pad with 0's on the right, then shift left by one so the pooling
    # window covers the positions that were added.
    pad_arr = [[0, 0], [0, 0], [0, 1]]
    path2 = tf.pad(x, pad_arr)[:, :, 1:]
    inp_c = path2.get_shape()[1].value
    concat_axis = 1 if inp_c > 1 else 2
    path2 = tf.layers.max_pooling1d(path2, 1, stride, "VALID",
                                    data_format=actual_data_format)
    with tf.variable_scope("path2_conv"):
        path2 = _conv_opt(path2, 1, out_filters // 2)

    # Concat and apply BN
    final_path = tf.concat(values=[path1, path2], axis=concat_axis)
    final_path = batch_norm(final_path, is_training)
    return final_path
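
# Usage sketch (illustrative, not part of the original code): with activations
# laid out as [batch, channels, length] (the "channels_first" format assumed
# above), a stride-2 factorized reduction halves the length while the two
# shifted pooling paths each contribute out_filters // 2 channels, e.g.
#
#   x = tf.zeros([8, 32, 128])                      # [N, C, W]
#   y = self._factorized_reduction(x, 32, 2, True)  # -> shape [8, 32, 64]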
def _model(self, doc, bow_doc, datasets, is_training, reuse=False,
           mode="train"):
    with tf.variable_scope(self.name, reuse=reuse):
        layers = []
        if is_training:
            self.valid_lengths = []

        with tf.variable_scope('embed'):
            regularizer = tf.contrib.layers.l2_regularizer(scale=self.l2_reg)
            if self.embedding_model == "none":
                embedding = create_weight(
                    "w",
                    shape=self.embedding["none"].shape,
                    trainable=True,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=regularizer)
            elif self.embedding_model == "glove":
                embedding = create_weight(
                    "w",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["glove"],
                    regularizer=regularizer)
            elif self.embedding_model == "word2vec":
                embedding = create_weight(
                    "w",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["word2vec"],
                    regularizer=regularizer)
            elif self.embedding_model == "all":
                embedding_glove = create_weight(
                    "w_glove",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["glove"],
                    regularizer=regularizer)
                print("embedding_glove: {0}".format(
                    embedding_glove.get_shape()))
                embedding_word2vec = create_weight(
                    "w_word2vec",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["word2vec"],
                    regularizer=regularizer)
                print("embedding_word2vec: {0}".format(
                    embedding_word2vec.get_shape()))
                embedding = tf.concat(
                    [embedding_glove, embedding_word2vec], axis=0)
                print("join embedding: {0}".format(embedding.get_shape()))

            field_embedding = create_weight(
                "w_field",
                shape=self.embedding["field"].shape,
                trainable=True,
                initializer=tf.truncated_normal_initializer,
                regularizer=regularizer)
            self.final_embedding = embedding
            print("embedding: {0}".format(embedding))
            print("doc: {0}".format(doc))
            print("bow_doc: {0}".format(bow_doc))

        if is_training or mode == "valid":
            batch_size = self.batch_size
        else:
            batch_size = self.eval_batch_size

        if self.sliding_window:
            doc, sliding_windows = self._to_sliding_window(
                doc, batch_size, size=64, step=32)
            bow_doc, _ = self._to_sliding_window(
                bow_doc, batch_size, size=64, step=32)
            print("doc after sliding window: {0}".format(doc))

        if is_training:
            embedding = tf.nn.dropout(embedding,
                                      keep_prob=self.embed_keep_prob)
        doc = tf.nn.embedding_lookup(embedding, doc, max_norm=None)
        field_embedding = tf.nn.embedding_lookup(field_embedding, bow_doc,
                                                 max_norm=None)
        if self.input_field_embedding:
            doc = tf.add_n([doc, field_embedding])

        doc = tf.transpose(doc, [0, 2, 1])
        print("doc_shape", doc.shape)
        inp_c = doc.shape[1]
        inp_w = doc.shape[2]
        doc = tf.reshape(doc, [-1, inp_c, inp_w])
        field_embedding = tf.transpose(field_embedding, [0, 2, 1])
        field_embedding = tf.reshape(field_embedding, [-1, inp_c, inp_w])
        print("after: doc, field_embedding", doc.shape, field_embedding.shape)
        x = doc

        # initialize pos_embedding for transformer-style attention ops
        pos_batch_size = 1
        if self.input_positional_encoding:
            out_filters = 300
        else:
            out_filters = self.out_filters
        if self.is_sinusolid:
            pos_embedding = self._positional_encoding(x,
                                                      pos_batch_size,
                                                      is_training,
                                                      num_units=out_filters,
                                                      zero_pad=False,
                                                      scale=False,
                                                      scope="enc_pe")
        else:
            pos_embedding = self._embedding(
                tf.tile(tf.expand_dims(tf.range(inp_w), 0),
                        [pos_batch_size, 1]),
                vocab_size=inp_w,
                num_units=out_filters,
                reuse=tf.AUTO_REUSE,
                zero_pad=True,
                scale=False,
                scope="enc_pe")
        print("pos embedding: {0}".format(pos_embedding))
        pos_embedding = tf.transpose(pos_embedding, [0, 2, 1])
        print("pos embedding: {0}".format(pos_embedding))
        if self.input_positional_encoding:
            x += pos_embedding

        out_filters = self.out_filters
        with tf.variable_scope("init_conv"):
            # adjust out_filter dimension
            x = _conv_opt(x, 1, self.out_filters)
            x = batch_norm(x, is_training)
        layers.append(x)

        # several operations for nni
        def add_fixed_pooling_layer(layer_id, layers, out_filters, is_training,
                                    pos_embedding, field_embedding):
            '''Add a fixed pooling layer every four layers'''
            with tf.variable_scope("pos_embed_pool_{0}".format(layer_id)):
                pos_embedding = self._factorized_reduction(
                    pos_embedding, out_filters, 2, is_training)
            with tf.variable_scope("field_embed_pool_{0}".format(layer_id)):
                field_embedding = self._factorized_reduction(
                    field_embedding, out_filters, 2, is_training)

            with tf.variable_scope("pool_at_{0}".format(layer_id)):
                pooled_layers = []
                for i, layer in enumerate(layers):
                    with tf.variable_scope("from_{0}".format(i)):
                        x = self._factorized_reduction(
                            layer, out_filters, 2, is_training)
                    pooled_layers.append(x)
                layers = pooled_layers
            return layers, out_filters

        def post_process_out(inputs, out):
            '''Form skip connection and perform batch norm'''
            optional_inputs = inputs[1]
            print("post_process_out::", inputs, optional_inputs)
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope("skip"):
                    inputs = layers[-1]
                    inp_d = inputs.get_shape()[1].value
                    inp_l = inputs.get_shape()[2].value
                    out.set_shape([None, out_filters, inp_l])
                    try:
                        out = tf.add_n(
                            [out, tf.reduce_sum(optional_inputs, axis=0)])
                    except Exception as e:
                        print(e)
                    out = batch_norm(out, is_training)
            layers.append(out)
            return out

        global layer_id
        layer_id = -1

        def get_layer_id():
            global layer_id
            layer_id += 1
            return 'layer_' + str(layer_id)

        size = [1, 3, 5, 7]
        separables = [False, False, False, False]

        def conv(inputs, size, separable=False):
            # res_layers is pre_layers that are chosen to form skip connection
            # layers[-1] is always the latest input
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope('conv_' + str(size) +
                                       ('_separable' if separable else '')):
                    dealed_inputs = tf.reduce_sum(inputs[1], axis=0)
                    out = conv_op(dealed_inputs, size, is_training, out_filters,
                                  out_filters, start_idx=None,
                                  separable=separable)
            return out

        def pool(inputs, ptype):
            assert ptype in ['avg', 'max'], "pooling type must be avg or max"
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope('pooling_' + str(ptype)):
                    dealed_inputs = tf.reduce_sum(inputs[1], axis=0)
                    out = pool_op(dealed_inputs, is_training, out_filters,
                                  out_filters, ptype, start_idx=None)
            return out

        def rnn(inputs):
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope('branch_6'):
                    dealed_inputs = tf.reduce_sum(inputs[1], axis=0)
                    out = recur_op(dealed_inputs, is_training, out_filters,
                                   out_filters, start_idx=0,
                                   lstm_x_keep_prob=self.lstm_x_keep_prob,
                                   lstm_h_keep_prob=self.lstm_h_keep_prob,
                                   lstm_o_keep_prob=self.lstm_o_keep_prob,
                                   var_rec=self.var_rec)
            return out

        def attention(inputs):
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope('branch_7'):
                    dealed_inputs = tf.reduce_sum(inputs[1], axis=0)
                    out = attention_op(
                        dealed_inputs,
                        pos_embedding, field_embedding, is_training,
                        out_filters, out_filters, start_idx=0,
                        positional_encoding=self.positional_encoding,
                        attention_keep_prob=self.attention_keep_prob,
                        do_field_embedding=self.field_embedding)
            return out

        def final_process(inputs):
            with tf.variable_scope(get_layer_id()):
                with tf.variable_scope('final_out'):
                    print("final_inputs::", inputs)
                    dealed_inputs = tf.reduce_mean(inputs[1], axis=0)
                    print("dealed_inputs::", dealed_inputs)
                    out = dealed_inputs
                    print("final_out::", inputs, out)
            layers.append(out)
            return out

        """@nni.mutable_layers(
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x],
            optional_input_size: 1,
            layer_output: layer_0_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_0_out_0)],
            optional_inputs: [],
            optional_input_size: 1,
            layer_output: layer_0_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out],
            optional_input_size: 1,
            layer_output: layer_1_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_1_out_0)],
            optional_inputs: [layer_0_out],
            optional_input_size: 1,
            layer_output: layer_1_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out],
            optional_input_size: 1,
            layer_output: layer_2_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_2_out_0)],
            optional_inputs: [layer_0_out, layer_1_out],
            optional_input_size: 1,
            layer_output: layer_2_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out],
            optional_input_size: 1,
            layer_output: layer_3_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_3_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out],
            optional_input_size: 1,
            layer_output: layer_3_out
        }
        )"""

        layers, out_filters = add_fixed_pooling_layer(
            3, layers, out_filters, is_training, pos_embedding,
            field_embedding)
        x, layer_0_out, layer_1_out, layer_2_out, layer_3_out = layers[-5:]
        print("layer_out", x, layer_0_out, layer_1_out, layer_2_out,
              layer_3_out)

        """@nni.mutable_layers(
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out],
            optional_input_size: 1,
            layer_output: layer_4_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_4_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out],
            optional_input_size: 1,
            layer_output: layer_4_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out],
            optional_input_size: 1,
            layer_output: layer_5_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_5_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out],
            optional_input_size: 1,
            layer_output: layer_5_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out],
            optional_input_size: 1,
            layer_output: layer_6_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_6_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out],
            optional_input_size: 1,
            layer_output: layer_6_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out],
            optional_input_size: 1,
            layer_output: layer_7_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_7_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out],
            optional_input_size: 1,
            layer_output: layer_7_out
        }
        )"""

        layers, out_filters = add_fixed_pooling_layer(
            7, layers, out_filters, is_training, pos_embedding,
            field_embedding)
        (x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out,
         layer_5_out, layer_6_out, layer_7_out) = layers[-9:]

        """@nni.mutable_layers(
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out],
            optional_input_size: 1,
            layer_output: layer_8_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_8_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out],
            optional_input_size: 1,
            layer_output: layer_8_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out],
            optional_input_size: 1,
            layer_output: layer_9_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_9_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out],
            optional_input_size: 1,
            layer_output: layer_9_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out],
            optional_input_size: 1,
            layer_output: layer_10_out_0
        },
        {
            layer_choice: [post_process_out(out=layer_10_out_0)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out],
            optional_input_size: 1,
            layer_output: layer_10_out
        },
        {
            layer_choice: [conv(size=1), conv(size=3), conv(size=5), conv(size=7), pool(ptype='avg'), pool(ptype='max'), rnn(), attention()],
            optional_inputs: [x, layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out, layer_10_out],
            optional_input_size: 1,
            layer_output: layer_11_out_1
        },
        {
            layer_choice: [post_process_out(out=layer_11_out_1)],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out, layer_10_out],
            optional_input_size: 1,
            layer_output: layer_11_out
        },
        {
            layer_choice: [final_process()],
            optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out,
                              layer_9_out, layer_10_out, layer_11_out],
            optional_input_size: 1,
            layer_output: final_out
        }
        )"""

        print("len_layers: ", len(layers))
        x = final_out

        if self.sliding_window:
            x = self._from_sliding_window(x, batch_size, sliding_windows)

        class_num = self.class_num
        with tf.variable_scope("fc"):
            if not self.is_output_attention:
                x = tf.reduce_mean(x, 2)
            else:
                batch_size = x.get_shape()[0].value
                inp_d = x.get_shape()[1].value
                inp_l = x.get_shape()[2].value
                final_attention_query = create_weight(
                    "query",
                    shape=[1, inp_d],
                    trainable=True,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=regularizer)
                if is_training or mode == "valid":
                    batch_size = self.batch_size
                else:
                    batch_size = self.eval_batch_size
                final_attention_query = tf.tile(final_attention_query,
                                                [batch_size, 1])
                print("final_attention_query: {0}".format(
                    final_attention_query))
                # put channel on the last dim
                x = tf.transpose(x, [0, 2, 1])
                x = tf.reshape(x, [-1, inp_l, inp_d])
                print("x: {0}".format(x))
                x = multihead_attention(
                    queries=final_attention_query,
                    keys=x,
                    pos_embedding=pos_embedding,
                    field_embedding=field_embedding,
                    num_units=inp_d,
                    num_heads=8,
                    dropout_rate=0,
                    is_training=is_training,
                    causality=False,
                    positional_encoding=self.positional_encoding)
                print("x: {0}".format(x))
                x = tf.reshape(x, [-1, 1, inp_d])
                x = tf.reduce_sum(x, axis=1)
                print("x: {0}".format(x))
            if is_training:
                x = tf.nn.dropout(x, self.keep_prob)
            x = tf.layers.dense(x, units=class_num)
        return x
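
# Note added for clarity (paraphrasing the @nni.mutable_layers annotations
# above, not original code): for each block the tuner picks one candidate from
# `layer_choice` and at most `optional_input_size` tensors from
# `optional_inputs`; the chosen candidate receives them as `inputs[1]`, which
# is why conv/pool/rnn/attention all aggregate with
# tf.reduce_sum(inputs[1], axis=0) and post_process_out adds the same sum onto
# its skip connection before batch norm.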
def _fixed_layer(self, inputs, pos_embedding, field_embedding, layer_id,
                 prev_layers, final_flags, start_idx, pre_idx, out_filters,
                 is_training):
    """Build one layer of a fixed (already sampled) architecture.

    Args:
        layer_id: current layer
        prev_layers: cache of previous layers, for skip connections
        start_idx: where to start looking at. technically, we can infer
            this from layer_id, but why bother...
        is_training: for batch_norm
    """
    if len(prev_layers) > 0:
        inputs = prev_layers[-1]
        if self.multi_path:
            pre_layer_id = self.sample_arc[start_idx]
            start_idx += 1
            num_pre_layers = len(prev_layers)
            if num_pre_layers > 5:
                num_pre_layers = 5
            matched = False
            for i in range(0, num_pre_layers):
                if pre_layer_id == i:
                    layer_idx = len(prev_layers) - 1 - i
                    final_flags[layer_idx] = 0
                    matched = True
                    inputs = prev_layers[layer_idx]
            if not matched:
                final_flags[-1] = 0
                inputs = prev_layers[-1]
        else:
            final_flags[-1] = 0

    size = [1, 3, 5, 7]
    separables = [False, False, False, False]
    actual_data_format = "channels_first"  # NCW

    out = inputs
    count = self.sample_arc[start_idx]
    if count in [0, 1, 2, 3]:
        filter_size = size[count]
        separable = separables[count]
        with tf.variable_scope("conv_{0}x{0}".format(filter_size)):
            out = tf.nn.relu(out)
            out = conv_op(out, filter_size, is_training, out_filters,
                          out_filters)
            out = batch_norm(out, is_training)
    elif count == 4:
        with tf.variable_scope("average_pool"):
            out = pool_op(out, is_training, out_filters, out_filters, "avg")
    elif count == 5:
        with tf.variable_scope("max_pool"):
            out = pool_op(out, is_training, out_filters, out_filters, "max")
    elif count == 7:
        with tf.variable_scope("out_attention"):
            out = attention_op(
                out, pos_embedding, field_embedding, is_training, out_filters,
                out_filters, start_idx=0,
                positional_encoding=self.positional_encoding,
                attention_keep_prob=self.attention_keep_prob,
                do_field_embedding=self.field_embedding)
            out = batch_norm(out, is_training)
    elif count == 6:
        with tf.variable_scope("rnn"):
            out = recur_op(out, is_training, out_filters, out_filters,
                           start_idx=0,
                           lstm_x_keep_prob=self.lstm_x_keep_prob,
                           lstm_h_keep_prob=self.lstm_h_keep_prob,
                           lstm_o_keep_prob=self.lstm_o_keep_prob,
                           var_rec=self.var_rec)
    else:
        raise ValueError("Unknown operation number '{0}'".format(count))

    if layer_id > 0:
        skip_start = start_idx + 1
        skip = self.sample_arc[skip_start:skip_start + layer_id]
        total_skip_channels = np.sum(skip) + 1

        res_layers = []
        for i in range(layer_id):
            if skip[i] == 1:
                res_layers.append(prev_layers[i])
                final_flags[i] = 0
        prev = res_layers + [out]

        if not self.skip_concat:
            out = tf.add_n(prev)
        else:
            prev = tf.concat(prev, axis=1)
            out = prev
            print(out, out_filters)
            with tf.variable_scope("skip"):
                out = tf.nn.relu(out)
                out = conv_op(out, 1, is_training, out_filters, out_filters)
                out = batch_norm(out, is_training)

    return out
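
# Illustrative decoding of sample_arc as indexed above (an assumption for
# exposition, not from the original code): without multi_path, each layer
# consumes one op id followed by `layer_id` skip bits, so for three layers
#
#   sample_arc = [7,        # layer 0: attention
#                 6, 1,     # layer 1: rnn, with a skip from layer 0
#                 1, 0, 1]  # layer 2: conv-3, with a skip from layer 1 only
#
# With multi_path=True, one extra entry per layer first selects which of the
# (up to 5) most recent layers becomes that layer's direct input.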
def _model(self, doc, bow_doc, datasets, is_training, reuse=False,
           mode="train"):
    with tf.variable_scope(self.name, reuse=reuse):
        layers = []
        final_flags = []
        if is_training:
            self.valid_lengths = []

        with tf.variable_scope('embed'):
            regularizer = tf.contrib.layers.l2_regularizer(scale=self.l2_reg)
            if self.embedding_model == "none":
                embedding = create_weight(
                    "w",
                    shape=self.embedding["none"].shape,
                    trainable=True,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=regularizer)
            elif self.embedding_model == "glove":
                embedding = create_weight(
                    "w",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["glove"],
                    regularizer=regularizer)
            elif self.embedding_model == "word2vec":
                embedding = create_weight(
                    "w",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["word2vec"],
                    regularizer=regularizer)
            elif self.embedding_model == "all":
                embedding_glove = create_weight(
                    "w_glove",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["glove"],
                    regularizer=regularizer)
                print("embedding_glove: {0}".format(
                    embedding_glove.get_shape()))
                embedding_word2vec = create_weight(
                    "w_word2vec",
                    shape=None,
                    trainable=True,
                    initializer=self.embedding["word2vec"],
                    regularizer=regularizer)
                print("embedding_word2vec: {0}".format(
                    embedding_word2vec.get_shape()))
                embedding = tf.concat(
                    [embedding_glove, embedding_word2vec], axis=0)
                print("join embedding: {0}".format(embedding.get_shape()))

            field_embedding = create_weight(
                "w_field",
                shape=self.embedding["field"].shape,
                trainable=True,
                initializer=tf.truncated_normal_initializer,
                regularizer=regularizer)
            self.final_embedding = embedding
            print("embedding: {0}".format(embedding))
            print("doc: {0}".format(doc))
            print("bow_doc: {0}".format(bow_doc))

        if is_training or mode == "valid":
            batch_size = self.batch_size
        else:
            batch_size = self.eval_batch_size

        if self.sliding_window:
            doc, sliding_windows = self._to_sliding_window(
                doc, batch_size, size=64, step=32)
            bow_doc, _ = self._to_sliding_window(
                bow_doc, batch_size, size=64, step=32)
            print("doc after sliding window: {0}".format(doc))

        if is_training:
            embedding = tf.nn.dropout(embedding,
                                      keep_prob=self.embed_keep_prob)
        doc = tf.nn.embedding_lookup(embedding, doc, max_norm=None)
        field_embedding = tf.nn.embedding_lookup(field_embedding, bow_doc,
                                                 max_norm=None)
        if self.input_field_embedding:
            doc = tf.add_n([doc, field_embedding])

        doc = tf.transpose(doc, [0, 2, 1])
        print("doc_shape", doc.shape)
        inp_c = doc.shape[1]
        inp_w = doc.shape[2]
        doc = tf.reshape(doc, [-1, inp_c, inp_w])
        field_embedding = tf.transpose(field_embedding, [0, 2, 1])
        field_embedding = tf.reshape(field_embedding, [-1, inp_c, inp_w])
        print("after: doc, field_embedding", doc.shape, field_embedding.shape)
        x = doc

        # initialize pos_embedding for transformer-style attention ops
        pos_batch_size = 1
        if self.input_positional_encoding:
            out_filters = 300
        else:
            out_filters = self.out_filters
        if self.is_sinusolid:
            pos_embedding = self._positional_encoding(x,
                                                      pos_batch_size,
                                                      is_training,
                                                      num_units=out_filters,
                                                      zero_pad=False,
                                                      scale=False,
                                                      scope="enc_pe")
        else:
            pos_embedding = self._embedding(
                tf.tile(tf.expand_dims(tf.range(inp_w), 0),
                        [pos_batch_size, 1]),
                vocab_size=inp_w,
                num_units=out_filters,
                reuse=tf.AUTO_REUSE,
                zero_pad=True,
                scale=False,
                scope="enc_pe")
        print("pos embedding: {0}".format(pos_embedding))
        pos_embedding = tf.transpose(pos_embedding, [0, 2, 1])
        print("pos embedding: {0}".format(pos_embedding))
        if self.input_positional_encoding:
            x += pos_embedding

        out_filters = self.out_filters
        with tf.variable_scope("init_conv"):
            # adjust out_filter dimension
            x = _conv_opt(x, 1, self.out_filters)
            x = batch_norm(x, is_training)

        def add_fixed_pooling_layer(layer_id, layers, out_filters, is_training,
                                    pos_embedding, field_embedding):
            '''Add a fixed pooling layer every four layers'''
            with tf.variable_scope("pos_embed_pool_{0}".format(layer_id)):
                pos_embedding = self._factorized_reduction(
                    pos_embedding, out_filters, 2, is_training)
            with tf.variable_scope("field_embed_pool_{0}".format(layer_id)):
                field_embedding = self._factorized_reduction(
                    field_embedding, out_filters, 2, is_training)

            with tf.variable_scope("pool_at_{0}".format(layer_id)):
                pooled_layers = []
                for i, layer in enumerate(layers):
                    with tf.variable_scope("from_{0}".format(i)):
                        x = self._factorized_reduction(
                            layer, out_filters, 2, is_training)
                    pooled_layers.append(x)
                layers = pooled_layers
            return layers, out_filters

        start_idx = 0
        print("xxxxx", x)
        for layer_id in range(self.num_layers):
            with tf.variable_scope("layer_{0}".format(layer_id)):
                print("layers", layers)
                print("layer_id, x", layer_id, x)
                x = self._fixed_layer(x, pos_embedding, field_embedding,
                                      layer_id, layers, final_flags,
                                      start_idx, 0, out_filters, is_training)
                layers.append(x)
                if self.fixed_arc is not None:
                    final_flags.append(1)
                print("sample_arc: {0}".format(self.sample_arc[start_idx]))
            if layer_id in self.pool_layers:
                layers, out_filters = add_fixed_pooling_layer(
                    layer_id, layers, out_filters, is_training,
                    pos_embedding, field_embedding)
            start_idx += 1 + layer_id
            if self.multi_path:
                start_idx += 1
            print(layers[-1])

        print("all_layers:", layers)
        final_layers = []
        final_layers_idx = []
        for i in range(0, len(layers)):
            if self.all_layer_output:
                if self.num_last_layer_output == 0:
                    final_layers.append(layers[i])
                    final_layers_idx.append(i)
                elif i >= max((len(layers) - self.num_last_layer_output), 0):
                    final_layers.append(layers[i])
                    final_layers_idx.append(i)
            elif self.fixed_arc is not None and final_flags[i] == 1:
                final_layers.append(layers[i])
                final_layers_idx.append(i)
            elif self.fixed_arc is None:
                final_layers.append(final_flags[i] * layers[i])
        if self.fixed_arc is not None:
            print("final_layers: {0}".format(' '.join(
                [str(idx) for idx in final_layers_idx])))

        if self.fixed_arc is not None and self.output_linear_combine:
            x = self._linear_combine(final_layers)
        else:
            x = tf.add_n(final_layers)

        if self.sliding_window:
            x = self._from_sliding_window(x, batch_size, sliding_windows)

        class_num = self.class_num
        with tf.variable_scope("fc"):
            if not self.is_output_attention:
                x = tf.reduce_mean(x, 2)
            else:
                batch_size = x.get_shape()[0].value
                inp_d = x.get_shape()[1].value
                inp_l = x.get_shape()[2].value
                final_attention_query = create_weight(
                    "query",
                    shape=[1, inp_d],
                    trainable=True,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=regularizer)
                if is_training or mode == "valid":
                    batch_size = self.batch_size
                else:
                    batch_size = self.eval_batch_size
                final_attention_query = tf.tile(final_attention_query,
                                                [batch_size, 1])
                print("final_attention_query: {0}".format(
                    final_attention_query))
                # put channel on the last dim
                x = tf.transpose(x, [0, 2, 1])
                x = tf.reshape(x, [-1, inp_l, inp_d])
                print("x: {0}".format(x))
                x = multihead_attention(
                    queries=final_attention_query,
                    keys=x,
                    pos_embedding=pos_embedding,
                    field_embedding=field_embedding,
                    num_units=inp_d,
                    num_heads=8,
                    dropout_rate=0,
                    is_training=is_training,
                    causality=False,
                    positional_encoding=self.positional_encoding)
                print("x: {0}".format(x))
                x = tf.reshape(x, [-1, 1, inp_d])
                x = tf.reduce_sum(x, axis=1)
                print("x: {0}".format(x))
            if is_training:
                x = tf.nn.dropout(x, self.keep_prob)
            x = tf.layers.dense(x, units=class_num)
        return x
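
# Note added for clarity (describing the aggregation above): final_flags[i] is
# set to 1 when layer i is built and cleared by _fixed_layer whenever a later
# layer consumes layer i as its direct input or through a skip connection, so
# final_layers keeps only the "leaf" layers (unless all_layer_output or
# num_last_layer_output overrides that); they are then merged either by the
# learned linear combination in _linear_combine or by tf.add_n.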