def encoder(sequence, seq_length, repr_dim=100, module='lstm', num_layers=1, reuse=None,
            residual=False, activation=None, layer_norm=False, name='encoder', dropout=None,
            is_eval=True, **kwargs):
    """Encodes a batch of sequences with the configured encoder `module`.

    Stacks `num_layers` encoder layers via recursion; layer normalization,
    residual connections and dropout are applied per layer when requested.
    """
    if num_layers == 1:
        if layer_norm:
            with tf.variable_scope('layernorm', reuse=False) as vs:
                vs._reuse = False  # HACK
                num_layernorms = sum(1 for v in vs.global_variables() if 'layernorm' in v.name)
                sequence = tf.contrib.layers.layer_norm(sequence, scope=str(num_layernorms))
        with tf.variable_scope(name, reuse=reuse):
            if module == 'lstm':
                out = bi_lstm(repr_dim, sequence, seq_length, **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'sru':
                with_residual = sequence.get_shape()[2].value == repr_dim
                out = bi_sru(repr_dim, sequence, seq_length, with_residual, **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'rnn':
                out = bi_rnn(repr_dim,
                             tf.nn.rnn_cell.BasicRNNCell(repr_dim, activation_from_string(activation)),
                             sequence, seq_length, **kwargs)
            elif module == 'gru':
                out = bi_rnn(repr_dim, tf.contrib.rnn.GRUBlockCell(repr_dim),
                             sequence, seq_length, **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'gldr':
                out = gated_linear_dilated_residual_network(repr_dim, sequence, **kwargs)
            elif module == 'conv':
                out = convnet(repr_dim, sequence, 1,
                              activation=activation_from_string(activation), **kwargs)
            elif module == 'conv_glu':
                out = gated_linear_convnet(repr_dim, sequence, 1, **kwargs)
            elif module == 'conv_separable':
                out = depthwise_separable_convolution(
                    repr_dim, sequence, activation=activation_from_string(activation), **kwargs)
            elif module == 'dense':
                out = tf.layers.dense(sequence, repr_dim)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'highway':
                out = highway_network(sequence, num_layers, activation_from_string(activation))
            elif module == 'self_attn':
                # Run each attention head in its own scope and concatenate the results.
                outs = []
                num_attn = kwargs.get('num_attn_heads', 1)
                for i in range(num_attn):
                    with tf.variable_scope(str(i)):
                        attn = self_attention(sequence, seq_length, repr_dim=repr_dim, **kwargs)
                        outs.append(attn)
                out = tf.concat(outs, 2) if num_attn > 1 else outs[0]
            elif module == 'positional_encoding':
                out = positional_encoding(sequence, seq_length)
            else:
                raise ValueError("Unknown encoder type: %s" % module)

            if residual:
                if out.get_shape()[-1].value != sequence.get_shape()[-1].value:
                    message = (
                        'Residual connection only possible if input to sequence encoder %s of type %s '
                        'has the same dimension (%d) as its output (%d).' %
                        (name, module, sequence.get_shape()[-1].value, out.get_shape()[-1].value))
                    logging.error(message)
                    raise RuntimeError(message)
                out += sequence
            if dropout is not None:
                # Dropout with one mask per feature, shared across all time steps.
                out = tf.cond(
                    tf.logical_and(tf.greater(dropout, 0.0), tf.logical_not(is_eval)),
                    lambda: tf.nn.dropout(out, 1.0 - dropout,
                                          noise_shape=[tf.shape(out)[0], 1, tf.shape(out)[-1]]),
                    lambda: out)
    else:
        # Recursively build the first `num_layers - 1` layers, then stack one more on top.
        out = encoder(sequence, seq_length, repr_dim, module, num_layers - 1, reuse, residual,
                      activation, layer_norm, name, dropout=dropout, is_eval=is_eval, **kwargs)
        out = encoder(out, seq_length, repr_dim, module, 1, reuse, residual, activation,
                      layer_norm, name + str(num_layers - 1), dropout=dropout, is_eval=is_eval,
                      **kwargs)
    return out
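
# --- Illustrative usage sketch (not part of the original source) ------------
# Demonstrates how `encoder` might be called on a batch of embedded token
# sequences. The placeholder shapes, the 300-d embeddings and the layer and
# dropout settings below are assumptions made purely for this example.
def _encoder_usage_example():
    embedded = tf.placeholder(tf.float32, [None, None, 300])  # [batch, time, emb_dim]
    lengths = tf.placeholder(tf.int32, [None])                # true sequence lengths
    # Two stacked bi-LSTM layers of size 150, with dropout active at training time.
    return encoder(embedded, lengths, repr_dim=150, module='lstm',
                   num_layers=2, dropout=0.2, is_eval=tf.constant(False))
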
def create_output(self, shared_resources, input_tensors):
    """Builds the FastQA graph: embeds question and support, computes
    word-in-question features, encodes both sequences and applies the
    configured answer layer to score answer spans."""
    tensors = TensorPortTensors(input_tensors)
    with tf.variable_scope("fast_qa", initializer=tf.contrib.layers.xavier_initializer()):
        # Some helpers
        batch_size = tf.shape(tensors.question_length)[0]
        max_question_length = tf.reduce_max(tensors.question_length)
        support_mask = misc.mask_for_lengths(tensors.support_length)

        input_size = shared_resources.embeddings.shape[-1]
        size = shared_resources.config["repr_dim"]
        with_char_embeddings = shared_resources.config.get("with_char_embeddings", False)

        # set shapes for inputs
        tensors.emb_question.set_shape([None, None, input_size])
        tensors.emb_support.set_shape([None, None, input_size])

        emb_question = tensors.emb_question
        emb_support = tensors.emb_support
        if with_char_embeddings:
            # compute combined word and character embeddings
            [char_emb_question, char_emb_support] = conv_char_embedding(
                len(shared_resources.char_vocab), size, tensors.word_chars,
                tensors.word_char_length,
                [tensors.question_batch_words, tensors.support_batch_words])

            emb_question = tf.concat([emb_question, char_emb_question], 2)
            emb_support = tf.concat([emb_support, char_emb_support], 2)
            input_size += size

            # set shapes for extended inputs
            emb_question.set_shape([None, None, input_size])
            emb_support.set_shape([None, None, input_size])

        # compute encoder features
        question_features = tf.ones(tf.stack([batch_size, max_question_length, 2]))

        v_wiqw = tf.get_variable("v_wiq_w", [1, 1, input_size],
                                 initializer=tf.constant_initializer(1.0))

        # weighted word-in-question feature: similarity of each support token
        # to the question words, normalized over the question
        wiq_w = tf.matmul(tf.gather(emb_question * v_wiqw, tensors.support2question),
                          emb_support, adjoint_b=True)
        wiq_w = wiq_w + tf.expand_dims(support_mask, 1)
        question_binary_mask = tf.gather(
            tf.sequence_mask(tensors.question_length, dtype=tf.float32),
            tensors.support2question)
        wiq_w = tf.reduce_sum(
            tf.nn.softmax(wiq_w) * tf.expand_dims(question_binary_mask, 2), [1])

        # [B, L, 2]
        support_features = tf.stack([tensors.word_in_question, wiq_w], 2)

        # highway layer to allow for interaction between concatenated embeddings
        if with_char_embeddings:
            with tf.variable_scope("char_embeddings") as vs:
                emb_question = tf.layers.dense(emb_question, size, name="embeddings_projection")
                emb_question = highway_network(emb_question, 1)
                vs.reuse_variables()
                emb_support = tf.layers.dense(emb_support, size, name="embeddings_projection")
                emb_support = highway_network(emb_support, 1)

        keep_prob = 1.0 - shared_resources.config.get("dropout", 0.0)
        emb_question, emb_support = tf.cond(
            tensors.is_eval,
            lambda: (emb_question, emb_support),
            lambda: (tf.nn.dropout(emb_question, keep_prob,
                                   noise_shape=[1, 1, emb_question.get_shape()[-1].value]),
                     tf.nn.dropout(emb_support, keep_prob,
                                   noise_shape=[1, 1, emb_question.get_shape()[-1].value])))

        # extend embeddings with features
        emb_question_ext = tf.concat([emb_question, question_features], 2)
        emb_support_ext = tf.concat([emb_support, support_features], 2)

        # encode question and support
        encoder_type = shared_resources.config.get('encoder', 'lstm').lower()
        if encoder_type in ['lstm', 'sru', 'gru']:
            size = size + 2 if encoder_type == 'sru' else size  # to allow for use of residual in SRU
            encoded_question = encoder(emb_question_ext, tensors.question_length, size,
                                       module=encoder_type)
            encoded_support = encoder(emb_support_ext, tensors.support_length, size,
                                      module=encoder_type, reuse=True)
            projection_initializer = tf.constant_initializer(
                np.concatenate([np.eye(size), np.eye(size)]))
            encoded_question = tf.layers.dense(encoded_question, size, tf.tanh, use_bias=False,
                                               kernel_initializer=projection_initializer,
                                               name='projection_q')
            encoded_support = tf.layers.dense(encoded_support, size, tf.tanh, use_bias=False,
                                              kernel_initializer=projection_initializer,
                                              name='projection_s')
        else:
            raise ValueError("Only rnn ('lstm', 'sru', 'gru') encoders are allowed for FastQA!")

        answer_layer = shared_resources.config.get('answer_layer', 'conditional').lower()

        # `topk` is a graph variable so the beam size can be changed after construction.
        topk = tf.get_variable('topk', initializer=shared_resources.config.get('topk', 1),
                               dtype=tf.int32, trainable=False)
        topk_p = tf.placeholder(tf.int32, [], 'beam_size_setter')
        topk_assign = topk.assign(topk_p)
        self._topk_assign = lambda k: self.tf_session.run(topk_assign, {topk_p: k})

        if answer_layer == 'conditional':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                conditional_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support,
                    tensors.support_length, tensors.correct_start, tensors.support2question,
                    tensors.answer2support, tensors.is_eval, topk=topk,
                    max_span_size=shared_resources.config.get("max_span_size", 10000))
        elif answer_layer == 'conditional_bilinear':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                conditional_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support,
                    tensors.support_length, tensors.correct_start, tensors.support2question,
                    tensors.answer2support, tensors.is_eval, topk=topk,
                    max_span_size=shared_resources.config.get("max_span_size", 10000),
                    bilinear=True)
        elif answer_layer == 'bilinear':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                bilinear_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support,
                    tensors.support_length, tensors.support2question, tensors.answer2support,
                    tensors.is_eval, topk=topk,
                    max_span_size=shared_resources.config.get("max_span_size", 10000))
        else:
            raise ValueError("Unknown answer layer: %s" % answer_layer)

        span = tf.stack([doc_idx, predicted_start_pointer, predicted_end_pointer], 1)

        return TensorPort.to_mapping(self.output_ports, (start_scores, end_scores, span))
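
# --- Illustrative usage sketch (not part of the original source) ------------
# `create_output` registers a non-trainable `topk` variable together with an
# assign op and exposes it through `self._topk_assign`, so the beam size can
# be changed after the graph has been built without reconstructing it.
# Assuming `model` is a constructed instance of this class with an open
# `tf_session`, a caller might do:
#
#     model._topk_assign(5)  # henceforth return the 5 best spans per question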