def _make_output(self, inputs, params): task = params.get('task', 'classification') task_type = params.get('task_type', 'multiclass') if task == 'classification': logits = tf.contrib.layers.fully_connected(inputs, self._num_classes, activation_fn=None, scope='logits') if task_type == 'multiclass': predictions = tf.cast(tf.argmax(logits, -1), tf.int32) output_score = tf.nn.softmax(logits, -1) elif task_type == 'multilabel': threshold = params.get('threshold', 0.5) output_score = tf.sigmoid(logits) predictions = tf.cast(tf.greater(output_score, threshold), tf.int32) elif task_type == 'topk': output_score = logits #tf.nn.softmax(logits, -1) predictions = tf.cast(tf.greater(logits, 0), tf.int32) #tf.cast(tf.argmax(logits, -1), tf.int32) # # predictions = tf.one_hot(predictions, depth=self._num_classes, axis=-1, dtype=tf.int32) else: raise ConfigureError("Task type %s is not support for task %s. " "Only multiclass and multilabel is support for task %s" % (task_type, task, task)) elif task == 'rank': logits = tf.contrib.layers.fully_connected(inputs, 1, activation_fn=None, scope='logits') predictions = logits output_score = logits else: raise ConfigureError( "Task %s is not support. Only task and classification tasks are supported" % task) output_dict = {'logits': logits, 'predictions': {'predictions': predictions, 'output_score': output_score}} output_score = tf.estimator.export.PredictOutput(output_score) output_predictions = tf.estimator.export.PredictOutput(predictions) export_outputs = {"output_score": output_score} output_dict['export_outputs'] = export_outputs return output_dict
def load_from_files(self, directory): if not os.path.exists(directory): logger.warning("Vocabulary directory %s does not exist.", directory) return False namespaces_file = os.path.join(directory, NAMESPACE_PADDING_FILE) if not os.path.exists(namespaces_file): logger.warning("Vocabulary namespaces file %s does not exist", namespaces_file) return False vocab_filenames = [filename for filename in os.listdir(directory) if filename.startswith(VOCAB_FILE[:6]) and filename.endswith(VOCAB_FILE[-4:])] if len(vocab_filenames) == 0: logger.warning("Vocabulary file %s does not exist") self._non_padded_namespaces = load_from_txt(namespaces_file) for vocab_filename in vocab_filenames: namespace = vocab_filename[6:-4] vocab_namespace_file = os.path.join(directory, vocab_filename) self._namespace_to_path[namespace] = vocab_namespace_file vocab_namespace = load_from_txt(vocab_namespace_file) self._index_to_token[namespace] = dict((index, token) for index, token in enumerate(vocab_namespace)) self._token_to_index[namespace] = dict((token, index) for index, token in enumerate(vocab_namespace)) if self.valid(): return True else: raise ConfigureError("Vocabulary valid error")
def _process(self, example): #example['label'] = example['label'][0] fields: Dict[str, Field] = {} if 'premise' in example: tokenized_premise = self._tokenizer.tokenize(example['premise']) fields["premise"] = TextField(tokenized_premise, self._token_indexers, max_length=self._max_length) if 'hypothesis' in example: tokenized_hypothesis = self._tokenizer.tokenize( example['hypothesis']) fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers, max_length=self._max_length) if 'label' in example: if isinstance(example['label'], list): if self._num_label is None: raise ConfigureError( "the number of labels is not provided for multi-label classification." ) fields['label'] = MultiLabelField(example['label'], num_label=self._num_label) else: fields['label'] = LabelField(example['label']) return Instance(fields)
def pop_choice(self, path, choice, default=None): value = self.pop(path, default) if value not in choice: raise ConfigureError( "value %s get by key %s is not in acceptable choices %s" % (value, path, str(choice))) return value
def init_from_params(cls, params, vocab): config_file = params.pop('config_file', None) if config_file is None: raise ConfigureError( "Please provide ELMo config file for ELMo embedding.") # weight_file = params.pop('weight_file', None) # if weight_file is None: # logger.warning("The ELMo embedding is initialize randomly.") encoder_name = params.pop("encoder_name", "elmo") vocab_namespace = params.pop('namespace', 'elmo_characters') dropout_rate = params.pop_float('dropout_rate', 0.0) ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None) weight_file = params.pop('weight_file', None) if ckpt_to_initialize_from is None and weight_file is None: logger.warning("The ELMo embedding is initialize randomly.") # tmp_dir = params.pop('tmp_dir', None) # if tmp_dir is None: # if weight_file: # tmp_dir = os.path.dirname(weight_file) # else: # tmp_dir = "./" params.assert_empty(cls.__name__) return cls(config_file=config_file, ckpt_to_initialize_from=ckpt_to_initialize_from, dropout_rate=dropout_rate, encoder_name=encoder_name, vocab_namespace=vocab_namespace, weight_file=weight_file)
def init_from_params(cls, params, vocab): config_file = params.pop('config_file', None) if config_file is None: raise ConfigureError("Please provide bert config file for bert embedding.") old_vocab_file = params.pop('vocab_file', None) if old_vocab_file is None: logger.warning("The vocab file is not provided. We consider the embedding vocab is the same as the data " "vocab acquiescently.") ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None) if ckpt_to_initialize_from is None: logger.warning("The bert embedding is initialize randomly.") num_oov_buckets = params.pop_int("num_oov_buckets", 0) use_one_hot_embeddings = params.pop_bool("use_one_hot_embeddings", False) encoder_name = params.pop("encoder_name", "bert") vocab_namespace = params.pop("namespace", 'tokens') mask_namespace = params.pop("mask_namespace", None) new_vocab_file = vocab.get_vocab_path(vocab_namespace) new_vocab_size = vocab.get_vocab_size(vocab_namespace) projection_dim = params.pop_int("projection_dim", None) dropout_rate = params.pop_float("dropout_rate", 0.0) remove_bos_eos = params.pop_bool("remove_bos_eos", True) params.assert_empty(cls.__name__) return cls(config_file=config_file, ckpt_to_initialize_from=ckpt_to_initialize_from, new_vocab_file=new_vocab_file, new_vocab_size=new_vocab_size, num_oov_buckets= num_oov_buckets, old_vocab_file=old_vocab_file, vocab_namespace=vocab_namespace, remove_bos_eos = remove_bos_eos, mask_namespace=mask_namespace, projection_dim=projection_dim, dropout_rate=dropout_rate, use_one_hot_embeddings=use_one_hot_embeddings, encoder_name=encoder_name)
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): if token.text is None: raise ConfigureError( 'CharactersIndexer needs a tokenizer that retains text') for character in self._character_tokenizer.tokenize(token.text): # If `text_id` is set on the character token (e.g., if we're using byte encoding), we # will not be using the vocab for this character. if getattr(character, 'text_id', None) is None: counter[self._namespace][character.text] += 1
def _read(self, mode: str): filename = self.get_filename_by_mode(mode) if filename: file_path = os.path.join(self._data_path, filename) if file_path.lower().endswith("jsonl"): if self._field_mapping is None: raise ConfigureError( "field mapping is not provided for jsonl file.") with open(file_path, 'r') as json_file: logger.info("Reading instances from jsonl dataset at: %s", file_path) for line in json_file: fields = json.loads(line) example = {} for (field_tar, field_src) in self._field_mapping.items(): example[field_tar] = fields[field_src] yield self._process(example) # example = {} # example['premise'] = fields['answer'] # example['hypothesis'] = fields['question'] # example['label'] = fields['label'] # yield self._process(example) if file_path.lower().endswith("tsv"): if self._field_mapping is None: raise ConfigureError( "field mapping is not provided for tsv file.") with open(file_path, 'r') as csv_file: logger.info("Reading instances from tsv dataset at: %s", file_path) for line in csv_file: fields = line.strip().split("\t") example = {} for (field_tar, field_src) in self._field_mapping.items(): example[field_tar] = fields[int(field_src)] yield self._process(example) else: return None
def takes_arg(obj, arg: str) -> bool: """ Checks whether the provided obj takes a certain arg. If it's a class, we're really checking whether its constructor does. If it's a function or method, we're checking the object itself. Otherwise, we raise an error. """ if inspect.isclass(obj): signature = inspect.signature(obj.__init__) elif inspect.ismethod(obj) or inspect.isfunction(obj): signature = inspect.signature(obj) else: raise ConfigureError(f"object {obj} is not callable") return arg in signature.parameters
def _make_loss(self, logits, labels, params): task = params.get('task', 'classification') task_type = params.get('task_type', 'multiclass') if task == 'classification': if task_type == 'multiclass': #loss = GHM_Loss().ghm_class_loss(logits=logits, targets=labels) loss = focal_loss(logits=logits, labels=labels) # loss = tf.reduce_mean( # tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)) elif task_type == 'multilabel': loss = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) elif task_type == 'topk': loss = multilabel_categorical_crossentropy(labels=labels, logits=logits) else: raise ConfigureError("Task type %s is not support for task %s. " "Only multiclass and multilabel is support for task %s" % (task_type, task, task)) elif task == 'rank': loss = rank_hinge_loss(labels=labels, logits=logits, params=params) else: raise ConfigureError( "Task %s is not support. Only task and classification tasks are supported" % task) return loss
def _read_pretrained_embeddings_text(pretrained_file, embedding_dim, vocab, vocab_namespace): vocab_tokens = vocab.get_vocab_tokens(vocab_namespace) vocab_size = vocab.get_vocab_size(vocab_namespace) embeddings = {} logger.info("Reading pretrained embeddings from: %s" % pretrained_file) with open(pretrained_file, 'r', encoding='utf-8') as embeddings_file: for line in tqdm.tqdm(embeddings_file): token = line.split(" ", 1)[0] if token in vocab_tokens: fields = line.rstrip().split(' ') if len(fields) - 1 != embedding_dim: logger.warning( "Found line with wrong number of dimensions (expected: %d; actual: %d): %s", embedding_dim, len(fields) - 1, line) continue vector = np.asarray(fields[1:], dtype='float32') embeddings[token] = vector if not embeddings: ConfigureError( "The embedding_dim or vocabulary does not fit the pretrained embedding." ) all_embeddings = np.asarray(list(embeddings.values())) embeddings_mean = float(np.mean(all_embeddings)) embeddings_std = float(np.std(all_embeddings)) embedding_matrix = np.random.normal(embeddings_mean, embeddings_std, (vocab_size, embedding_dim)) embedding_matrix = embedding_matrix.astype(np.float32) num_tokens_found = 0 index_to_tokens = vocab.get_vocab_index_to_token(vocab_namespace) for i in range(vocab_size): token = index_to_tokens[i] if token in embeddings: embedding_matrix[i] = embeddings[token] num_tokens_found += 1 else: logger.debug( "Token %s was not found in the embedding file. Initialising randomly.", token) logger.info("Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size) return embedding_matrix
def rank_hinge_loss(labels, logits, params): num_retrieval = params.get('num_retrieval', None) if num_retrieval is None: raise ConfigureError( "The parameter num_retrieval is not assigned or the dataset is not support rank loss." ) margin = params.get('rank_loss_margin', 1.0) labels = tf.argmax(labels, axis=-1) labels = tf.reshape(labels, (-1, num_retrieval)) logits = tf.reshape(logits, (-1, num_retrieval)) label_mask = tf.cast(tf.sign(labels), tf.float32) label_count = tf.reduce_sum(label_mask, axis=-1) y_pos = tf.reduce_sum(label_mask * logits, axis=-1) / label_count y_neg = tf.reduce_sum( (1. - label_mask) * logits, axis=-1) / (num_retrieval - label_count) loss = tf.maximum(0., margin - y_pos + y_neg) loss = tf.reduce_mean(loss) return loss
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary): # TODO(brendanr): Retain the token to index mappings in the vocabulary and remove this # pylint pragma. See: # https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113 # pylint: disable=unused-argument texts = [token.text for token in tokens] texts = [ELMoCharacterMapper.bos_token ] + texts + [ELMoCharacterMapper.eos_token] if any(text is None for text in texts): raise ConfigureError( 'ELMoTokenCharactersIndexer needs a tokenizer ' 'that retains text') return { self._namespace: [ np.array(ELMoCharacterMapper.convert_word_to_char_ids(text), dtype=np.int64) for text in texts ] }
def init_from_params(cls, params, vocab): token_embedder_params = params.pop('encoders', None) if token_embedder_params is not None: token_embedders = [ Encoder.init_from_params(subparams, vocab=vocab) for name, subparams in token_embedder_params.items() ] # if isinstance(token_embedder_params, Dict): # # else: # token_embedders = [ # Encoder.init_from_params(subparams, vocab=vocab) # for subparams in token_embedder_params # ] else: raise ConfigureError("The parameters of embeddings is not provided.") params.assert_empty(cls.__name__) return cls(token_embedders)
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary): indices = [] for token in itertools.chain(self._start_tokens, tokens, self._end_tokens): token_indices = np.zeros(self._max_word_length, dtype=np.int64) if token.text is None: raise ConfigureError( 'TokenCharactersIndexer needs a tokenizer that retains text' ) for character_idx, character in enumerate( self._character_tokenizer.tokenize(token.text)): if character_idx >= self._max_word_length: break else: if getattr(character, 'text_id', None) is not None: # `text_id` being set on the token means that we aren't using the vocab, we just # use this id instead. index = character.text_id else: index = vocabulary.get_token_index( character.text, self._namespace) token_indices[character_idx] = index indices.append(token_indices) return {self._namespace: indices}
def assert_empty(self, class_name): if self._params: raise ConfigureError( "Extra parameters are provided %s for class %s" % (str(self._params), class_name))
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get( 'hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError( "The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get( 'premise/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) hyp_mask = tf.expand_dims(hyp_mask, -1) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get( 'hypothesis/elmo_characters', None) h_s, c1 = nn.lstm(premise_tokens, self._hidden_dim, seq_len=prem_seq_lengths, name='premise') h_t, c2 = nn.lstm(hypothesis_tokens, self._hidden_dim, seq_len=hyp_seq_lengths, name='hypothesis') lstm_m = MatchLSTMCell(self._hidden_dim, h_s, prem_mask) k_m, _ = tf.nn.dynamic_rnn(lstm_m, h_t, hyp_seq_lengths, dtype=tf.float32) k_valid = select(k_m, hyp_seq_lengths) output_dict = self._make_output(k_valid, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) # metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) if features.get( 'premise/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) premise_outs, c1 = nn.bi_lstm(premise_tokens, self._hidden_dim, seq_len=prem_seq_lengths, name='premise') premise_bi = tf.concat(premise_outs, axis=2) premise_bi = premise_bi * prem_mask eps = 1e-11 ### Mean pooling premise_sum = tf.reduce_sum(premise_bi, 1) premise_ave = tf.div( premise_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1) + eps) # MLP layer h_mlp = tf.contrib.layers.fully_connected(premise_ave, self._hidden_dim, scope='fc1') # Dropout applied to classifier h_drop = tf.layers.dropout(h_mlp, self._dropout_rate, training=is_training) # Get prediction output_dict = self._make_output(h_drop, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) metrics['map'] = tf.metrics.average_precision_at_k( labels=tf.cast(labels, tf.int64), predictions=output_dict['logits'], k=2) metrics['precision_1'] = tf.metrics.precision_at_k( labels=tf.cast(labels, tf.int64), predictions=output_dict['logits'], k=1, class_id=1) #tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get( 'hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError( "The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get( 'premise/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 # prem_mask = tf.expand_dims(prem_mask, -1) # hyp_mask = tf.expand_dims(hyp_mask, -1) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get( 'hypothesis/elmo_characters', None) with tf.variable_scope("Attend"): F_a_bar = self._feedForwardBlock(premise_tokens, self._hidden_dim, 'F', is_training=is_training) F_b_bar = self._feedForwardBlock(hypothesis_tokens, self._hidden_dim, 'F', isReuse=True, is_training=is_training) # e_i,j = F'(a_hat, b_hat) = F(a_hat).T * F(b_hat) (1) #alignment_attention = Attention(self.hidden_size, self.hidden_size) #alpha = alignment_attention(F_b_bar, F_a_bar, keys_mask=self.query_mask) #beta = alignment_attention(F_a_bar, F_b_bar, keys_mask=self.doc_mask) alpha, beta = nn.bi_uni_attention(F_a_bar, F_b_bar, query_len=prem_seq_lengths, key_len=hyp_seq_lengths) with tf.variable_scope("Compare"): a_beta = tf.concat([premise_tokens, alpha], axis=2) b_alpha = tf.concat([hypothesis_tokens, beta], axis=2) # v_1,i = G([a_bar_i, beta_i]) # v_2,j = G([b_bar_j, alpha_j]) (3) v_1 = self._feedForwardBlock(a_beta, self._hidden_dim, 'G', is_training=is_training) v_2 = self._feedForwardBlock(b_alpha, self._hidden_dim, 'G', isReuse=True, is_training=is_training) with tf.variable_scope("Aggregate"): # v1 = \sum_{i=1}^l_a v_{1,i} # v2 = \sum_{j=1}^l_b v_{2,j} (4) v1_sum = tf.reduce_sum(v_1, axis=1) v2_sum = tf.reduce_sum(v_2, axis=1) # y_hat = H([v1, v2]) (5) v = tf.concat([v1_sum, v2_sum], axis=1) ff_outputs = self._feedForwardBlock(v, self._hidden_dim, 'H', is_training=is_training) output_dict = self._make_output(ff_outputs, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." 
% mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) #metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [tf.shape(hypothesis_tokens), tf.shape(premise_tokens), # tf.shape(alpha), tf.shape(beta)] return output_dict
def forward(self, features, labels, mode, params): eps = 1e-12 features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get( 'hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError( "The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get( 'hypothesis/elmo_characters', None) s = self._max_length d0 = premise_tokens.get_shape()[2] # zero padding to inputs for wide convolution def pad_for_wide_conv(x): return tf.pad( x, np.array([[0, 0], [0, 0], [self._kernel_size - 1, self._kernel_size - 1], [0, 0]]), "CONSTANT", name="pad_wide_conv") def cos_sim(v1, v2): norm1 = tf.sqrt(tf.reduce_sum(tf.square(v1), axis=1)) norm2 = tf.sqrt(tf.reduce_sum(tf.square(v2), axis=1)) dot_products = tf.reduce_sum(v1 * v2, axis=1, name="cos_sim") return dot_products / (norm1 * norm2 + eps) def make_attention_mat(x1, x2): # x1, x2 = [batch, height, width, 1] = [batch, d, s, 1] # x2 => [batch, height, 1, width] # [batch, width, wdith] = [batch, s, s] euclidean = tf.sqrt( tf.reduce_sum(tf.square(x1 - tf.matrix_transpose(x2)), axis=1) + eps) return 1.0 / (1.0 + euclidean) def convolution(name_scope, x, d, reuse): with tf.name_scope(name_scope + "-conv"): with tf.variable_scope("conv") as scope: conv = tf.contrib.layers.conv2d( inputs=x, num_outputs=self._hidden_dim, kernel_size=(d, self._kernel_size), stride=1, padding="VALID", activation_fn=tf.nn.tanh, weights_initializer=tf.contrib.layers. 
xavier_initializer_conv2d(), #weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg), biases_initializer=tf.constant_initializer(1e-04), reuse=reuse, trainable=True, scope=scope) # Weight: [filter_height, filter_width, in_channels, out_channels] # output: [batch, 1, input_width+filter_Width-1, out_channels] == [batch, 1, s+w-1, di] # [batch, di, s+w-1, 1] conv_trans = tf.transpose(conv, [0, 3, 2, 1], name="conv_trans") return conv_trans def w_pool(variable_scope, x, attention): # x: [batch, di, s+w-1, 1] # attention: [batch, s+w-1] with tf.variable_scope(variable_scope + "-w_pool"): if self._model_type == "ABCNN2" or self._model_type == "ABCNN3": pools = [] # [batch, s+w-1] => [batch, 1, s+w-1, 1] attention = tf.transpose( tf.expand_dims(tf.expand_dims(attention, -1), -1), [0, 2, 1, 3]) for i in range(s): # [batch, di, w, 1], [batch, 1, w, 1] => [batch, di, 1, 1] pools.append( tf.reduce_sum( x[:, :, i:i + self._kernel_size, :] * attention[:, :, i:i + self._kernel_size, :], axis=2, keep_dims=True)) # [batch, di, s, 1] w_ap = tf.concat(pools, axis=2, name="w_ap") else: w_ap = tf.layers.average_pooling2d( inputs=x, # (pool_height, pool_width) pool_size=(1, self._kernel_size), strides=1, padding="VALID", name="w_ap") # [batch, di, s, 1] return w_ap def all_pool(variable_scope, x): with tf.variable_scope(variable_scope + "-all_pool"): if variable_scope.startswith("input"): pool_width = s d = d0 else: pool_width = s + self._kernel_size - 1 d = self._hidden_dim all_ap = tf.layers.average_pooling2d( inputs=x, # (pool_height, pool_width) pool_size=(1, pool_width), strides=1, padding="VALID", name="all_ap") # [batch, di, 1, 1] # [batch, di] all_ap_reshaped = tf.reshape(all_ap, [-1, d]) # all_ap_reshaped = tf.squeeze(all_ap, [2, 3]) return all_ap_reshaped def CNN_layer(variable_scope, x1, x2, d): # x1, x2 = [batch, d, s, 1] with tf.variable_scope(variable_scope): if self._model_type == "ABCNN1" or self._model_type == "ABCNN3": with tf.name_scope("att_mat"): aW = tf.get_variable( name="aW", shape=(s, d), initializer=tf.contrib.layers. 
xavier_initializer(), #regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg) ) # [batch, s, s] att_mat = make_attention_mat(x1, x2) # [batch, s, s] * [s,d] => [batch, s, d] # matrix transpose => [batch, d, s] # expand dims => [batch, d, s, 1] x1_a = tf.expand_dims( tf.matrix_transpose( tf.einsum("ijk,kl->ijl", att_mat, aW)), -1) x2_a = tf.expand_dims( tf.matrix_transpose( tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW)), -1) # [batch, d, s, 2] x1 = tf.concat([x1, x1_a], axis=3) x2 = tf.concat([x2, x2_a], axis=3) left_conv = convolution(name_scope="left", x=pad_for_wide_conv(x1), d=d, reuse=False) right_conv = convolution(name_scope="right", x=pad_for_wide_conv(x2), d=d, reuse=True) left_attention, right_attention = None, None if self._model_type == "ABCNN2" or self._model_type == "ABCNN3": # [batch, s+w-1, s+w-1] att_mat = make_attention_mat(left_conv, right_conv) # [batch, s+w-1], [batch, s+w-1] left_attention, right_attention = tf.reduce_sum( att_mat, axis=2), tf.reduce_sum(att_mat, axis=1) left_wp = w_pool(variable_scope="left", x=left_conv, attention=left_attention) left_ap = all_pool(variable_scope="left", x=left_conv) right_wp = w_pool(variable_scope="right", x=right_conv, attention=right_attention) right_ap = all_pool(variable_scope="right", x=right_conv) return left_wp, left_ap, right_wp, right_ap x1_expanded = tf.expand_dims( tf.transpose(premise_tokens, [0, 2, 1]), -1) x2_expanded = tf.expand_dims( tf.transpose(hypothesis_tokens, [0, 2, 1]), -1) LO_0 = all_pool(variable_scope="input-left", x=x1_expanded) RO_0 = all_pool(variable_scope="input-right", x=x2_expanded) LI_1, LO_1, RI_1, RO_1 = CNN_layer(variable_scope="CNN-1", x1=x1_expanded, x2=x2_expanded, d=d0) sims = [cos_sim(LO_0, RO_0), cos_sim(LO_1, RO_1)] #if self._num_layers > 1: for i in range(1, self._num_layers): _, LO_2, _, RO_2 = CNN_layer(variable_scope="CNN-2", x1=LI_1, x2=RI_1, d=self._hidden_dim) # self.test = LO_2 # self.test2 = RO_2 sims.append(cos_sim(LO_2, RO_2)) with tf.variable_scope("output-layer"): output_features = tf.concat([tf.stack(sims, axis=1)], axis=1, name="output_features") output_dict = self._make_output(output_features, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) metrics['auc'] = tf.metrics.auc( labels=labels, predictions=output_dict['predictions']) output_dict['metrics'] = metrics return output_dict
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) #########Word Embedding#################### premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get( 'hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError( "The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get( 'premise/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) hyp_mask = tf.expand_dims(hyp_mask, -1) premise_ins = [] hypothesis_ins = [] premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get( 'hypothesis/elmo_characters', None) premise_ins.append(premise_tokens) hypothesis_ins.append(hypothesis_tokens) premise_chars = features_embedding.get('premise/chars', None) hypothesis_chars = features_embedding.get('hypothesis/chars', None) if premise_chars is not None and hypothesis_chars is not None: with tf.variable_scope("conv") as scope: conv_pre = nn.multi_conv1d_max( premise_chars, self._char_filter_size, self._char_filter_channel_dims, "VALID", is_training, self._dropout_rate, scope='conv') scope.reuse_variables() conv_hyp = nn.multi_conv1d_max( hypothesis_chars, self._char_filter_size, self._char_filter_channel_dims, "VALID", is_training, self._dropout_rate, scope='conv') # conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size]) # conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size]) premise_ins.append(conv_pre) hypothesis_ins.append(conv_hyp) premise_pos = features_embedding.get('premise/pos_tags', None) hypothesis_pos = features_embedding.get('hypothesis/pos_tags', None) if premise_pos is not None and hypothesis_pos is not None: premise_ins.append(premise_pos) hypothesis_ins.append(hypothesis_pos) premise_exact_match = features.get('premise/exact_match_labels', None) hypothesis_exact_match = features.get( 'hypothesis/exact_match_labels', None) if premise_exact_match is not None and hypothesis_exact_match is not None: premise_ins.append( tf.expand_dims(tf.cast(premise_exact_match, tf.float32), -1)) hypothesis_ins.append( tf.expand_dims(tf.cast(hypothesis_exact_match, tf.float32), -1)) premise_in = tf.concat(premise_ins, axis=2) hypothesis_in = tf.concat(hypothesis_ins, 
axis=2) premise_in = nn.highway_network(premise_in, 2, output_size=self._hidden_dim, dropout_rate=self._dropout_rate, is_trainging=is_training, scope="premise_highway") hypothesis_in = nn.highway_network(hypothesis_in, 2, output_size=self._hidden_dim, dropout_rate=self._dropout_rate, is_trainging=is_training, scope="hypothesis_highway") ########Attention Stack-GRU################ def gru_network(input, input_len, name="gru_network"): with tf.variable_scope(name): gru_input = input for i in range(self._num_rnn_layer): with tf.variable_scope("layer_%s" % i): seq, c1 = nn.gru(gru_input, self._hidden_dim, seq_len=input_len, initializer=self._initializer) gru_input = tf.concat([gru_input, seq], axis=2) return gru_input premise_gru = gru_network(premise_in, prem_seq_lengths, name='premise_gru_network') hypothesis_gru = gru_network(hypothesis_in, hyp_seq_lengths, name='hypothesis_gru_network') premise_gru = premise_gru * prem_mask hypothesis_gru = hypothesis_gru * hyp_mask ######### premise_att = nn.attention_pool(premise_gru, self._hidden_dim, seq_len=prem_seq_lengths, initializer=self._initializer, name='premise_attention_pool') hypothesis_att = nn.attention_pool( hypothesis_gru, self._hidden_dim, seq_len=hyp_seq_lengths, initializer=self._initializer, name='hypothesis_attention_pool') ############Dynamic Re-read Mechanism################ def dynamic_reread(h_seq_a, h_a, h_b, h_a_len, name="dymanic_reread"): with tf.variable_scope(name): h_a_pre = h_a # h_a_pre = nn.highway_layer(h_a, self._hidden_dim, initializer=self._initializer, # scope="h_a_pre_highway") # h_seq_a = nn.highway_layer(h_seq_a, self._hidden_dim, initializer=self._initializer, # scope="h_seq_a_highway") # h_b = nn.highway_layer(h_b, self._hidden_dim, initializer=self._initializer, # scope="h_b_highway") ##### w_d = tf.get_variable( "w_d_weights", (h_seq_a.shape[-1].value, h_a_pre.shape[-1].value), initializer=self._initializer) u_d = tf.get_variable( "u_d_weights", (h_a_pre.shape[-1].value, h_a_pre.shape[-1].value), initializer=self._initializer) m_d = tf.get_variable( "m_d_weights", (h_b.shape[-1].value, h_a_pre.shape[-1].value), initializer=self._initializer) omega_d = tf.get_variable("omega_d_weights", (h_a_pre.shape[-1].value, 1), initializer=self._initializer) ########## m_d_h_b = tf.tensordot(h_b, m_d, axes=[-1, 0]) h_seq_a_w_d = tf.tensordot(h_seq_a, w_d, axes=[-1, 0]) if h_a_len is not None: mask = tf.expand_dims(tf.sequence_mask( h_a_len, tf.shape(h_seq_a)[1], dtype=tf.float32), axis=2) else: mask = None gru_cell = tf.nn.rnn_cell.GRUCell( h_a_pre.shape[-1].value, kernel_initializer=self._initializer) for i in range(self._reread_length): u_d_h_a_pre = tf.tensordot(h_a_pre, u_d, axes=[-1, 0]) m_a = tf.nn.tanh( h_seq_a_w_d + tf.expand_dims(m_d_h_b + u_d_h_a_pre, 1)) m_a = tf.tensordot(m_a, omega_d, axes=[-1, 0]) if mask is not None: m_a = m_a + (1. 
- mask) * tf.float32.min alpha = tf.nn.softmax(self._beta * m_a, axis=1) alpha = tf.reduce_sum(alpha * h_seq_a, axis=1) gru_output, gru_state = gru_cell(alpha, h_a_pre) h_a_pre = gru_state return gru_output premise_v = dynamic_reread(premise_gru, premise_att, hypothesis_att, prem_seq_lengths, name='premise_dynamic_reread') hypothesis_v = dynamic_reread(hypothesis_gru, hypothesis_att, premise_att, hyp_seq_lengths, name='hypothesis_dynamic_reread') ########label prediction############## h = tf.concat([ premise_att, hypothesis_att, hypothesis_att * premise_att, hypothesis_att - premise_att ], axis=-1) v = tf.concat([ premise_v, hypothesis_v, hypothesis_v * premise_v, hypothesis_v - premise_v ], axis=-1) # h MLP layer h_mlp = tf.layers.dense(h, self._hidden_dim, activation=tf.nn.relu, kernel_initializer=self._initializer, name='h_fc1') # Dropout applied to classifier h_drop = tf.layers.dropout(h_mlp, self._dropout_rate, training=is_training) # Get prediction h_logits = tf.layers.dense(h_drop, self._num_classes, activation=None, kernel_initializer=self._initializer, name='h_logits') p_h = tf.nn.softmax(h_logits) # # MLP layer v_mlp = tf.layers.dense(v, self._hidden_dim, activation=tf.nn.relu, kernel_initializer=self._initializer, name='v_fc1') # Dropout applied to classifier v_drop = tf.layers.dropout(v_mlp, self._dropout_rate, training=is_training) # Get prediction v_logits = tf.layers.dense(v_drop, self._num_classes, activation=None, kernel_initializer=self._initializer, name='v_logits') p_v = tf.nn.softmax(v_logits) #### alpha_h = tf.layers.dense(h, 1, activation=tf.nn.sigmoid, kernel_initializer=self._initializer, bias_initializer=tf.zeros_initializer()) alpha_v = tf.layers.dense(v, 1, activation=tf.nn.sigmoid, kernel_initializer=self._initializer, bias_initializer=tf.zeros_initializer()) # # h MLP layer fuse_mlp = tf.layers.dense(alpha_h * h + alpha_v * v, self._hidden_dim, activation=tf.nn.relu, kernel_initializer=self._initializer, name='fuse_fc1') # Dropout applied to classifier fuse_drop = tf.layers.dropout(fuse_mlp, self._dropout_rate, training=is_training) #Get prediction output_dict = self._make_output(fuse_drop, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] h_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2( labels=labels_embedding, logits=h_logits)) v_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2( labels=labels_embedding, logits=v_logits)) fuse_loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = v_loss + h_loss + fuse_loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward(features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError("The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get('premise/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) hyp_mask = tf.expand_dims(hyp_mask, -1) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get('premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None) # 2.Input Encoder # 2.1 Highway Encoder query_emb = premise_tokens doc_emb = hypothesis_tokens query_len = prem_seq_lengths doc_len = hyp_seq_lengths query_mask = prem_mask doc_mask = hyp_mask project_dim = premise_tokens.shape[-1].value query_length = tf.shape(premise_tokens)[1] doc_length = tf.shape(hypothesis_tokens)[1] query_output = nn.highway_network(query_emb, 1, dropout_rate=self._dropout_rate, is_trainging=is_training, scope="query_highway") doc_output = nn.highway_network(doc_emb, 1, dropout_rate=self._dropout_rate, is_trainging=is_training, scope="doc_highway") # # 2.2 Co-Attention M = tf.Variable(tf.random_normal([project_dim, project_dim], stddev=0.1)) tmp = tf.einsum("ijk,kl->ijl", query_output, M) S = tf.matmul(tmp, doc_output, transpose_b=True) # [batch, q, d] S_mask = tf.matmul(query_mask, doc_mask, transpose_b=True) S_mean = S * S_mask # S_align_max = S + (1. 
- S_mask) * tf.float32.min # 2.2.1 Extractive Pooling # Max Pooling query_score = tf.nn.softmax(tf.reduce_max(S_align_max, axis=2, keepdims=True), axis=1) query_maxpooling = tf.reduce_sum(query_score * query_output, axis=1) # [batch, r] doc_score = tf.nn.softmax(tf.reduce_max(S_align_max, axis=1, keepdims=True), axis=2) doc_maxpooling = tf.reduce_sum(tf.transpose(doc_score, [0, 2, 1]) * doc_output, axis=1) # [batch, r] # Mean Pooling query_score = tf.nn.softmax(tf.reduce_sum(S_mean, axis=2, keepdims=True)/(tf.expand_dims(tf.expand_dims(tf.cast(doc_len, tf.float32)+self._eps, -1), -1)), axis=1) query_meanpooling = tf.reduce_sum(query_score * query_output, axis=1) # [batch, r] doc_score = tf.nn.softmax(tf.reduce_sum(S_mean, axis=1, keepdims=True)/(tf.expand_dims(tf.expand_dims(tf.cast(query_len, tf.float32)+self._eps, -1), -1)), axis=2) doc_meanpooling = tf.reduce_sum(tf.transpose(doc_score, [0, 2, 1]) * doc_output, axis=1) # [batch, r] # 2.2.2 Alignment Pooling query_alignment = tf.matmul(tf.nn.softmax(S_align_max, axis=2), doc_output) # [batch, q, r] doc_alignment = tf.matmul(tf.nn.softmax(S_align_max, axis=1), query_output, transpose_a=True) # [batch, d, r] # 2.2.3 Intra Attention query_selfattn = nn.self_attention(query_output, query_len) doc_selfattn = nn.self_attention(doc_output, doc_len) # 2.3 Multi-Cast Attention query_maxpooling = tf.tile(tf.expand_dims(query_maxpooling, axis=1), [1, query_length, 1]) query_meanpooling = tf.tile(tf.expand_dims(query_meanpooling, axis=1), [1, query_length, 1]) doc_maxpooling = tf.tile(tf.expand_dims(doc_maxpooling, axis=1), [1, doc_length, 1]) doc_meanpooling = tf.tile(tf.expand_dims(doc_meanpooling, axis=1), [1, doc_length, 1]) query_max_fc, query_max_fm, query_max_fs = self.cast_attention(query_maxpooling, query_emb, self.nn_fc, name="query_max_pooling") query_mean_fc, query_mean_fm, query_mean_fs = self.cast_attention(query_meanpooling, query_emb, self.nn_fc, name="query_mean_pooling") query_align_fcm, query_align_fm, query_align_fs = self.cast_attention(query_alignment, query_emb, self.nn_fc, name="query_align_pooling") query_selfattn_fc, query_selfattn_fm, query_selfattn_fs = self.cast_attention(query_selfattn, query_emb, self.nn_fc, name="query_self_pooling") doc_max_fc, doc_max_fm, doc_max_fs = self.cast_attention(doc_maxpooling, doc_emb, self.nn_fc, name="doc_max_pooling") doc_mean_fc, doc_mean_fm, doc_mean_fs = self.cast_attention(doc_meanpooling, doc_emb, self.nn_fc, name="doc_mean_pooling") doc_align_fcm, doc_align_fm, doc_align_fs = self.cast_attention(doc_alignment, doc_emb, self.nn_fc, name="doc_align_pooling") doc_selfattn_fc, doc_selfattn_fm, doc_selfattn_fs = self.cast_attention(doc_selfattn, doc_emb, self.nn_fc, name="doc_self_pooling") query_cast = tf.concat( [query_max_fc, query_max_fm, query_max_fs, query_mean_fc, query_mean_fm, query_mean_fs, query_align_fcm, query_align_fm, query_align_fs, query_selfattn_fc, query_selfattn_fm, query_selfattn_fs, query_output], axis=2) doc_cast = tf.concat( [doc_max_fc, doc_max_fm, doc_max_fs, doc_mean_fc, doc_mean_fm, doc_mean_fs, doc_align_fcm, doc_align_fm, doc_align_fs, doc_selfattn_fc, doc_selfattn_fm, doc_selfattn_fs, doc_output], axis=2) # query_cast = tf.concat( # [ # query_output], # axis=2) # doc_cast = tf.concat( # [doc_output], axis=2) query_cast = tf.layers.dropout(query_cast, self._dropout_rate, training=is_training) doc_cast = tf.layers.dropout(doc_cast, self._dropout_rate, training=is_training) query_hidden, _ = nn.bi_lstm(query_cast, self._hidden_dim, name="query_lstm") 
doc_hidden, _ = nn.bi_lstm(doc_cast, self._hidden_dim, name="doc_lstm") query_hidden = tf.concat(query_hidden, axis=2) doc_hidden = tf.concat(doc_hidden, axis=2) query_hidden = tf.layers.dropout(query_hidden, self._dropout_rate, training=is_training) doc_hidden = tf.layers.dropout(doc_hidden, self._dropout_rate, training=is_training) #query_hidden_max = query_hidden + (1. - query_mask) * tf.float32.min #doc_hidden_max = doc_hidden + (1. - doc_mask) * tf.float32.min query_hidden_mean = query_hidden * query_mask doc_hidden_mean = doc_hidden * doc_mask query_sum = tf.reduce_sum(query_hidden_mean, axis=1) query_mean = tf.div(query_sum, tf.expand_dims(tf.cast(query_len, tf.float32), -1) + self._eps) query_max = tf.reduce_max(query_hidden_mean, axis=1) query_final = tf.concat([query_mean, query_max], axis=1) doc_sum = tf.reduce_sum(doc_hidden_mean, axis=1) doc_mean = tf.div(doc_sum, tf.expand_dims(tf.cast(doc_len, tf.float32), -1) + self._eps) doc_max = tf.reduce_max(doc_hidden_mean, axis=1) doc_final = tf.concat([doc_mean, doc_max], axis=1) final = tf.concat([query_final, doc_final, query_final * doc_final, query_final - doc_final], axis=1) #yout = nn.highway_network(final, 2, dropout_rate=self._drop_rate, is_trainging=is_training) # MLP layer yout = tf.contrib.layers.fully_connected(final, self._hidden_dim, scope='fc1') # Dropout applied to classifier output_dict = self._make_output(yout, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError("The input features should contain label with vocabulary namespace " "labels int %s dataset."%mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions']) output_dict['metrics'] = metrics # output_dict['debugs'] = [] # debug_ops = [query_mean_fs]#[query_maxpooling, query_max_fc] [query_max_fm, query_max_fs],[query_mean_fc, query_mean_fm] , , # for op in debug_ops: # output_dict['debugs'].append(tf.shape(op)) # output_dict['debugs'].append(query_length) return output_dict
def forward(self, features, labels, mode, params): outputs = dict() is_training = (mode == tf.estimator.ModeKeys.TRAIN) for (feature_key, feature) in features.items(): if '/' not in feature_key: continue feature_namespace = feature_key.split("/")[1].strip() if feature_namespace == self._vocab_namespace: with tf.variable_scope("embedding/" + self._vocab_namespace, reuse=tf.AUTO_REUSE): if self._weight is None: if not self._trainable: logger.warning( "No pretrained embedding is assigned. The embedding should be trainable." ) logger.debug("loading random embedding.") if self._padding_zero: word_embeddings = tf.get_variable( "embedding_weight", shape=(self._num_embeddings - 1, self._embedding_dim), initializer=initializers.xavier_initializer(), trainable=self._trainable) pad_embeddings = tf.constant(np.zeros( [1, self._embedding_dim]), dtype=tf.float32) self._embeddings = tf.concat( [pad_embeddings, word_embeddings], axis=0) else: self._embeddings = tf.get_variable( "embedding_weight", shape=(self._num_embeddings, self._embedding_dim), initializer=initializers.xavier_initializer(), trainable=self._trainable) else: if self._weight.shape != (self._num_embeddings, self._embedding_dim): raise ConfigureError( "The parameter of embedding with shape (%s, %s), " "but the pretrained embedding with shape %s." % (self._num_embeddings, self._embedding_dim, self._weight.shape)) logger.debug( "loading pretrained embedding with trainable %s." % self._trainable) if self._padding_zero: word_embeddings = tf.get_variable( "embedding_weight", initializer=self._weight[1:, :], trainable=self._trainable) pad_embeddings = tf.constant(np.zeros( [1, self._embedding_dim]), dtype=tf.float32) self._embeddings = tf.concat( [pad_embeddings, word_embeddings], axis=0) else: self._embeddings = tf.get_variable( "embedding_weight", initializer=self._weight, trainable=self._trainable) # tf.Variable(self._weight, trainable=self._trainable, name='embedding_weight') emb = tf.nn.embedding_lookup(self._embeddings, feature) dropout_rate = params.get('dropout_rate') if dropout_rate is None: dropout_rate = self._dropout_rate emb_drop = tf.layers.dropout(emb, dropout_rate, training=is_training) if self._projection_dim: emb_drop = tf.layers.dense( emb_drop, self._projection_dim, use_bias=False, kernel_initializer=initializers.xavier_initializer( )) outputs[feature_key] = emb_drop return outputs
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError( "The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") premise_tokens = features_embedding.get('premise/tokens', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) hidden_size = premise_tokens.shape[-1].value with tf.variable_scope("pooler"): # We "pool" the model by simply taking the hidden state corresponding # to the first token. We assume that this has been pre-trained premise_first_token_tensor = tf.squeeze(premise_tokens[:, 0:1, :], axis=1) hypothesis_first_token_tensor = tf.squeeze( hypothesis_tokens[:, 0:1, :], axis=1) dense_input = tf.concat([ premise_first_token_tensor, hypothesis_first_token_tensor, premise_first_token_tensor - hypothesis_first_token_tensor, premise_first_token_tensor * hypothesis_first_token_tensor ], axis=-1) output_layer = tf.layers.dense( dense_input, hidden_size, activation=tf.tanh, kernel_initializer=create_initializer( self._initializer_range)) if is_training: # I.e., 0.1 dropout output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) output_dict = self._make_output(output_layer, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) # metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # v_1_ave, v_2_ave, h_mlp, logits] return output_dict
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward( features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError( "The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) if features.get( 'premise/elmo_characters', None) is not None or isinstance( self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 #prem_mask = tf.expand_dims(prem_mask, -1) prem_mask = tf.cast(prem_mask, tf.bool) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get( 'premise/elmo_characters', None) with tf.variable_scope('san_fb1'): x_fw1 = query_encode_san(premise_tokens, prem_mask, 'forward') # bs, ql, vec x_bw1 = query_encode_san(premise_tokens, prem_mask, 'backward') # bs, ql, vec x_fusion = fusion_gate(premise_tokens, prem_mask, x_fw1, x_bw1) # bs, ql, vec with tf.variable_scope('san_md'): x_code = query_encode_md(x_fusion, prem_mask) # bs, vec pre_logits = tf.nn.relu( linear(x_code, self._hidden_dim, True, scope='pre_logits_linear', is_train=True)) # bs, vec logits = linear(pre_logits, self._num_classes, False, scope='get_output', is_train=True) # bs, cn output_dict = self._make_output(logits, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError( "The input features should contain label with vocabulary namespace " "labels int %s dataset." % mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy( labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision( labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall( labels=labels, predictions=output_dict['predictions']) #tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
def get_vocab_index_to_token(self, namespace='tokens'):
    if namespace not in self._index_to_token:
        raise ConfigureError("namespace %s not in vocabulary." % namespace)
    return self._index_to_token[namespace]
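# Example usage (the namespace and label strings are illustrative only):
#   index_to_token = vocab.get_vocab_index_to_token(namespace='labels')
#   index_to_token  ->  {0: 'entailment', 1: 'neutral', 2: 'contradiction'}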
def __init__(self, data_reader=None, train_input_fn=None, valid_input_fn=None, test_input_fn=None,
             serving_feature_spec=None, model=None, hparams=HParams(), run_config: RunConfig = RunConfig()):
    if data_reader is not None and train_input_fn is None:
        self._train_input_fn, self._valid_input_fn, self._test_input_fn = self.make_input_fns(data_reader)
        self._serving_feature_spec = data_reader.get_raw_serving_input_receiver_features(DataSplit.EVAL)
    else:
        self._train_input_fn = train_input_fn
        self._valid_input_fn = valid_input_fn
        self._test_input_fn = test_input_fn
        self._serving_feature_spec = serving_feature_spec
    if self._train_input_fn is None:
        raise ConfigureError("The train dataset is not provided.")
    if data_reader:
        hparams.add_hparam("num_retrieval", data_reader.get_num_retrieval())
    if model is None:
        raise ConfigureError("Please provide model for training.")
    self._model_fn = model.make_estimator_model_fn()

    if hparams.per_process_gpu_memory_fraction is not None and 0 < hparams.per_process_gpu_memory_fraction <= 1:
        session_config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        session_config.gpu_options.per_process_gpu_memory_fraction = hparams.per_process_gpu_memory_fraction
        run_config = run_config.replace(session_config=session_config)

    self._estimator = tf.estimator.Estimator(model_fn=self._model_fn, config=run_config, params=hparams,
                                             warm_start_from=model.get_warm_start_setting())

    train_hooks = []
    if tf_version[1] >= 10 and tf_version[1] <= 13:
        early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
            self._estimator, metric_name='loss',
            max_steps_without_decrease=hparams.early_stopping_max_steps_without_decrease,
            min_steps=hparams.early_stopping_min_steps)
        train_hooks.append(early_stopping)

    exporters = None
    if self._serving_feature_spec:
        serving_input_receiver_fn = (
            tf.estimator.export.build_raw_serving_input_receiver_fn(self._serving_feature_spec))
        exporters = []
        if tf_version[1] >= 9:
            best_exporter = tf.estimator.BestExporter(
                name="best_exporter",
                serving_input_receiver_fn=serving_input_receiver_fn,
                exports_to_keep=5)
            exporters.append(best_exporter)
        latest_export = tf.estimator.LatestExporter(
            name='latest_exporter',
            serving_input_receiver_fn=serving_input_receiver_fn,
            exports_to_keep=5)
        exporters.append(latest_export)

    self._train_spec = tf.estimator.TrainSpec(input_fn=self._train_input_fn,
                                              max_steps=hparams.train_steps, hooks=train_hooks)
    if self._valid_input_fn:
        self._valid_spec = tf.estimator.EvalSpec(input_fn=self._valid_input_fn, steps=hparams.eval_steps,
                                                 exporters=exporters, throttle_secs=hparams.throttle_secs)
        # self._estimator.evaluate(self._valid_input_fn, steps=hparams.eval_steps, name=DataSplit.TEST)
        tf.estimator.train_and_evaluate(self._estimator, self._train_spec, self._valid_spec)
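# Hedged usage sketch for this trainer. The class name `Trainer` and the reader/model
# constructors are placeholders; only the keyword arguments mirror the signature above.
#
#   hparams = HParams(train_steps=20000, eval_steps=100, throttle_secs=60,
#                     early_stopping_max_steps_without_decrease=1000,
#                     early_stopping_min_steps=500,
#                     per_process_gpu_memory_fraction=0.9)
#   run_config = RunConfig(model_dir='/tmp/nli_model', save_checkpoints_steps=500)
#   Trainer(data_reader=my_data_reader, model=my_model,
#           hparams=hparams, run_config=run_config)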
def forward(self, features, labels, mode, params): global_step = tf.train.get_or_create_global_step() dropout_keep_rate = tf.train.exponential_decay(self._keep_prob, global_step, self._dropout_decay_step, self._dropout_decay_rate, staircase=False, name='dropout_keep_rate') tf.summary.scalar('dropout_keep_rate', dropout_keep_rate) params.add_hparam('dropout_rate', 1 - dropout_keep_rate) features_embedding = self._embedding_mapping.forward(features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError("The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get('premise/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) hyp_mask = tf.expand_dims(hyp_mask, -1) premise_ins = [] hypothesis_ins = [] premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get('premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None) premise_ins.append(premise_tokens) hypothesis_ins.append(hypothesis_tokens) premise_chars = features_embedding.get('premise/chars', None) hypothesis_chars = features_embedding.get('hypothesis/chars', None) if premise_chars is not None and hypothesis_chars is not None: with tf.variable_scope("conv") as scope: conv_pre = nn.multi_conv1d_max(premise_chars, self._char_filter_size, self._char_filter_channel_dims, "VALID", is_training, dropout_keep_rate, scope='conv') scope.reuse_variables() conv_hyp = nn.multi_conv1d_max(hypothesis_chars, self._char_filter_size, self._char_filter_channel_dims, "VALID", is_training, dropout_keep_rate, scope='conv') #conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size]) #conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size]) premise_ins.append(conv_pre) hypothesis_ins.append(conv_hyp) premise_pos = features_embedding.get('premise/pos_tags', None) hypothesis_pos = features_embedding.get('hypothesis/pos_tags', None) if premise_pos is not None and hypothesis_pos is not None: premise_ins.append(premise_pos) hypothesis_ins.append(hypothesis_pos) premise_exact_match = features.get('premise/exact_match_labels', None) hypothesis_exact_match = features.get('hypothesis/exact_match_labels', None) if premise_exact_match is not None and 
hypothesis_exact_match is not None: premise_ins.append(tf.expand_dims(tf.cast(premise_exact_match, tf.float32), -1)) hypothesis_ins.append(tf.expand_dims(tf.cast(hypothesis_exact_match, tf.float32), -1)) premise_in = tf.concat(premise_ins, axis=2) hypothesis_in = tf.concat(hypothesis_ins, axis=2) with tf.variable_scope("highway") as scope: premise_in = nn.highway_network(premise_in, self._highway_num_layers) scope.reuse_variables() hypothesis_in = nn.highway_network(hypothesis_in, self._highway_num_layers) with tf.variable_scope("prepro") as scope: pre = premise_in hyp = hypothesis_in for i in range(self._num_self_att_enc_layers): with tf.variable_scope("attention_encoder_%s" % i, reuse=False): pre_att = nn.self_attention(pre, prem_seq_lengths, func='tri_linear', scope="premise_self_attention") p = nn.fuse_gate(pre, pre_att, scope="premise_fuse_gate") hyp_att = nn.self_attention(hyp, hyp_seq_lengths, func='tri_linear', scope="hypothesis_self_attention") h = nn.fuse_gate(hyp, hyp_att, scope="hypothesis_fuse_gate") pre = p hyp = h nn.variable_summaries(p, "p_self_enc_summary_layer_{}".format(i)) nn.variable_summaries(h, "h_self_enc_summary_layer_{}".format(i)) with tf.variable_scope("main") as scope: pre = p hyp = h with tf.variable_scope("interaction"): pre_length = tf.shape(pre)[1] hyp_length = tf.shape(hyp)[1] pre_new = tf.tile(tf.expand_dims(pre, 2), [1, 1, hyp_length, 1]) hyp_new = tf.tile(tf.expand_dims(hyp, 1), [1, pre_length, 1, 1]) bi_att_mx = pre_new * hyp_new # mask = tf.expand_dims(tf.sequence_mask(query_len, tf.shape(query)[1], dtype=tf.float32), # axis=2) * \ # tf.expand_dims(tf.sequence_mask(key_len, tf.shape(key)[1], dtype=tf.float32), axis=1) bi_att_mx = tf.layers.dropout(bi_att_mx, 1-dropout_keep_rate, training=is_training) with tf.variable_scope("dense_net"): dim = bi_att_mx.get_shape().as_list()[-1] act = tf.nn.relu if self._first_scale_down_layer_relu else None fm = tf.contrib.layers.convolution2d(bi_att_mx, int(dim * self._dense_net_first_scale_down_ratio), self._first_scale_down_kernel, padding="SAME", activation_fn=act) fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers, self._dense_net_kernel_size, scope="first_dense_net_block") fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='second_transition_layer') fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers, self._dense_net_kernel_size, scope="second_dense_net_block") fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='third_transition_layer') fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers, self._dense_net_kernel_size, scope="third_dense_net_block") fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='fourth_transition_layer') shape_list = list(fm.get_shape()) #print(shape_list) premise_final = tf.reshape(fm, [-1, shape_list[1] * shape_list[2] * shape_list[3]]) output_dict = self._make_output(premise_final, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError("The input features should contain label with vocabulary namespace " "labels int %s dataset."%mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) #######l2 loss################# if self._l2_loss: if self._sigmoid_growing_l2loss: weights_added = 
tf.add_n([tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables() if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0') or tensor.name.endswith('filter:0')]) full_l2_step = tf.constant(self._weight_l2loss_step_full_reg, dtype=tf.int32, shape=[], name='full_l2reg_step') full_l2_ratio = tf.constant(self._l2_regularization_ratio, dtype=tf.float32, shape=[], name='l2_regularization_ratio') gs_flt = tf.cast(global_step, tf.float32) half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32) # (self.global_step - full_l2_step / 2) # tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32) # l2loss_ratio = tf.sigmoid( tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)) * full_l2_ratio l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) / half_l2_step_flt) * full_l2_ratio tf.summary.scalar('l2loss_ratio', l2loss_ratio) l2loss = weights_added * l2loss_ratio else: l2loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables() if tensor.name.endswith("weights:0") or tensor.name.endswith( 'kernel:0')]) * tf.constant(self._l2_regularization_ratio, dtype='float', shape=[], name='l2_regularization_ratio') tf.summary.scalar('l2loss', l2loss) ######diff loss############################### diffs = [] for i in range(self._num_self_att_enc_layers): for tensor in tf.trainable_variables(): #print(tensor.name) if tensor.name == "diin/prepro/attention_encoder_{}/premise_self_attention/similar_mat/similar_func/arg/kernel:0".format( i): l_lg = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_self_attention/similar_mat/similar_func/arg/kernel:0".format( i): r_lg = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_1/kernel:0".format(i): l_fg_lhs_1 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_1/kernel:0".format( i): r_fg_lhs_1 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_1/kernel:0".format(i): l_fg_rhs_1 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_1/kernel:0".format( i): r_fg_rhs_1 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_2/kernel:0".format(i): l_fg_lhs_2 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_2/kernel:0".format( i): r_fg_lhs_2 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_2/kernel:0".format(i): l_fg_rhs_2 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_2/kernel:0".format( i): r_fg_rhs_2 = tensor if tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_3/kernel:0".format( i): l_fg_lhs_3 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_3/kernel:0".format( i): r_fg_lhs_3 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_3/kernel:0".format( i): l_fg_rhs_3 = tensor elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_3/kernel:0".format( i): r_fg_rhs_3 = tensor diffs += [l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1, l_fg_rhs_1 - r_fg_rhs_1, l_fg_lhs_2 - r_fg_lhs_2, l_fg_rhs_2 - r_fg_rhs_2] diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3] diff_loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in diffs]) * tf.constant( self._diff_penalty_loss_ratio, dtype='float', shape=[], name='diff_penalty_loss_ratio') 
tf.summary.scalar('diff_loss', diff_loss) ############################### output_dict['loss'] = loss + l2loss + diff_loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions']) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
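# The sigmoid-growing L2 schedule above ramps the penalty from roughly zero to the full
# ratio around the midpoint of the full-regularization step. A standalone NumPy sketch of
# the same formula (the example numbers are illustrative, not taken from this configuration):
import numpy as np

def l2loss_ratio(global_step, full_l2_step, full_l2_ratio):
    half = full_l2_step / 2.0
    return 1.0 / (1.0 + np.exp(-((global_step - half) * 8.0) / half)) * full_l2_ratio

# e.g. full_l2_step=100000, full_l2_ratio=9e-5:
# step 0 -> ~3e-8, step 50000 -> ~4.5e-5, step 100000 -> ~9e-5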
def __init__(self, data_reader=None, eval_input_fn=None, num_classes=None, vocab=None, export_dir=None, output_file=None, hparams=HParams()): if data_reader is not None and eval_input_fn is None: self._eval_input_fn = data_reader.make_estimator_input_fn( DataSplit.EVAL, force_repeat=False) vocab = data_reader.get_vocab() else: self._eval_input_fn = eval_input_fn if num_classes is None: num_classes = vocab.get_vocab_size(namespace='labels') task = hparams.get('task', 'classification') task_type = hparams.get('task_type', 'multiclass') labels = list(range(num_classes)) dataset = self._eval_input_fn() iterator = dataset.make_initializable_iterator() dataset.make_initializable_iterator() next_element = iterator.get_next() self.saved_model_loader = loader_impl.SavedModelLoader(export_dir) mode = DataSplit.PREDICT signature_def = get_signature_def_for_mode(self.saved_model_loader, mode) input_map = generate_input_map(signature_def, next_element) output_tensor_names = [ value.name for value in signature_def.outputs.values() ] try: tags = model_fn.EXPORT_TAG_MAP[mode] except AttributeError as e: tags = ['serve'] saver, output_tensors = self.saved_model_loader.load_graph( tf.get_default_graph(), tags, input_map=input_map, return_elements=output_tensor_names) output_map = dict(zip(output_tensor_names, output_tensors)) outputs = { key: output_map[value.name] for (key, value) in signature_def.outputs.items() } # predict_fn = tf.contrib.predictor.from_saved_model(export_dir) #####xlsx wirte###### tsv_file = open(output_file, 'w') # wb = Workbook(write_only=True) # ws = wb.create_sheet('examples') # ws.append(['question', 'answer', 'true_label', 'predict', 'score']) y_true = [] y_pred = [] total_num = 0 # accuracy = 0 # confusion_matrix = [[0 for j in range(num_classes)] for i in range(num_classes)] if hparams.per_process_gpu_memory_fraction is not None and 0 < hparams.per_process_gpu_memory_fraction <= 1: session_config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True) session_config.gpu_options.per_process_gpu_memory_fraction = hparams.per_process_gpu_memory_fraction else: session_config = tf.ConfigProto() with tf.Session(config=session_config) as sess: self.saved_model_loader.restore_variables(sess, saver) self.saved_model_loader.run_init_ops(sess, tags) sess.run(iterator.initializer) while True: try: outputs['inputs'] = next_element output_vals = sess.run(outputs) data_batch = output_vals['inputs'] if 'premise/tokens' in data_batch.keys( ) and 'hypothesis/tokens' in data_batch.keys(): premise_tokens_val, hypothesis_tokens_val, true_label_val = \ data_batch['premise/tokens'], data_batch['hypothesis/tokens'], data_batch['label/labels'] else: true_label_val = data_batch['label/labels'] premise_tokens_val = [ [] for i in range(len(true_label_val)) ] hypothesis_tokens_val = [ [] for i in range(len(true_label_val)) ] # probs = output_vals['output_score'] probs = output_vals['output'] num_batch = probs.shape[0] total_num += num_batch print("processing %s/%s" % (num_batch, total_num)) ####################### # print(probs) if task_type == 'multiclass': predictions_val = np.argmax(probs, axis=1) elif task_type == 'multilabel': threshold = hparams.get('threshold', 0.5) predictions_val = (probs > threshold).astype( dtype=np.int32) elif task_type == 'topk': predictions_val = (probs > 0).astype(dtype=np.int32) else: raise ConfigureError( "Task type %s is not support for task %s. 
" "Only multiclass and multilabel is support for task %s" % (task_type, task, task)) # predictions = (probs > 0.5).astype(np.int32) # print(predictions) y_true.append(true_label_val) y_pred.append(predictions_val) # print(predictions) # for i in range(probs.shape[0]): # predictions = (probs > 0.5).astype(np.int32) # predict = predictions[i] # label = true_label_val[i] # if predict == label: # accuracy += 1 # confusion_matrix[label][predict] += 1 ################ for i in range(num_batch): premise_str = vocab.convert_indexes_to_tokens( premise_tokens_val[i], 'tokens') premise_str = " ".join(premise_str) hypothesis_str = vocab.convert_indexes_to_tokens( hypothesis_tokens_val[i], 'tokens') hypothesis_str = " ".join(hypothesis_str) if task_type == 'multilabel' or task_type == 'topk': predictions = [[] for i in range(num_batch)] for (row, col) in np.argwhere(predictions_val == 1): predictions[row].append(col) true_labels = [[] for i in range(num_batch)] for row, col in np.argwhere(true_label_val == 1): true_labels[row].append(col) else: predictions = predictions_val true_labels = true_label_val true_label = true_labels[i] predict = predictions[i] prob = probs[i] if task_type == 'multiclass': tsv_str = "\t".join([ premise_str, hypothesis_str, vocab.get_index_token(true_label, namespace='labels'), vocab.get_index_token(predict, namespace='labels'), str(prob) ]) elif task_type == 'multilabel' or task_type == 'topk': tsv_str = "\t".join([ premise_str, hypothesis_str, " ".join([ vocab.get_index_token(l, namespace='labels') for l in true_label ]), " ".join([ vocab.get_index_token(p, namespace='labels') for p in predict ]), str(prob) ]) else: raise ConfigureError( "Task type %s is not support for task %s. " "Only multiclass and multilabel is support for task %s" % (task_type, task, task)) # tsv_str = "\t".join([premise_str, hypothesis_str, str(true_label), str(predict), str(prob), # json.dumps(output_vals['query_embedding'][i].tolist()), json.dumps(output_vals['title_embedding'][i].tolist()), # json.dumps(output_vals['query_lstm_1'][i].tolist()), json.dumps(output_vals['title_lstm_1'][i].tolist()), # json.dumps(output_vals['query_attention'][i].tolist()), json.dumps(output_vals['title_attention'][i].tolist()), # json.dumps(output_vals['query_lstm_2'][i].tolist()), json.dumps(output_vals['title_lstm_2'][i].tolist()), # json.dumps(output_vals['fc1'][i].tolist()), json.dumps(output_vals['fc2'][i].tolist()) # ]) tsv_file.write(tsv_str + "\n") # print("process %s/%s correct/total instances with accuracy %s." 
% (accuracy, total_num, accuracy/float(total_num))) except tf.errors.OutOfRangeError as e: logger.info("processed all the evalutation data") break # logger.warning(e) y_true = np.concatenate(y_true, axis=0) y_pred = np.concatenate(y_pred, axis=0) avg_param = 'micro' if num_classes == 2: avg_param = 'binary' accuracy = metrics.accuracy_score(y_true, y_pred) # accuracy/total_num precise, recall, f1score, support = metrics.precision_recall_fscore_support( y_true, y_pred, labels=labels, average=avg_param) if task_type == 'multiclass': confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=labels) print("metrics:") confmx_str = "label \ predict " for i in range(num_classes): confmx_str += "| %s | " % vocab.get_index_token( i, namespace='labels') confmx_str += "\n" for i in range(num_classes): confmx_str += "| %s | " % vocab.get_index_token( i, namespace='labels') for j in range(num_classes): confmx_str += "| %s | " % confusion_matrix[i][j] confmx_str += "\n" print(confmx_str) elif task_type == 'multilabel' or task_type == 'topk': confusion_matrix = metrics.multilabel_confusion_matrix( y_true, y_pred) print("metrics:") for k in range(num_classes): print("confusion matrix for label %s" % vocab.get_index_token(k, namespace='labels')) confmx_str = "label \ predict " for i in range(2): confmx_str += "| %s | " % i confmx_str += "\n" for i in range(2): confmx_str += "| %s | " % i for j in range(2): confmx_str += "| %s | " % confusion_matrix[k][i][j] confmx_str += "\n" print(confmx_str) else: raise ConfigureError( "Task type %s is not support for task %s. " "Only multiclass and multilabel is support for task %s" % (task_type, task, task)) # confusion_matrix[1][1]/(confusion_matrix[0][1]+confusion_matrix[1][1]) # recall = confusion_matrix[1][1]/(confusion_matrix[1][0]+confusion_matrix[1][1]) # f1score = (precise+recall)/2 print("micro total accuracy precise recall f1-score") print( "accuracy: %.2f, precise: %.2f, recall: %.2f, f1-score: %.2f" % (accuracy, precise, recall, f1score)) precisions, recalls, fbeta_scores, supports = metrics.precision_recall_fscore_support( y_true, y_pred, labels=labels) print("accuracy precise recall f1-score for each class") print( '======================================================================================' ) for lab_idx, (precision, recall, fbeta_score, support) in enumerate( zip(precisions, recalls, fbeta_scores, supports)): print( "label:%s\tprecision:%.2f\trecall:%.2f\tf1-score:%.2f\tsupport:%.2f" % (vocab.get_index_token(lab_idx, namespace='labels'), precision, recall, fbeta_score, support)) # legend = ["label \ predict "] # for i in range(num_classes): # legend.append(str(i)) # ws.append(legend) # for i in range(num_classes): # row = [str(i)] # for j in range(num_classes): # row.append(str(confusion_matrix[i][j])) # ws.append(row) # ws.append([]) # ws.append([]) # ws.append(['accuracy', 'precise', 'recall', 'f1-score']) # ws.append([str(accuracy), str(precise), str(recall), str(f1score)]) # if output_file: # if not output_file.endswith(".xlsx"): # output_file += '.xlsx' # wb.save(output_file) tsv_file.close()
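# Standalone sketch of the metric computation this evaluator performs once all batches
# are collected: 'binary' averaging for two-class problems, 'micro' otherwise
# (the toy labels below are for illustration only).
import numpy as np
from sklearn import metrics

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])
num_classes = 2
avg_param = 'binary' if num_classes == 2 else 'micro'
accuracy = metrics.accuracy_score(y_true, y_pred)
precision, recall, f1, _ = metrics.precision_recall_fscore_support(
    y_true, y_pred, labels=list(range(num_classes)), average=avg_param)
print("accuracy: %.2f, precision: %.2f, recall: %.2f, f1-score: %.2f"
      % (accuracy, precision, recall, f1))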
def forward(self, features, labels, mode, params): features_embedding = self._embedding_mapping.forward(features, labels, mode, params) with tf.variable_scope(self._model_name): is_training = (mode == tf.estimator.ModeKeys.TRAIN) premise_tokens_ids = features.get('premise/tokens', None) if premise_tokens_ids is None: premise_tokens_ids = features.get('premise/elmo_characters', None) hypothesis_tokens_ids = features.get('hypothesis/tokens', None) if hypothesis_tokens_ids is None: hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None) if premise_tokens_ids is None: raise ConfigureError("The input features should contain premise with vocabulary namespace tokens " "or elmo_characters.") if hypothesis_tokens_ids is None: raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens " "or elmo_characters.") prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids) hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids) if features.get('premise/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths) prem_seq_lengths -= 2 if features.get('hypothesis/elmo_characters', None) is not None or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert): hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths) hyp_seq_lengths -= 2 prem_mask = tf.expand_dims(prem_mask, -1) hyp_mask = tf.expand_dims(hyp_mask, -1) premise_tokens = features_embedding.get('premise/tokens', None) if premise_tokens is None: premise_tokens = features_embedding.get('premise/elmo_characters', None) hypothesis_tokens = features_embedding.get('hypothesis/tokens', None) if hypothesis_tokens is None: hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None) lm_xor = keras.layers.Lambda(self._xor_match)([premise_tokens_ids, hypothesis_tokens_ids]) lm_conv = keras.layers.Conv1D( self._lm_filters, premise_tokens_ids.shape[1].value, padding='valid', activation=self._activation_func )(lm_xor) lm_conv = keras.layers.Dropout(self._dropout_rate)( lm_conv, training=is_training) lm_feat = keras.layers.Reshape((lm_conv.shape[2].value, ))(lm_conv) for hidden_size in self._lm_hidden_sizes: lm_feat = keras.layers.Dense( hidden_size, activation=self._activation_func )(lm_feat) lm_drop = keras.layers.Dropout(self._dropout_rate)( lm_feat, training=is_training) lm_score = keras.layers.Dense(1)(lm_drop) dm_q_conv = keras.layers.Conv1D( self._dm_filters, self._dm_kernel_size, padding='same', activation=self._activation_func )(premise_tokens) dm_q_conv = keras.layers.Dropout(self._dropout_rate)( dm_q_conv, training=is_training) dm_q_mp = keras.layers.MaxPooling1D( pool_size=premise_tokens_ids.shape[1].value)(dm_q_conv) dm_q_rep = keras.layers.Reshape((dm_q_mp.shape[2].value, ))(dm_q_mp) dm_q_rep = keras.layers.Dense(self._dm_q_hidden_size)( dm_q_rep) dm_q_rep = keras.layers.Lambda(lambda x: tf.expand_dims(x, 1))( dm_q_rep) dm_d_conv1 = keras.layers.Conv1D( self._dm_filters, self._dm_kernel_size, padding='same', activation=self._activation_func )(hypothesis_tokens) dm_d_conv1 = keras.layers.Dropout(self._dropout_rate)( dm_d_conv1, training=is_training) dm_d_mp = keras.layers.MaxPooling1D( pool_size=self._dm_d_mpool)(dm_d_conv1) dm_d_conv2 = keras.layers.Conv1D( self._dm_filters, 1, padding='same', activation=self._activation_func )(dm_d_mp) dm_d_conv2 = keras.layers.Dropout(self._dropout_rate)( dm_d_conv2, training=is_training) h_dot = dm_q_rep * dm_d_conv2 
#keras.layers.Lambda(self._hadamard_dot)([dm_q_rep, dm_d_conv2]) dm_feat = keras.layers.Reshape((h_dot.shape[1].value*h_dot.shape[2].value, ))(h_dot) for hidden_size in self._dm_hidden_sizes: dm_feat = keras.layers.Dense(hidden_size)(dm_feat) dm_feat_drop = keras.layers.Dropout(self._dropout_rate)( dm_feat, training=is_training) dm_score = keras.layers.Dense(1)(dm_feat_drop) add = keras.layers.Add()([lm_score, dm_score]) # Get prediction output_dict = self._make_output(add, params) if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: if 'label/labels' not in features: raise ConfigureError("The input features should contain label with vocabulary namespace " "labels int %s dataset."%mode) labels_embedding = features_embedding['label/labels'] labels = features['label/labels'] loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params) output_dict['loss'] = loss metrics = dict() metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions']) metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions']) metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions']) # metrics['map'] = tf.metrics.average_precision_at_k(labels=tf.cast(labels, tf.int64), predictions=output_dict['logits'], # k=2) # metrics['precision_1'] = tf.metrics.precision_at_k(labels=tf.cast(labels, tf.int64), predictions=output_dict['logits'], # k=1, class_id=1) #tf.metrics.auc(labels=labels, predictions=predictions) output_dict['metrics'] = metrics # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi, # premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits] return output_dict
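# Hedged sketch of the local interaction used above: self._xor_match presumably builds
# an exact-match indicator matrix between the two id sequences (the helper itself is not
# shown in this file). A NumPy version of that idea:
import numpy as np

def exact_match_matrix(premise_ids, hypothesis_ids):
    """[batch, p_len], [batch, h_len] -> [batch, p_len, h_len] with 1.0 where ids match."""
    return (premise_ids[:, :, None] == hypothesis_ids[:, None, :]).astype(np.float32)

_p = np.array([[3, 5, 7, 0]])
_h = np.array([[5, 7, 7]])
print(exact_match_matrix(_p, _h)[0])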