def init_from_params(cls, params, vocab):
    config_file = params.pop('config_file', None)
    if config_file is None:
        raise ConfigureError("Please provide an ELMo config file for the ELMo embedding.")
    encoder_name = params.pop("encoder_name", "elmo")
    vocab_namespace = params.pop('namespace', 'elmo_characters')
    dropout_rate = params.pop_float('dropout_rate', 0.0)
    ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None)
    weight_file = params.pop('weight_file', None)
    if ckpt_to_initialize_from is None and weight_file is None:
        logger.warning("The ELMo embedding is initialized randomly.")
    params.assert_empty(cls.__name__)
    return cls(config_file=config_file,
               ckpt_to_initialize_from=ckpt_to_initialize_from,
               dropout_rate=dropout_rate,
               encoder_name=encoder_name,
               vocab_namespace=vocab_namespace,
               weight_file=weight_file)
def init_from_params(cls, params, vocab):
    config_file = params.pop('config_file', None)
    if config_file is None:
        raise ConfigureError("Please provide a BERT config file for the BERT embedding.")
    old_vocab_file = params.pop('vocab_file', None)
    if old_vocab_file is None:
        logger.warning("The vocab file is not provided. The embedding vocabulary is assumed "
                       "to be the same as the data vocabulary.")
    ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None)
    if ckpt_to_initialize_from is None:
        logger.warning("The BERT embedding is initialized randomly.")
    num_oov_buckets = params.pop_int("num_oov_buckets", 0)
    use_one_hot_embeddings = params.pop_bool("use_one_hot_embeddings", False)
    encoder_name = params.pop("encoder_name", "bert")
    vocab_namespace = params.pop("namespace", 'tokens')
    mask_namespace = params.pop("mask_namespace", None)
    new_vocab_file = vocab.get_vocab_path(vocab_namespace)
    new_vocab_size = vocab.get_vocab_size(vocab_namespace)
    projection_dim = params.pop_int("projection_dim", None)
    dropout_rate = params.pop_float("dropout_rate", 0.0)
    remove_bos_eos = params.pop_bool("remove_bos_eos", True)
    params.assert_empty(cls.__name__)
    return cls(config_file=config_file,
               ckpt_to_initialize_from=ckpt_to_initialize_from,
               new_vocab_file=new_vocab_file,
               new_vocab_size=new_vocab_size,
               num_oov_buckets=num_oov_buckets,
               old_vocab_file=old_vocab_file,
               vocab_namespace=vocab_namespace,
               remove_bos_eos=remove_bos_eos,
               mask_namespace=mask_namespace,
               projection_dim=projection_dim,
               dropout_rate=dropout_rate,
               use_one_hot_embeddings=use_one_hot_embeddings,
               encoder_name=encoder_name)
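# A minimal sketch of the configuration this method consumes. The keys below
# are the ones popped above; the file paths and checkpoint name are
# hypothetical placeholders, not files shipped with the toolkit.
#
#     params = {
#         'config_file': 'bert/bert_config.json',
#         'vocab_file': 'bert/vocab.txt',
#         'ckpt_to_initialize_from': 'bert/bert_model.ckpt',
#         'num_oov_buckets': 0,
#         'use_one_hot_embeddings': False,
#         'namespace': 'tokens',
#         'projection_dim': None,
#         'dropout_rate': 0.1,
#         'remove_bos_eos': True,
#     }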
def generate_input_map(self, signature_def, features, labels=None):
    features_mapping = {"input_query": "premise/tokens",
                        "input_title": "hypothesis/tokens"}
    inputs = signature_def.inputs
    input_map = {}
    for (key, tensor_info) in inputs.items():
        input_name = tensor_info.name
        if ':' in input_name:
            input_name = input_name[:input_name.find(':')]
        control_dependency_name = '^' + input_name
        if features_mapping is not None and key in features_mapping:
            feature_key = features_mapping[key]
        else:
            feature_key = key
        if feature_key in features:
            check_same_dtype_and_shape(features[feature_key], tensor_info, key)
            input_map[input_name] = input_map[control_dependency_name] = features[feature_key]
        elif labels is not None and feature_key in labels:
            check_same_dtype_and_shape(labels[feature_key], tensor_info, key)
            input_map[input_name] = input_map[control_dependency_name] = labels[feature_key]
        else:
            logger.warning('Key "%s" not found in features or labels passed in to the model '
                           'function. All required keys: %s' % (feature_key, inputs.keys()))
    return input_map
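# A stand-alone sketch of the name handling above: a SavedModel tensor name
# such as "input_ids:0" is stripped to its op name "input_ids", and the same
# feature tensor is registered under both the op name and the
# control-dependency key "^input_ids". _strip_tensor_name is a hypothetical
# helper introduced only for illustration.
def _strip_tensor_name(tensor_name):
    # "op_name:output_index" -> "op_name"
    return tensor_name.split(':', 1)[0]

assert _strip_tensor_name("input_ids:0") == "input_ids"
assert '^' + _strip_tensor_name("input_ids:0") == "^input_ids"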
def get_signature_def_for_mode(saved_model_loader, mode):
    meta_graph_def = get_meta_graph_def_for_mode(saved_model_loader, mode)
    sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                   if mode == model_fn.ModeKeys.PREDICT else mode)
    if sig_def_key not in meta_graph_def.signature_def:
        logger.warning('Metagraph for mode %s was found, but SignatureDef with '
                       'key "%s" is missing.' % (mode, sig_def_key))
        return None
    return meta_graph_def.signature_def[sig_def_key]
def get_warm_start_setting(self):
    warm_start_settings = None
    for encoder in self._encoders.values():
        warm_start_settings_namespace = encoder.get_warm_start_setting()
        if isinstance(warm_start_settings_namespace, tf.estimator.WarmStartSettings):
            if warm_start_settings is None:
                warm_start_settings = warm_start_settings_namespace
            else:
                logger.warning("There is more than one pretrained embedding, "
                               "which is not supported by this toolkit yet.")
    return warm_start_settings
def save_to_files(self, directory):
    os.makedirs(directory, exist_ok=True)
    if os.listdir(directory):
        logger.warning("Vocabulary directory %s is not empty", directory)
    save_to_txt(self._non_padded_namespaces, os.path.join(directory, NAMESPACE_PADDING_FILE))
    for namespace in self._token_to_index:
        vocab_namespace = [self._index_to_token[namespace][i]
                           for i in range(len(self._index_to_token[namespace]))]
        vocab_namespace_file = os.path.join(directory, VOCAB_FILE % namespace)
        self._namespace_to_path[namespace] = vocab_namespace_file
        save_to_txt(vocab_namespace, vocab_namespace_file)
def _read_pretrained_embeddings_text(pretrained_file, embedding_dim, vocab, vocab_namespace):
    vocab_tokens = vocab.get_vocab_tokens(vocab_namespace)
    vocab_size = vocab.get_vocab_size(vocab_namespace)
    embeddings = {}
    logger.info("Reading pretrained embeddings from: %s" % pretrained_file)
    with open(pretrained_file, 'r', encoding='utf-8') as embeddings_file:
        for line in tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in vocab_tokens:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    logger.warning("Found line with wrong number of dimensions "
                                   "(expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    if not embeddings:
        raise ConfigureError("No pretrained embeddings were found for the vocabulary. "
                             "Check that embedding_dim and the vocabulary match the "
                             "pretrained embedding file.")
    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Tokens without a pretrained vector are drawn from a normal distribution
    # fitted to the mean and std of the pretrained vectors.
    embedding_matrix = np.random.normal(embeddings_mean, embeddings_std,
                                        (vocab_size, embedding_dim))
    embedding_matrix = embedding_matrix.astype(np.float32)
    num_tokens_found = 0
    index_to_tokens = vocab.get_vocab_index_to_token(vocab_namespace)
    for i in range(vocab_size):
        token = index_to_tokens[i]
        if token in embeddings:
            embedding_matrix[i] = embeddings[token]
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. "
                         "Initialising randomly.", token)
    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)
    return embedding_matrix
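# A stand-alone numpy sketch of the initialisation strategy above: tokens
# covered by the pretrained file keep their vectors, while the rest are drawn
# from a normal distribution fitted to the pretrained vectors' mean and std.
# The toy vocabulary and 2-dimensional vectors below are illustrative only.
import numpy as np

pretrained = {'cat': np.array([0.1, 0.2], dtype=np.float32),
              'dog': np.array([0.3, 0.4], dtype=np.float32)}
index_to_token = {0: '<pad>', 1: 'cat', 2: 'dog', 3: 'unicorn'}
all_vecs = np.asarray(list(pretrained.values()))
matrix = np.random.normal(all_vecs.mean(), all_vecs.std(),
                          (len(index_to_token), 2)).astype(np.float32)
for i, token in index_to_token.items():
    if token in pretrained:
        matrix[i] = pretrained[token]  # overwrite random rows with known vectors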
def forward(self, features, labels, mode, params):
    outputs = dict()
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    for (feature_key, feature) in features.items():
        if '/' not in feature_key:
            continue
        feature_key_fields = feature_key.split("/")
        feature_namespace = feature_key_fields[1].strip()
        field_name = feature_key_fields[0].strip()
        if feature_namespace == self._vocab_namespace:
            with tf.variable_scope("embedding/" + self._vocab_namespace, reuse=tf.AUTO_REUSE):
                input_ids = feature
                input_mask = None
                if self._mask_namespace:
                    mask_feature_key = field_name + "/" + self._mask_namespace
                    if mask_feature_key in features:
                        input_mask = features[mask_feature_key]
                    else:
                        logger.warning("The mask namespace %s with field name %s is not in "
                                       "features (%s)" % (self._mask_namespace, field_name,
                                                          mask_feature_key))
                if input_mask is None:
                    input_length, input_mask = nn.length(input_ids)
                else:
                    input_length, _ = nn.length(input_ids)
                model = BertModel(config=self._bert_config,
                                  is_training=is_training,
                                  input_ids=input_ids,
                                  input_mask=input_mask,
                                  use_one_hot_embeddings=self._use_one_hot_embeddings)
                embedding_output = model.get_sequence_output()
                if self._remove_bos_eos:
                    embedding_output = nn.remove_bos_eos(embedding_output, input_length)
                dropout_rate = params.get('dropout_rate')
                if dropout_rate is None:
                    dropout_rate = self._dropout_rate
                emb_drop = tf.layers.dropout(embedding_output, dropout_rate, training=is_training)
                if self._projection_dim:
                    emb_drop = tf.layers.dense(emb_drop, self._projection_dim, use_bias=False,
                                               kernel_initializer=initializers.xavier_initializer())
                outputs[feature_key] = emb_drop
    return outputs
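# nn.length is not shown in this section; a plausible numpy equivalent,
# assuming id 0 is the padding id, derives both the mask and the per-sequence
# length from the non-zero positions of the padded id matrix:
import numpy as np

input_ids = np.array([[5, 9, 2, 0, 0],
                      [7, 3, 0, 0, 0]])
input_mask = (input_ids != 0).astype(np.float32)        # [batch, time]
input_length = input_mask.sum(axis=1).astype(np.int32)  # -> [3, 2]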
def extract_available_modes(saved_model_loader):
    """Return the list of modes found in the SavedModel."""
    available_modes = []
    logger.info('Checking available modes.')
    for mode in [tf.estimator.ModeKeys.TRAIN,
                 tf.estimator.ModeKeys.EVAL,
                 tf.estimator.ModeKeys.PREDICT]:
        try:
            get_meta_graph_def_for_mode(saved_model_loader, mode)
        except RuntimeError:
            logger.warning('%s mode not found in SavedModel.' % mode)
            continue
        if get_signature_def_for_mode(saved_model_loader, mode) is not None:
            available_modes.append(mode)
    logger.info('Available modes: %s' % available_modes)
    return available_modes
def load_from_files(self, directory):
    if not os.path.exists(directory):
        logger.warning("Vocabulary directory %s does not exist.", directory)
        return False
    namespaces_file = os.path.join(directory, NAMESPACE_PADDING_FILE)
    if not os.path.exists(namespaces_file):
        logger.warning("Vocabulary namespaces file %s does not exist", namespaces_file)
        return False
    vocab_filenames = [filename for filename in os.listdir(directory)
                       if filename.startswith(VOCAB_FILE[:6])
                       and filename.endswith(VOCAB_FILE[-4:])]
    if len(vocab_filenames) == 0:
        logger.warning("No vocabulary files found in directory %s.", directory)
        return False
    self._non_padded_namespaces = load_from_txt(namespaces_file)
    for vocab_filename in vocab_filenames:
        namespace = vocab_filename[6:-4]
        vocab_namespace_file = os.path.join(directory, vocab_filename)
        self._namespace_to_path[namespace] = vocab_namespace_file
        vocab_namespace = load_from_txt(vocab_namespace_file)
        self._index_to_token[namespace] = dict(enumerate(vocab_namespace))
        self._token_to_index[namespace] = {token: index
                                           for index, token in enumerate(vocab_namespace)}
    if self.valid():
        return True
    else:
        raise ConfigureError("Vocabulary validation failed after loading from %s." % directory)
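# save_to_files and load_from_files are inverses: one token is written per
# line per namespace, and both lookup directions are rebuilt from line order
# on load. A stand-alone sketch of that rebuild step with a toy token list:
tokens = ['<pad>', '<unk>', 'premise', 'hypothesis']
index_to_token = dict(enumerate(tokens))
token_to_index = {token: index for index, token in enumerate(tokens)}
assert token_to_index[index_to_token[2]] == 2  # roundtrip is lossless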
def forward(self, features, labels, mode, params):
    outputs = dict()
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    for (feature_key, feature) in features.items():
        if '/' not in feature_key:
            continue
        feature_namespace = feature_key.split("/")[1].strip()
        if feature_namespace == self._vocab_namespace:
            with tf.variable_scope("embedding/" + self._vocab_namespace, reuse=tf.AUTO_REUSE):
                if self._weight is None:
                    if not self._trainable:
                        logger.warning("No pretrained embedding is assigned. "
                                       "The embedding should be trainable.")
                    logger.debug("loading random embedding.")
                    if self._padding_zero:
                        # Keep row 0 (the padding id) fixed at zero; only the
                        # remaining rows are trainable.
                        word_embeddings = tf.get_variable(
                            "embedding_weight",
                            shape=(self._num_embeddings - 1, self._embedding_dim),
                            initializer=initializers.xavier_initializer(),
                            trainable=self._trainable)
                        pad_embeddings = tf.constant(np.zeros([1, self._embedding_dim]),
                                                     dtype=tf.float32)
                        self._embeddings = tf.concat([pad_embeddings, word_embeddings], axis=0)
                    else:
                        self._embeddings = tf.get_variable(
                            "embedding_weight",
                            shape=(self._num_embeddings, self._embedding_dim),
                            initializer=initializers.xavier_initializer(),
                            trainable=self._trainable)
                else:
                    if self._weight.shape != (self._num_embeddings, self._embedding_dim):
                        raise ConfigureError("The embedding expects shape (%s, %s), "
                                             "but the pretrained embedding has shape %s."
                                             % (self._num_embeddings, self._embedding_dim,
                                                self._weight.shape))
                    logger.debug("loading pretrained embedding with trainable %s."
                                 % self._trainable)
                    if self._padding_zero:
                        word_embeddings = tf.get_variable("embedding_weight",
                                                          initializer=self._weight[1:, :],
                                                          trainable=self._trainable)
                        pad_embeddings = tf.constant(np.zeros([1, self._embedding_dim]),
                                                     dtype=tf.float32)
                        self._embeddings = tf.concat([pad_embeddings, word_embeddings], axis=0)
                    else:
                        self._embeddings = tf.get_variable("embedding_weight",
                                                           initializer=self._weight,
                                                           trainable=self._trainable)
                emb = tf.nn.embedding_lookup(self._embeddings, feature)
                dropout_rate = params.get('dropout_rate')
                if dropout_rate is None:
                    dropout_rate = self._dropout_rate
                emb_drop = tf.layers.dropout(emb, dropout_rate, training=is_training)
                if self._projection_dim:
                    emb_drop = tf.layers.dense(emb_drop, self._projection_dim, use_bias=False,
                                               kernel_initializer=initializers.xavier_initializer())
                outputs[feature_key] = emb_drop
    return outputs
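# A numpy sketch of the padding_zero trick above: row 0 of the effective
# embedding table is a constant zero vector, so the padding id always embeds
# to zeros while the remaining rows stay trainable. Shapes are illustrative.
import numpy as np

embedding_dim, num_embeddings = 4, 10
word_embeddings = np.random.randn(num_embeddings - 1, embedding_dim)
pad_embeddings = np.zeros((1, embedding_dim))
embeddings = np.concatenate([pad_embeddings, word_embeddings], axis=0)
assert not embeddings[0].any()  # id 0 (padding) embeds to zeros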
def forward(self, features, labels, mode, params):
    if self._sim_func != 'tensor' and self._num_tensor_dim != 1:
        self._num_tensor_dim = 1
        logger.warning("The similarity function is not the tensor layer, "
                       "so num_tensor_dim has no effect and is reset to 1.")
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary "
                                 "namespace tokens or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary "
                                 "namespace tokens or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        # ELMo and BERT add BOS/EOS tokens, which are stripped from the masks.
        if features.get('premise/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)
        prem_hyp_mask = tf.matmul(prem_mask, hyp_mask, transpose_b=True)  # [N, L1, L2]
        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)
        premise_outs, c1 = nn.bi_lstm(premise_tokens, self._hidden_dim,
                                      seq_len=prem_seq_lengths, name='premise')
        hypothesis_outs, c2 = nn.bi_lstm(hypothesis_tokens, self._hidden_dim,
                                         seq_len=hyp_seq_lengths, name='hypothesis')
        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)
        max_premise_length = premise_tokens.shape[1].value
        max_hypothesis_length = hypothesis_tokens.shape[1].value
        if self._sim_func == 'tensor':
            M = tf.Variable(tf.random_normal([self._num_tensor_dim,
                                              2 * self._hidden_dim,
                                              2 * self._hidden_dim], stddev=0.1))
            W = tf.Variable(tf.random_normal([4 * self._hidden_dim, 1], stddev=0.1))
            bias = tf.Variable(tf.zeros([1]), name="tensor_bias")
            premise_ex = tf.tile(tf.expand_dims(premise_bi, axis=2),
                                 [1, 1, max_hypothesis_length, 1])
            hypothesis_ex = tf.tile(tf.expand_dims(hypothesis_bi, axis=1),
                                    [1, max_premise_length, 1, 1])
            tensor = []
            tmp2 = tf.einsum("abcd,df->abcf",
                             tf.concat([premise_ex, hypothesis_ex], axis=3), W)  # [N, L1, L2, 1]
            tmp2 = tf.squeeze(tmp2, axis=3)
            for i in range(self._num_tensor_dim):
                tmp1 = tf.einsum("abc,cd->abd", premise_bi, M[i])  # [N, L1, 2d]
                tmp1 = tf.matmul(tmp1, hypothesis_bi, transpose_b=True)  # [N, L1, L2]
                tensor.append(tf.nn.relu(tmp1 + tmp2 + bias))
            tensor = tf.stack(tensor, axis=0)  # [num_tensor_dim, N, L1, L2]
        elif self._sim_func == 'cosine':
            tensor = tf.matmul(tf.nn.l2_normalize(premise_bi, axis=-1),
                               tf.nn.l2_normalize(hypothesis_bi, axis=-1),
                               transpose_b=True)  # [N, L1, L2]
        elif self._sim_func == 'bilinear':
            M = tf.Variable(tf.random_normal([2 * self._hidden_dim, 2 * self._hidden_dim],
                                             stddev=0.1))
            b = tf.Variable(tf.random_normal([max_premise_length, max_hypothesis_length],
                                             stddev=0.1))
            bilinear = tf.einsum("abc,cd->abd", premise_bi, M)  # [N, L1, 2d]
            tensor = tf.matmul(bilinear, hypothesis_bi, transpose_b=True) + b  # [N, L1, L2]
        else:
            raise ConfigureError("The similarity function %s is not supported. "
                                 "The mvlstm only supports the similarity functions "
                                 "[cosine, bilinear, tensor]." % self._sim_func)
        tensor *= prem_hyp_mask

        # k-Max pooling: keep the k largest interaction values per example.
        matrix_in = tf.reshape(tensor, [-1, max_premise_length * max_hypothesis_length])
        values, indices = tf.nn.top_k(matrix_in, k=self._num_k, sorted=False)
        kmax = tf.reshape(values, [-1, self._num_tensor_dim * self._num_k])

        # MLP layer
        h_mlp_1 = tf.contrib.layers.fully_connected(kmax, self._num_tensor_dim * self._num_k,
                                                    scope='fc1')
        h_mlp_1_drop = tf.layers.dropout(h_mlp_1, self._dropout_rate, training=is_training)
        h_mlp_2 = tf.contrib.layers.fully_connected(h_mlp_1_drop,
                                                    self._num_tensor_dim * self._num_k // 2,
                                                    scope='fc2')
        # Dropout applied to classifier
        h_drop = tf.layers.dropout(h_mlp_2, self._dropout_rate, training=is_training)
        # Get prediction
        output_dict = self._make_output(h_drop, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary "
                                     "namespace labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']
            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'],
                                   params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels,
                                                      predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels,
                                                        predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels,
                                                  predictions=output_dict['predictions'])
            output_dict['metrics'] = metrics
        return output_dict
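# A stand-alone numpy sketch of the k-max pooling step above: the [L1, L2]
# similarity matrix is flattened per example and its k largest interaction
# values are kept as a fixed-size feature vector for the MLP. The toy matrix
# below is illustrative only.
import numpy as np

num_k = 3
sim = np.array([[0.9, 0.1, 0.4],
                [0.2, 0.8, 0.5]])  # one example, L1=2, L2=3
flat = sim.reshape(-1)             # [L1 * L2]
kmax = np.sort(flat)[-num_k:]      # top-k values, ascending order
assert np.allclose(kmax, [0.5, 0.8, 0.9])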