def _set_inverse_vocabulary(self, vocab):
  """Sets vocabulary data for this layer when inverse is True."""
  table_utils.validate_vocabulary_is_unique(vocab)

  should_have_mask = self.mask_token is not None
  has_mask = vocab[0] == self.mask_token

  insert_special_tokens = should_have_mask and not has_mask

  special_tokens = [] if self.mask_token is None else [self.mask_token]
  num_special_tokens = len(special_tokens)
  tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
  if self.mask_token in tokens:
    raise ValueError(
        "Reserved mask token %s was found in the passed vocabulary at "
        "index %s. Please either remove the reserved token from the "
        "vocabulary or change the mask token for this layer." %
        (self.mask_token, tokens.index(self.mask_token)))

  if insert_special_tokens:
    total_vocab_size = len(vocab) + num_special_tokens
  else:
    total_vocab_size = len(vocab)
  if self.max_tokens is not None and total_vocab_size > self.max_tokens:
    raise ValueError(
        "Attempted to set a vocabulary larger than the maximum vocab size. "
        "Passed vocab size is %s, max vocab size is %s." %
        (total_vocab_size, self.max_tokens))

  start_index = num_special_tokens if insert_special_tokens else 0
  values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)

  self._table_handler.clear()
  self._table_handler.insert(values, vocab)

  if insert_special_tokens and num_special_tokens > 0:
    special_token_values = np.arange(num_special_tokens, dtype=np.int64)
    self._table_handler.insert(special_token_values, special_tokens)
  return total_vocab_size
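# A minimal, self-contained sketch (hypothetical; not part of this layer) of
# the index layout `_set_inverse_vocabulary` builds above. When a mask token
# is configured but missing from the passed vocabulary, regular tokens are
# shifted up by the number of special tokens and the mask token takes index
# 0. A plain dict stands in for `self._table_handler` here.
def _demo_inverse_vocab_layout():
  import numpy as np

  mask_token = ""  # Hypothetical mask token.
  vocab = ["earth", "wind", "fire"]  # Passed vocab with no leading mask.
  special_tokens = [mask_token]
  num_special_tokens = len(special_tokens)

  table = {}
  # Regular tokens start after the special tokens, mirroring the np.arange
  # offset in the method above.
  values = np.arange(
      num_special_tokens, len(vocab) + num_special_tokens, dtype=np.int64)
  table.update(zip(values.tolist(), vocab))
  # Special tokens occupy the leading indices.
  table.update(zip(range(num_special_tokens), special_tokens))
  # Maps 0 -> "", 1 -> "earth", 2 -> "wind", 3 -> "fire".
  return table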
def _set_forward_vocabulary(self, vocab, idf_weights=None):
  """Sets vocabulary data for this layer when inverse is False."""
  table_utils.validate_vocabulary_is_unique(vocab)

  should_have_mask = self.mask_token is not None
  has_mask = vocab[0] == self.mask_token
  oov_start = 1 if should_have_mask else 0

  should_have_oov = (self.num_oov_indices > 0) and not self.invert
  if should_have_oov:
    oov_end = oov_start + self.num_oov_indices
    expected_oov = [self.oov_token] * self.num_oov_indices
    has_oov = vocab[oov_start:oov_end] == expected_oov
    # If we get a numpy array, then has_oov may end up being a numpy array
    # instead of a bool. Fix this by collapsing the variable if it's not bool.
    if not isinstance(has_oov, bool):
      has_oov = any(has_oov)
  else:
    has_oov = False

  if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
    raise ValueError(
        "Invalid vocabulary format. The layer was created with "
        "`mask_token=%s` and `oov_token=%s`. These tokens should be "
        "included in the provided vocabulary. The passed vocabulary has "
        "the correct mask token `%s` at index 0, but does not have the "
        "OOV token `%s` in indices [%s:%s]. Instead, we found `%s`. Was "
        "this vocabulary generated by a layer with incompatible "
        "settings?" %
        (self.mask_token, self.oov_token, self.mask_token, self.oov_token,
         oov_start, oov_end, vocab[oov_start:oov_end]))

  if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
    raise ValueError(
        "Invalid vocabulary format. The layer was created with "
        "`mask_token=%s` and `oov_token=%s`. These tokens should be "
        "included in the provided vocabulary. The passed vocabulary has "
        "the correct OOV token `%s` at indices [%s:%s], but does not have "
        "the mask token `%s` in index 0. Instead, we found `%s`. Was this "
        "vocabulary generated by a layer with incompatible settings?" %
        (self.mask_token, self.oov_token, self.oov_token, oov_start,
         oov_end, self.mask_token, vocab[0]))

  special_tokens = [] if self.mask_token is None else [self.mask_token]
  special_tokens.extend([self.oov_token] * self.num_oov_indices)

  insert_special_tokens = special_tokens and not has_oov and not has_mask
  num_special_tokens = len(special_tokens)
  tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
  if self.mask_token in tokens:
    raise ValueError(
        "Reserved mask token %s was found in the passed vocabulary at "
        "index %s. Please either remove the reserved token from the "
        "vocabulary or change the mask token for this layer." %
        (self.mask_token, tokens.index(self.mask_token)))
  if self.oov_token in tokens:
    raise ValueError(
        "Reserved OOV token %s was found in the passed vocabulary at "
        "index %s. Please either remove the reserved token from the "
        "vocabulary or change the OOV token for this layer." %
        (self.oov_token, tokens.index(self.oov_token)))

  total_vocab_size = len(tokens) + num_special_tokens
  if self.max_tokens is not None and total_vocab_size > self.max_tokens:
    raise ValueError(
        "Attempted to set a vocabulary larger than the maximum vocab size. "
        "Passed vocab size is %s, max vocab size is %s." %
        (total_vocab_size, self.max_tokens))

  self._table_handler.clear()
  if insert_special_tokens:
    start_index = num_special_tokens
    values = np.arange(start_index, len(tokens) + start_index, dtype=np.int64)
    self._table_handler.insert(tokens, values)
    special_token_values = np.arange(num_special_tokens, dtype=np.int64)
    self._table_handler.insert(special_tokens, special_token_values)
  else:
    values = np.arange(len(vocab), dtype=np.int64)
    self._table_handler.insert(vocab, values)

  if self.output_mode == TFIDF:
    if idf_weights is None:
      raise ValueError("`idf_weights` must be set if output_mode is TFIDF.")
    if len(vocab) != len(idf_weights):
      raise ValueError(
          "`idf_weights` must be the same length as vocab. "
          "len(idf_weights) is %s, len(vocab) is %s." %
          (len(idf_weights), len(vocab)))
    idf_weights = self._convert_to_ndarray(idf_weights)
    if idf_weights.ndim != 1:
      raise ValueError(
          "TF-IDF data must be a 1-dimensional array, but received {}".format(
              type(idf_weights)))

    # If we inserted special tokens into the vocab, we need to pad the front
    # of idf_weights. We don't have real document frequencies for these
    # tokens, so we will use an average of all idf_weights passed in as a
    # reasonable default.
    if insert_special_tokens:
      front_padding = num_special_tokens
      front_padding_value = np.average(idf_weights)
    else:
      front_padding = 0
      front_padding_value = 0
    # If pad_to_max_tokens is true and max_tokens is greater than our total
    # vocab size, we need to pad the back of idf_weights with zeros as well.
    back_padding_value = 0
    if self.pad_to_max_tokens and self.max_tokens is not None:
      back_padding = self.max_tokens - total_vocab_size
    else:
      back_padding = 0
    idf_weights = np.pad(
        idf_weights, (front_padding, back_padding),
        "constant",
        constant_values=(front_padding_value, back_padding_value))
    K.set_value(self.tf_idf_weights, idf_weights)

  return total_vocab_size
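# A hypothetical worked example (not part of this layer) of the idf_weights
# padding performed above: with one mask token and one OOV index inserted,
# and pad_to_max_tokens enabled, the front is padded with the average IDF
# weight (a stand-in for the unknown document frequencies of the special
# tokens) and the back is zero-padded out to max_tokens.
def _demo_idf_weight_padding():
  import numpy as np

  idf_weights = np.array([0.4, 0.25, 0.75])  # One weight per vocab token.
  num_special_tokens = 2  # Hypothetical: [mask_token] + 1 OOV index.
  max_tokens = 7
  total_vocab_size = len(idf_weights) + num_special_tokens

  padded = np.pad(
      idf_weights,
      (num_special_tokens, max_tokens - total_vocab_size),
      "constant",
      constant_values=(np.average(idf_weights), 0))
  # Yields roughly [0.467, 0.467, 0.4, 0.25, 0.75, 0.0, 0.0].
  return padded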