def test_generate_remapping_with_offset(self): """Tests offset and num_new_vocab logic.""" remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping( new_vocab_file=self.new_vocab_file, old_vocab_file=self.old_vocab_file, num_new_vocab=1, new_vocab_offset=1) expected_remapping = [0] expected_num_present = 1 with self.test_session(): self.assertAllEqual(expected_remapping, remapping.eval()) self.assertAllEqual(expected_num_present, num_present.eval())
def test_generate_remapping_with_shifted_vocab(self): """Tests where vocab is the same, but shifted / ordered differently.""" remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping( new_vocab_file=self.new_vocab_file, old_vocab_file=self.old_vocab_file, num_new_vocab=3, new_vocab_offset=0) expected_remapping = [2, 0, 1] expected_num_present = 3 with self.test_session(): self.assertAllEqual(expected_remapping, remapping.eval()) self.assertAllEqual(expected_num_present, num_present.eval())
def test_generate_remapping_with_no_vocab_changes(self): """Tests where vocab does not change at all.""" remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping( new_vocab_file=self.old_vocab_file, old_vocab_file=self.old_vocab_file, num_new_vocab=3, new_vocab_offset=0) expected_remapping = range(0, 3) expected_num_present = 3 with self.test_session(): self.assertAllEqual(expected_remapping, remapping.eval()) self.assertAllEqual(expected_num_present, num_present.eval())
def _load_and_remap_matrix(ckpt_path, old_tensor_name, new_row_vocab_offset, num_rows_to_load, new_col_vocab_size, initializer, old_row_vocab_file=None, new_row_vocab_file=None, old_col_vocab_file=None, new_col_vocab_file=None, num_row_oov_buckets=0, num_col_oov_buckets=0): """Loads a 2-D (matrix) `Tensor` from checkpoint. Generates 1D-remappings for rows and columns using the `GenerateVocabRemapping` op, and initializes any anticipated values with the provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a matrix that loads existing values from the checkpoint, while filling out "missing" values with the newly initialized values. See contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped functionality (LoadAndRemapMatrix). This wrapper can be used to perform only row remapping or only col remapping. If only row remapping is desired, {new,old}_col_vocab_file should be `None`, and vice versa for column remapping. NOTE: This only supports div-partitioning the vocabulary on the 1st dimension (row axis) via `new_row_vocab_offset`. Args: ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from which the old matrix `Tensor` will be loaded. old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. new_row_vocab_offset: A 0-indexed integer representing what line to start reading at in the new row vocabulary. Used for partitioned variables. num_rows_to_load: Number of rows to load for the new vocabulary (note: to support variable partitioning and partial loading, this does not need to be the same as the number of entries in `new_row_vocab_file`). new_col_vocab_size: Number of columns to load - should be the same as the number of entries in `new_col_vocab_file`, since we don't support partitioning along the column axis. initializer: Callable initializer function that accepts a 1-D tensor as the arg to specify the shape of the returned tensor. Used to initialize missing values. old_row_vocab_file: A scalar `Tensor` of type `string` containing the path to the old row vocabulary file. Can be None, which represents no remapping on the row axis. new_row_vocab_file: A scalar `Tensor` of type `string` containing the path to the new row vocabulary file. Can be None, which represents no remapping on the row axis - in which case, `new_row_vocab_offset` and `num_rows_to_load` work under the assumption that the new row vocab is the same as the old row vocab. old_col_vocab_file: A scalar `Tensor` of type `string` containing the path to the old column vocabulary file. Can be None, which represents no remapping on the column axis. new_col_vocab_file: A scalar `Tensor` of type `string` containing the path to the new column vocabulary file. Can be None, which represents no remapping on the column axis - in which case, `new_col_vocab_size` works under the assumption that the new col vocab is the same as the old col vocab. num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows to append. Must be >= 0. num_col_oov_buckets: `int` specifying the number of out-of-vocabulary columns to append. Must be >= 0. Returns: A Tensor of shape `[num_rows_to_load + num_row_oov_buckets, new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the specified tensor in the checkpoint, and any missing or OOV values initialized with the given `initializer`. Raises: ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0. ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is provided, while the other is not. Same for `old_col_vocab_file` and `new_col_vocab_file`. ValueError: If neither row vocabs or col vocabs are provided. """ if num_row_oov_buckets < 0: raise ValueError("num_row_oov_buckets must be >= 0, but received %d" % num_row_oov_buckets) if num_col_oov_buckets < 0: raise ValueError("num_col_oov_buckets must be >= 0, but received %d" % num_col_oov_buckets) if bool(old_row_vocab_file) != bool(new_row_vocab_file): raise ValueError( "old_row_vocab_file and new_row_vocab_file must both be specified or " "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'". format(old_row_vocab_file, new_row_vocab_file)) if bool(old_col_vocab_file) != bool(new_col_vocab_file): raise ValueError( "old_col_vocab_file and new_col_vocab_file must both be specified or " "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'". format(old_col_vocab_file, new_col_vocab_file)) remap_rows = new_row_vocab_file and old_row_vocab_file remap_cols = new_col_vocab_file and old_col_vocab_file if not (remap_rows or remap_cols): raise ValueError( "Must provide either row or column vocab files. If no remapping is " "necessary, consider using `tf.contrib.framework.init_from_checkpoint` " "instead.") num_rows_present = num_rows_to_load if remap_rows: row_remapping, num_rows_present = ( gen_checkpoint_ops.generate_vocab_remapping( new_vocab_file=new_row_vocab_file, old_vocab_file=old_row_vocab_file, new_vocab_offset=new_row_vocab_offset, num_new_vocab=num_rows_to_load)) else: # Even when the rows are not being reordered, we still need to generate a # remapping to account for initializing partitioned Variables (when # new_row_vocab_offset is non-zero). row_remapping = math_ops.range( new_row_vocab_offset, new_row_vocab_offset + num_rows_to_load, dtype=dtypes.int64) col_remapping = [] num_cols_present = new_col_vocab_size if remap_cols: col_remapping, num_cols_present = ( gen_checkpoint_ops.generate_vocab_remapping( new_vocab_file=new_col_vocab_file, old_vocab_file=old_col_vocab_file, new_vocab_offset=0, # Offset is unused for cols (no partitioning). num_new_vocab=new_col_vocab_size)) init_vals = initializer([ num_rows_to_load * new_col_vocab_size - num_rows_present * num_cols_present, 1 ]) return_tensor = gen_checkpoint_ops.load_and_remap_matrix( ckpt_path=ckpt_path, old_tensor_name=old_tensor_name, row_remapping=row_remapping, col_remapping=col_remapping, initializing_values=init_vals, num_rows=num_rows_to_load, num_cols=new_col_vocab_size) # Add OOV row(s) and column(s). if num_row_oov_buckets > 0: init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size]) init_row_oov_val = ops.convert_to_tensor(init_row_oov_val) return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0) if num_col_oov_buckets > 0: # We need to add any row OOV to the new column shape. init_col_oov_val = initializer( [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets]) init_col_oov_val = ops.convert_to_tensor(init_col_oov_val) return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1) return return_tensor