def testLayoutAndMeshShape(self):
  """Auto-picks a layout and mesh shape, then retries with a 1-D mesh cap."""
  # Same as previous test, but don't specify a 4x2 mesh.
  graph = mtf.Graph()
  mesh = mtf.Mesh(graph, "my_mesh")
  left = mtf.zeros(mesh, "a:10,b:5")
  right = mtf.zeros(mesh, "b:5,c:20")
  product = mtf.einsum([left, right], "a:10,c:20")

  layout, mesh_shape = mtf.auto_mtf.layout_and_mesh_shape(graph, 8, [product])

  dims = {name: mtf.convert_to_dimension((name, size))
          for name, size in (("a", 10), ("b", 5), ("c", 20))}

  # With 8 processors the optimizer settles on a 4x2 mesh: "c" is split over
  # the size-4 axis, "a" over the size-2 axis, and "b" stays unsplit.
  self.assertEqual(
      layout.tensor_dimension_to_mesh_axis(dims["a"], mesh_shape), 1)
  self.assertIsNone(
      layout.tensor_dimension_to_mesh_axis(dims["b"], mesh_shape))
  self.assertEqual(
      layout.tensor_dimension_to_mesh_axis(dims["c"], mesh_shape), 0)
  self.assertCountEqual(
      mesh_shape.dims,
      [mtf.Dimension("mesh_0", 4), mtf.Dimension("mesh_1", 2)])

  # Capping the mesh at one dimension forces a single mesh_0:8 axis, and no
  # tensor dimension ends up assigned to it.
  layout, mesh_shape = mtf.auto_mtf.layout_and_mesh_shape(
      graph, 8, [product], 1)
  for dim in dims.values():
    self.assertIsNone(layout.tensor_dimension_to_mesh_axis(dim, mesh_shape))
  self.assertCountEqual(mesh_shape.dims, [mtf.Dimension("mesh_0", 8)])
def testLayout(self):
  """Computes a layout for a fixed m1:4,m2:2 mesh and checks the assignment."""
  # Construct a Mesh TensorFlow graph and mesh.
  graph = mtf.Graph()
  mesh = mtf.Mesh(graph, "my_mesh")
  left = mtf.zeros(mesh, "a:10,b:5")
  right = mtf.zeros(mesh, "b:5,c:20")
  product = mtf.einsum([left, right], "a:10,c:20")

  # Decide on a mesh shape.
  mesh_shape = mtf.convert_to_shape("m1:4,m2:2")

  # Compute a layout based on the graph and mesh. Knowing the identity of the
  # outputs matters to the optimization, since outputs cannot be freed.
  layout = mtf.auto_mtf.layout(graph, mesh_shape, [product])

  a_dim = mtf.convert_to_dimension(("a", 10))
  b_dim = mtf.convert_to_dimension(("b", 5))
  c_dim = mtf.convert_to_dimension(("c", 20))

  # "a" lands on the size-2 axis, "c" on the size-4 axis; "b" is contracted
  # away and stays unassigned.
  self.assertEqual(layout.tensor_dimension_to_mesh_axis(a_dim, mesh_shape), 1)
  self.assertIsNone(layout.tensor_dimension_to_mesh_axis(b_dim, mesh_shape))
  self.assertEqual(layout.tensor_dimension_to_mesh_axis(c_dim, mesh_shape), 0)
def __init__(
    self,  # pylint: disable=super-init-not-called
    key_heads_dims=(("heads", 12),),
    softmax_heads_dims=(("heads", 12),),
    value_heads_dims=(("heads", 12),),
    key_size=64,
    value_size=64,
    dropout_rate=0.0,
    relative_attention_type=None,
    relative_attention_num_buckets=32,
    dynamic_projections=None,
    dynamic_projections_init_scale=1e-2):
  """Create a SelfAttention Layer.

  Args:
    key_heads_dims: a list of mtf.Dimension or (name, size) pairs
    softmax_heads_dims: a list of mtf.Dimension or (name, size) pairs
    value_heads_dims: a list of mtf.Dimension or (name, size) pairs
    key_size: an integer
    value_size: an integer
    dropout_rate: a float
    relative_attention_type: an optional string - one of
      (None, "bias", "bias_shared", "contextual")
    relative_attention_num_buckets: an integer
    dynamic_projections: an optional sequence containing a subset of
      ["x2l", "m2l", "x2w", "m2w"] (see class comments)
    dynamic_projections_init_scale: a float - initializer variance scaling
      factor for these dynamic projections.  We have observed learning
      difficulties when this value is too large.
  """
  # Normalize each (name, size) pair into an mtf.Dimension.
  def as_dims(pairs):
    return [mtf.convert_to_dimension(d) for d in pairs]

  self.key_heads_dims = as_dims(key_heads_dims)
  self.softmax_heads_dims = as_dims(softmax_heads_dims)
  self.value_heads_dims = as_dims(value_heads_dims)
  self.key_dim = mtf.Dimension("d_k", key_size)
  self.value_dim = mtf.Dimension("d_v", value_size)
  self.dropout_rate = dropout_rate
  self.relative_attention_type = relative_attention_type
  self.relative_attention_num_buckets = relative_attention_num_buckets
  # Treat None as "no dynamic projections".
  self.dynamic_projections = dynamic_projections or []
  self.dynamic_projections_init_scale = dynamic_projections_init_scale
def __init__(self, mesh, shape, dtype, name=None):
  """Create an IndicesOperation producing one output of shape + [ndim]."""
  super(IndicesOperation, self).__init__([], mesh, name=name or "indices")
  self._mesh = mesh
  self._shape = [mtf.convert_to_dimension(dim) for dim in shape]
  self._dtype = dtype
  # The trailing "ndim" axis holds one index coordinate per input dimension.
  ndim_dim = mtf.Dimension("ndim", len(self._shape))
  output_shape = mtf.Shape(self._shape + [ndim_dim])
  self._outputs = [mtf.Tensor(self, output_shape, dtype)]
def __init__(self,  # pylint: disable=super-init-not-called
             heads_dims=(("heads", 12),),
             dropout_rate=0.0,
             relative_attention_type=None,
             relative_attention_num_buckets=32):
  """Create a GeneralBilinearSelfAttention Layer.

  Args:
    heads_dims: a list of mtf.Dimension or (name, size) pairs
    dropout_rate: a float
    relative_attention_type: an optional string - one of
      (None, "bias", "bias_shared", "contextual")
    relative_attention_num_buckets: an integer
  """
  # Normalize each (name, size) pair into an mtf.Dimension.
  self.heads_dims = [mtf.convert_to_dimension(d) for d in heads_dims]
  self.dropout_rate = dropout_rate
  self.relative_attention_type = relative_attention_type
  self.relative_attention_num_buckets = relative_attention_num_buckets
def testConvertToDimensionGenericInputs(self):
  """convert_to_dimension passes None through and rejects unsupported types.

  Uses assertIsNone (identity check, clearer failure message) rather than
  assertEqual(..., None).
  """
  self.assertIsNone(mtf.convert_to_dimension(None))
  # A bare int is not a valid dimension spec.
  with self.assertRaises(TypeError):
    mtf.convert_to_dimension(5)
def testConvertToDimension(self, inputs):
  """Parameterized: any supported spec converts to Dimension("x", 5)."""
  result = mtf.convert_to_dimension(inputs)
  self.assertEqual("x", result.name)
  self.assertEqual(5, result.size)