def _ragged_as_leaf_node(
    ragged_tensor: tf.RaggedTensor,
    is_repeated: bool,
    reference_ragged_tensor: tf.RaggedTensor,
    options: calculate_options.Options) -> prensor.LeafNodeTensor:
    """Wraps a ragged tensor in a `prensor.LeafNodeTensor`.

    The row ids of `ragged_tensor` become the parent index of the leaf and the
    flat values of `ragged_tensor` become the leaf values.

    Args:
      ragged_tensor: the ragged tensor to convert.
      is_repeated: the `is_repeated` flag of the resulting leaf. When False and
        `options.ragged_checks` is set, an assertion that the row ids are
        strictly increasing is attached.
      reference_ragged_tensor: a ragged tensor whose number of rows
        `ragged_tensor` must match.
      options: calculate options; `options.ragged_checks` enables the runtime
        assertions.

    Returns:
      A LeafNodeTensor built from `ragged_tensor`.

    Raises:
      ValueError: if both leading dimensions are statically known and differ.
    """
    static_rows = tf.compat.dimension_at_index(ragged_tensor.shape, 0).value
    static_reference_rows = tf.compat.dimension_at_index(
        reference_ragged_tensor.shape, 0).value

    checks = []
    both_static = (static_rows is not None and
                   static_reference_rows is not None)
    if both_static:
        # Sizes are known while building: fail immediately on a mismatch.
        if static_rows != static_reference_rows:
            raise ValueError("Returned ragged tensor is not the right size.")
    elif options.ragged_checks:
        # At least one size is unknown: defer the comparison to run time.
        checks.append(
            tf.assert_equal(ragged_tensor.nrows(),
                            reference_ragged_tensor.nrows()))

    if not is_repeated:
        row_ids = ragged_tensor.value_rowids()
        if options.ragged_checks:
            # Strictly increasing row ids mean no row id occurs twice.
            checks.append(
                tf.compat.v1.assert_positive(row_ids[1:] - row_ids[:-1]))

    def _make_leaf():
        return prensor.LeafNodeTensor(ragged_tensor.value_rowids(),
                                      ragged_tensor.values, is_repeated)

    if not checks:
        return _make_leaf()
    # Build the leaf under the assertions so they are control dependencies.
    with tf.control_dependencies(checks):
        return _make_leaf()
def testMultipleColumnsTwoRowGroupsAndEqualBatchSize_OutputsPrensor(self):
    """Tests that the correct prensor for three columns is outputted.

    Only the first batch produced by the dataset is compared against the
    expected prensor; the remaining batches are still iterated. The test fails
    if the dataset produces no batch at all, so it cannot pass without having
    compared anything.
    """
    pq_ds = parquet.ParquetDataset(
        filenames=self._rowgroup_test_filenames,
        value_paths=["DocId", "Name.Language.Code", "Name.Language.Country"],
        batch_size=2)
    expected_prensor = prensor.create_prensor_from_descendant_nodes({
        path.Path([]):
            prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64)),
        path.Path(["DocId"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 1], dtype=tf.int64),
                tf.constant([10, 20], dtype=tf.int64), True),
        path.Path(["Name"]):
            prensor.ChildNodeTensor(
                tf.constant([0, 0, 0, 1], dtype=tf.int64), True),
        path.Path(["Name", "Language"]):
            prensor.ChildNodeTensor(
                tf.constant([0, 0, 2], dtype=tf.int64), True),
        path.Path(["Name", "Language", "Code"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 1, 2], dtype=tf.int64),
                tf.constant([b"en-us", b"en", b"en-gb"]), True),
        path.Path(["Name", "Language", "Country"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 2], dtype=tf.int64),
                tf.constant([b"us", b"gb"]), True),
    })

    first_batch_checked = False
    for i, pren in enumerate(pq_ds):
        if i == 0:
            self._assertPrensorEqual(pren, expected_prensor)
            first_batch_checked = True
    # Without this check an empty dataset would make the test pass vacuously.
    self.assertTrue(first_batch_checked,
                    "ParquetDataset produced no batches; nothing was verified.")
def _ragged_as_leaf_node(ragged_tensor, is_repeated, reference_ragged_tensor,
                         options):
    """Creates a ragged tensor as a leaf node.

    The row ids of `ragged_tensor` become the parent index of the leaf and the
    flat values of `ragged_tensor` become the leaf values.

    Args:
      ragged_tensor: the ragged tensor to convert.
      is_repeated: the `is_repeated` flag of the resulting leaf. When False and
        `options.ragged_checks` is set, an assertion that the row ids are
        strictly increasing is attached.
      reference_ragged_tensor: a ragged tensor whose number of rows
        `ragged_tensor` must match.
      options: calculate options; `options.ragged_checks` enables the runtime
        assertions.

    Returns:
      A prensor.LeafNodeTensor built from `ragged_tensor`.

    Raises:
      ValueError: if both leading dimensions are statically known and differ.
    """
    assertions = []
    # Read the static leading dimension via tf.compat.dimension_at_index, which
    # always yields a Dimension. Indexing the shape directly and reading
    # `.value` breaks when TensorShape indexing returns a plain int/None (TF2
    # shape behavior), because those have no `.value` attribute.
    size_dim = tf.compat.dimension_at_index(ragged_tensor.shape, 0).value
    reference_size_dim = tf.compat.dimension_at_index(
        reference_ragged_tensor.shape, 0).value
    if size_dim is not None and reference_size_dim is not None:
        # Both sizes are known statically: fail immediately on a mismatch.
        if size_dim != reference_size_dim:
            raise ValueError("Returned ragged tensor is not the right size.")
    elif options.ragged_checks:
        # At least one size is unknown: defer the comparison to run time.
        assertions.append(
            tf.assert_equal(ragged_tensor.nrows(),
                            reference_ragged_tensor.nrows()))

    if not is_repeated:
        rowids = ragged_tensor.value_rowids()
        if options.ragged_checks:
            # Strictly increasing row ids mean no row id occurs twice.
            assertions.append(
                tf.compat.v1.assert_positive(rowids[1:] - rowids[:-1]))

    if assertions:
        # Build the leaf under the assertions so they are control dependencies.
        with tf.control_dependencies(assertions):
            parent_index = ragged_tensor.value_rowids()
            return prensor.LeafNodeTensor(parent_index, ragged_tensor.values,
                                          is_repeated)
    parent_index = ragged_tensor.value_rowids()
    return prensor.LeafNodeTensor(parent_index, ragged_tensor.values,
                                  is_repeated)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
    """Recomputes the origin node against a parent whose elements were joined.

    Args:
      sources: exactly two node tensors, `[origin, parent]`. The origin must
        not be a RootNodeTensor and the parent must be a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor (with gathered values) when the origin is a leaf,
      otherwise a ChildNodeTensor carrying the join's index-to-value mapping.
      Either result uses `self.is_repeated`.
    """
    origin_node, parent_node = sources
    # Recalculating a RootNodeTensor is never expected here.
    assert not isinstance(origin_node, prensor.RootNodeTensor), origin_node
    # The parent must be a ChildNodeTensor because
    #   a) a leaf node cannot have a submessage, and
    #   b) nothing can be broadcast into a root.
    assert isinstance(parent_node, prensor.ChildNodeTensor), parent_node

    # The parent's `index_to_value` records which child nodes were duplicated,
    # and therefore which origin values must be duplicated as well; joining it
    # with the origin's parent index yields the new parent index plus the
    # positions of the origin elements to pick.
    joined = struct2tensor_ops.equi_join_any_indices(
        parent_node.index_to_value, origin_node.parent_index)
    new_parent_index, picked_positions = joined

    if not isinstance(origin_node, prensor.LeafNodeTensor):
        return prensor.ChildNodeTensor(new_parent_index, self.is_repeated,
                                       picked_positions)
    picked_values = tf.gather(origin_node.values, picked_positions)
    return prensor.LeafNodeTensor(new_parent_index, picked_values,
                                  self.is_repeated)
def calculate(self, sources, destinations, options):
    """Counts, for each parent element, how many origin elements refer to it.

    Args:
      sources: exactly two node tensors, `[origin, origin_parent]`. The origin
        must be a LeafNodeTensor or ChildNodeTensor; the parent must be a
        ChildNodeTensor or RootNodeTensor.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A non-repeated LeafNodeTensor with one entry per parent element: parent
      index `0..size-1` and, as values, the per-parent counts.

    Raises:
      ValueError: if either source has an unsupported node type.
    """
    origin_node, parent_node = sources
    if not isinstance(origin_node,
                      (prensor.LeafNodeTensor, prensor.ChildNodeTensor)):
        raise ValueError(
            "origin_value must be a LeafNodeTensor or a ChildNodeTensor, "
            "but was a " + str(type(origin_node)))
    if not isinstance(parent_node,
                      (prensor.ChildNodeTensor, prensor.RootNodeTensor)):
        raise ValueError("origin_parent_value must be a ChildNodeTensor "
                         "or a RootNodeTensor, but was a " +
                         str(type(parent_node)))

    child_to_parent = origin_node.parent_index
    parent_count = parent_node.size

    # One int64 `1` per origin element ...
    ones = tf.ones(tf.shape(child_to_parent), dtype=tf.int64)
    # ... scattered into the slot of that element's parent. As the original
    # author noted, this increments the size by 1 for each element and is not
    # the fastest way to do it.
    scatter_indices = tf.expand_dims(child_to_parent, 1)
    counts = tf.scatter_nd(scatter_indices, ones,
                           tf.reshape(parent_count, [1]))

    # The result has exactly one value per parent: parent index 0, 1, ..., n-1.
    one_per_parent = tf.range(parent_count, dtype=tf.int64)
    return prensor.LeafNodeTensor(one_per_parent, counts, False)
def calculate_from_parsed_field(
    self,
    parsed_field: struct2tensor_ops._ParsedField,  # pylint: disable=protected-access
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options) -> prensor.NodeTensor:
    """Builds this expression's leaf node from an already parsed field.

    Args:
      parsed_field: the parsed field; its `index` becomes the parent index and
        its `value` becomes the leaf values.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A LeafNodeTensor carrying `self.is_repeated`.
    """
    parent_index = parsed_field.index
    leaf_values = parsed_field.value
    return prensor.LeafNodeTensor(parent_index, leaf_values, self.is_repeated)
def _to_leaf_prensor_helper(rt: tf.RaggedTensor,
                            default_field_name: path.Step) -> prensor.Prensor:
    """Converts a fully partitioned ragged tensor to a leaf prensor.

    The ragged tensor is assumed to be fully partitioned, i.e. the innermost
    values are a vector rather than a 2D tensor. The function recurses on
    `rt.values` until a ragged rank of 1 is reached.

    Args:
      rt: a fully partitioned ragged tensor (see
        _fully_partitioned_ragged_tensor).
      default_field_name: a path.Step for unnamed dimensions.

    Returns:
      a prensor, with a leaf as the root node.
    """
    partition = rt._row_partition  # pylint: disable=protected-access
    if rt.ragged_rank != 1:
        # More ragged dimensions remain: convert the inner ragged tensor first
        # and hang the result under a node built from this level's partition.
        inner_prensor = _to_leaf_prensor_helper(rt.values, default_field_name)
        return _one_child_prensor(partition, inner_prensor,
                                  default_field_name)

    # Innermost level: the flat values become a repeated leaf.
    flat_values = rt.values
    leaf_node = prensor.LeafNodeTensor(partition.value_rowids(), flat_values,
                                       True)
    return prensor.create_prensor_from_root_and_children(leaf_node, {})
def calculate(self, sources, destinations, options):
    """Produces a leaf holding the positional index of each origin element.

    Args:
      sources: exactly one node tensor, the origin. It must be a
        LeafNodeTensor or a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A LeafNodeTensor that shares the origin's parent index and whose values
      are `prensor_util.get_positional_index(origin)`.

    Raises:
      ValueError: if the origin is neither a leaf nor a child node.
    """
    (origin,) = sources
    supported = (prensor.LeafNodeTensor, prensor.ChildNodeTensor)
    if not isinstance(origin, supported):
        raise ValueError("Cannot calculate the positional index of the root")
    return prensor.LeafNodeTensor(origin.parent_index,
                                  prensor_util.get_positional_index(origin),
                                  self.is_repeated)
def _as_leaf_node_no_checks(sparse_tensor, is_repeated):
    """Builds a LeafNodeTensor from a SparseTensor without any validation.

    Args:
      sparse_tensor: the SparseTensor supplying `indices` and `values`.
      is_repeated: whether the leaf is repeated. This also selects how the
        parent index is derived from `sparse_tensor.indices`.

    Returns:
      A prensor.LeafNodeTensor holding `sparse_tensor.values`.
    """
    coordinates = sparse_tensor.indices
    if not is_repeated:
        # Flatten the indices into a vector.
        # NOTE(review): presumably the indices have a single column in this
        # case -- confirm against callers.
        parent_index = tf.reshape(coordinates, [-1])
    else:
        # Row 0 of the transposed indices, i.e. the first coordinate of every
        # entry.
        parent_index = tf.transpose(coordinates)[0]
    return prensor.LeafNodeTensor(parent_index, sparse_tensor.values,
                                  is_repeated)
def calculate(self, sources, destinations, options):
    """Applies `self._operation` to the values of all source leaves.

    Args:
      sources: the source node tensors; each is passed through
        `_leaf_node_or_error`.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A LeafNodeTensor that reuses the parent index of the first source and
      holds the result of `self._operation` applied to the sources' values,
      flagged with `self._is_repeated`.
    """
    leaves = [_leaf_node_or_error(source) for source in sources]
    per_source_values = [leaf.values for leaf in leaves]
    # TODO(martinz): Check that:
    #   source_values have equal parent_index.
    #   output_value has the same size as the input.
    shared_parent_index = leaves[0].parent_index
    mapped_values = self._operation(*per_source_values)
    return prensor.LeafNodeTensor(shared_parent_index, mapped_values,
                                  self._is_repeated)
def testPromoteAndProjectExpression(self):
    """Checks promote + project on expressions created from a parquet file.

    `Name.Language.Code` is promoted to `Name.new_code`; that field and `DocId`
    are projected separately and both are calculated from the parquet file.
    Every element produced is compared against the expected prensors, and the
    test fails if nothing is produced, so it cannot pass without having
    compared anything.
    """
    filenames = [
        "struct2tensor/testdata/parquet_testdata/dremel_example.parquet"
    ]
    batch_size = 2
    exp = parquet.create_expression_from_parquet_file(filenames)
    new_exp = promote.promote(exp, path.Path(["Name", "Language", "Code"]),
                              "new_code")
    new_code_project_exp = project.project(
        new_exp, [path.Path(["Name", "new_code"])])
    docid_project_exp = project.project(exp, [path.Path(["DocId"])])
    pqds = parquet.calculate_parquet_values(
        [new_code_project_exp, docid_project_exp], exp, filenames, batch_size)

    new_code_expected = prensor.create_prensor_from_descendant_nodes({
        path.Path([]):
            prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64)),
        path.Path(["Name"]):
            prensor.ChildNodeTensor(
                tf.constant([0, 0, 0, 1], dtype=tf.int64), True),
        path.Path(["Name", "new_code"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 0, 2], dtype=tf.int64),
                tf.constant([b"en-us", b"en", b"en-gb"]), True),
    })
    docid_expected = prensor.create_prensor_from_descendant_nodes({
        path.Path([]):
            prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64)),
        path.Path(["DocId"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 1], dtype=tf.int64),
                tf.constant([10, 20], dtype=tf.int64), False),
    })

    num_checked = 0
    for num_checked, ele in enumerate(pqds, start=1):
        # Each element holds the results in the order the expressions were
        # passed to calculate_parquet_values.
        new_code_pren = ele[0]
        docid_pren = ele[1]
        self._assertPrensorEqual(new_code_pren, new_code_expected)
        self._assertPrensorEqual(docid_pren, docid_expected)
    # Without this check an empty dataset would make the test pass vacuously.
    self.assertGreater(
        num_checked, 0,
        "calculate_parquet_values produced no elements; nothing was verified.")
def testPlaceholderExpression(self):
    """Checks promote + project on a placeholder fed with a nested prensor.

    A placeholder expression is created from a schema, `user.friends` is
    promoted to `new_friends` and projected, and the calculated node is
    compared field by field with the expected leaf.
    """
    pren = prensor_test_util.create_nested_prensor()
    expected_pren = prensor.create_prensor_from_descendant_nodes({
        path.Path([]):
            prensor.RootNodeTensor(tf.constant(3, dtype=tf.int64)),
        path.Path(["new_friends"]):
            prensor.LeafNodeTensor(
                tf.constant([0, 1, 1, 1, 2], dtype=tf.int64),
                tf.constant(["a", "b", "c", "d", "e"], dtype=tf.string),
                True),
    })

    doc_schema = {
        "is_repeated": True,
        "children": {
            "bar": {"is_repeated": True, "dtype": tf.string},
            "keep_me": {"is_repeated": False, "dtype": tf.bool},
        },
    }
    user_schema = {
        "is_repeated": True,
        "children": {
            "friends": {"is_repeated": True, "dtype": tf.string},
        },
    }
    root_schema = mpp.create_schema(
        is_repeated=True, children={"doc": doc_schema, "user": user_schema})

    exp = placeholder.create_expression_from_schema(root_schema)
    promote_exp = promote.promote(exp, path.Path(["user", "friends"]),
                                  "new_friends")
    project_exp = project.project(promote_exp, [path.Path(["new_friends"])])
    new_friends_exp = project_exp.get_descendant(path.Path(["new_friends"]))

    # Feed the concrete prensor in place of the placeholder expression.
    (res_node,) = calculate.calculate_values([new_friends_exp],
                                             feed_dict={exp: pren})[:1]
    exp_node = expected_pren.get_descendant(path.Path(["new_friends"])).node

    self.assertAllEqual(res_node.is_repeated, exp_node.is_repeated)
    self.assertAllEqual(res_node.values, exp_node.values)
    self.assertAllEqual(res_node.parent_index, exp_node.parent_index)
def calculate(self, sources, destinations, options, side_info=None):
    """Re-parents a leaf from its parent onto its grandparent.

    Args:
      sources: exactly two node tensors, `[origin, origin_parent]`; the origin
        must be a LeafNodeTensor and the parent a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor with the origin's values, whose parent index is the
      parent's own parent index looked up for every origin element.

    Raises:
      ValueError: if either source has the wrong node type.
    """
    origin_leaf, parent_node = sources
    if not isinstance(origin_leaf, prensor.LeafNodeTensor):
        raise ValueError("origin_value must be a leaf")
    if not isinstance(parent_node, prensor.ChildNodeTensor):
        raise ValueError("origin_parent_value must be a child node")

    # For each value: parent position -> grandparent position.
    grandparent_index = tf.gather(parent_node.parent_index,
                                  origin_leaf.parent_index)
    return prensor.LeafNodeTensor(grandparent_index, origin_leaf.values,
                                  self.is_repeated)
def calculate(self, sources, destinations, options):
    """Subtracts the parent's size from every positional index value.

    Args:
      sources: exactly two LeafNodeTensors, `[positional_index, size_value]`.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A LeafNodeTensor with the parent index of `positional_index` and values
      `positional_index.values - size`, where `size` is the entry of
      `size_value.values` selected by each element's parent index.

    Raises:
      ValueError: if either source is not a LeafNodeTensor.
    """
    index_leaf, size_leaf = sources
    if not isinstance(index_leaf, prensor.LeafNodeTensor):
        raise ValueError("positional_index must be a LeafNodeTensor")
    if not isinstance(size_leaf, prensor.LeafNodeTensor):
        raise ValueError("size_value must be a LeafNodeTensor")

    parent_index = index_leaf.parent_index
    # Look up the size entry belonging to each element's parent.
    size_for_element = tf.gather(size_leaf.values, parent_index)
    shifted_values = index_leaf.values - size_for_element
    return prensor.LeafNodeTensor(index_leaf.parent_index, shifted_values,
                                  self.is_repeated)
def calculate(self, sources, destinations, options, side_info=None):
    """Builds a non-repeated leaf from a root node.

    Args:
      sources: exactly one node tensor, which must be a RootNodeTensor.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor (is_repeated=False) whose parent index comes from
      `_get_proto_index_parent_index(root_node)` and whose values come from
      `_get_input_proto_index(root_node)`.

    Raises:
      ValueError: if the source is not a RootNodeTensor.
    """
    (root_node,) = sources
    # This check ensures not just that the value can be calculated, but also
    # that no "improper" reroots were done.
    if not isinstance(root_node, prensor.RootNodeTensor):
        raise ValueError(
            "Illegal operation: expected a true root node: got {}".format(
                str(root_node)))
    return prensor.LeafNodeTensor(
        _get_proto_index_parent_index(root_node),
        _get_input_proto_index(root_node),
        is_repeated=False)
def create_repeated_leaf_node(parent_index, values):
    """Creates a repeated leaf node from plain Python lists.

    Args:
      parent_index: a list of integers that is converted to a 1-D int64 tensor.
      values: a list of whatever type the field represents; converted with
        `tf.constant`.

    Returns:
      A prensor.LeafNodeTensor with the given parent index and values and
      is_repeated set to True.
    """
    parent_index_tensor = tf.constant(parent_index, dtype=tf.int64)
    values_tensor = tf.constant(values)
    return prensor.LeafNodeTensor(parent_index_tensor, values_tensor, True)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
    """Produces a leaf holding the positional index of each origin element.

    Args:
      sources: exactly one node tensor, the origin. It must be a
        LeafNodeTensor or a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor that shares the origin's parent index and whose values
      are `prensor_util.get_positional_index(origin)`.

    Raises:
      ValueError: if the origin is neither a leaf nor a child node.
    """
    (origin,) = sources
    has_parent_index = isinstance(
        origin, (prensor.LeafNodeTensor, prensor.ChildNodeTensor))
    if not has_parent_index:
        raise ValueError("Cannot calculate the positional index of the root")
    return prensor.LeafNodeTensor(origin.parent_index,
                                  prensor_util.get_positional_index(origin),
                                  self.is_repeated)
def create_optional_leaf_node(parent_index: Sequence[int],
                              values: Sequence[Any]) -> prensor.LeafNodeTensor:
    """Creates an optional (non-repeated) leaf node from plain Python lists.

    Args:
      parent_index: a list of integers that is converted to a 1-D int64 tensor.
      values: a list of whatever type the field represents; converted with
        `tf.constant`.

    Returns:
      A prensor.LeafNodeTensor with the given parent index and values and
      is_repeated set to False.
    """
    parent_index_tensor = tf.constant(parent_index, dtype=tf.int64)
    values_tensor = tf.constant(values)
    return prensor.LeafNodeTensor(parent_index_tensor, values_tensor, False)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
    """Applies `self._operation` to the values of all source leaves.

    Args:
      sources: the source node tensors; each is passed through
        `_leaf_node_or_error`.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor that reuses the parent index of the first source and
      holds the result of `self._operation` applied to the sources' values,
      flagged with `self._is_repeated`.
    """
    leaves = [_leaf_node_or_error(source) for source in sources]
    per_source_values = [leaf.values for leaf in leaves]
    # TODO(martinz): Check that:
    #   source_values have equal parent_index.
    #   output_value has the same size as the input.
    shared_parent_index = leaves[0].parent_index
    mapped_values = self._operation(*per_source_values)
    return prensor.LeafNodeTensor(shared_parent_index, mapped_values,
                                  self._is_repeated)
def _filter_by_parent_indices_to_keep(node_value, parent_indices_to_keep):
    """Filters a node down to the elements whose parent is being kept.

    Args:
      node_value: a ChildNodeTensor or LeafNodeTensor to filter.
      parent_indices_to_keep: the indices of the parent elements that survive;
        joined against `node_value.parent_index`.

    Returns:
      A `_FilterChildNodeTensor` for child nodes, or a LeafNodeTensor with the
      gathered values for leaves. Both keep `node_value.is_repeated`.

    Raises:
      ValueError: if `node_value` is neither a child nor a leaf node.
    """
    # The join yields the new parent index together with the positions of the
    # elements of `node_value` that are kept.
    joined = struct2tensor_ops.equi_join_indices(parent_indices_to_keep,
                                                 node_value.parent_index)
    new_parent_index, kept_positions = joined

    if isinstance(node_value, prensor.ChildNodeTensor):
        return _FilterChildNodeTensor(new_parent_index,
                                      node_value.is_repeated, kept_positions)
    if isinstance(node_value, prensor.LeafNodeTensor):
        kept_values = tf.gather(node_value.values, kept_positions)
        return prensor.LeafNodeTensor(new_parent_index, kept_values,
                                      node_value.is_repeated)
    raise ValueError("Unknown NodeValue type")
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
    """Re-parents a leaf from its parent onto its grandparent.

    Args:
      sources: exactly two node tensors, `[origin, origin_parent]`; the origin
        must be a LeafNodeTensor and the parent a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor with the origin's values, whose parent index is the
      parent's own parent index looked up for every origin element.

    Raises:
      ValueError: if either source has the wrong node type.
    """
    origin_leaf, parent_node = sources
    if not isinstance(origin_leaf, prensor.LeafNodeTensor):
        raise ValueError("origin_value must be a leaf")
    if not isinstance(parent_node, prensor.ChildNodeTensor):
        raise ValueError("origin_parent_value must be a child node")

    # For each value: parent position -> grandparent position.
    grandparent_index = tf.gather(parent_node.parent_index,
                                  origin_leaf.parent_index)
    return prensor.LeafNodeTensor(grandparent_index, origin_leaf.values,
                                  self.is_repeated)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
    """Subtracts the parent's size from every positional index value.

    Args:
      sources: exactly two LeafNodeTensors, `[positional_index, size_value]`.
      destinations: not used by this method.
      options: not used by this method.
      side_info: not used by this method.

    Returns:
      A LeafNodeTensor with the parent index of `positional_index` and values
      `positional_index.values - size`, where `size` is the entry of
      `size_value.values` selected by each element's parent index.

    Raises:
      ValueError: if either source is not a LeafNodeTensor.
    """
    index_leaf, size_leaf = sources
    if not isinstance(index_leaf, prensor.LeafNodeTensor):
        raise ValueError("positional_index must be a LeafNodeTensor")
    if not isinstance(size_leaf, prensor.LeafNodeTensor):
        raise ValueError("size_value must be a LeafNodeTensor")

    parent_index = index_leaf.parent_index
    # Look up the size entry belonging to each element's parent.
    size_for_element = tf.gather(size_leaf.values, parent_index)
    shifted_values = index_leaf.values - size_for_element
    return prensor.LeafNodeTensor(index_leaf.parent_index, shifted_values,
                                  self.is_repeated)
def _filter_by_self_indices_to_keep(node_value, self_indices_to_keep):
    """Keeps only the elements of a node at the given positions.

    Args:
      node_value: a RootNodeTensor, ChildNodeTensor or LeafNodeTensor.
      self_indices_to_keep: positions of the elements of `node_value` to keep.

    Returns:
      A `_FilterRootNodeTensor` sized by the number of kept indices for a
      root, a `_FilterChildNodeTensor` for a child node, or a LeafNodeTensor
      with gathered parent index and values for a leaf.

    Raises:
      ValueError: if `node_value` is none of the supported node types.
    """
    if isinstance(node_value, prensor.RootNodeTensor):
        # A root has no parent index; only its size changes.
        return _FilterRootNodeTensor(
            tf.size(self_indices_to_keep), self_indices_to_keep)

    if not isinstance(node_value,
                      (prensor.ChildNodeTensor, prensor.LeafNodeTensor)):
        raise ValueError("Unknown NodeValue type")

    kept_parent_index = tf.gather(node_value.parent_index,
                                  self_indices_to_keep)
    if isinstance(node_value, prensor.ChildNodeTensor):
        return _FilterChildNodeTensor(kept_parent_index,
                                      node_value.is_repeated,
                                      self_indices_to_keep)
    kept_values = tf.gather(node_value.values, self_indices_to_keep)
    return prensor.LeafNodeTensor(kept_parent_index, kept_values,
                                  node_value.is_repeated)
def get_mock_leaf(is_repeated,
                  my_type,
                  name=None,
                  source_expressions=None,
                  calculate_is_identity=False):
    """Gets a mock leaf expression.

    Args:
      is_repeated: the is_repeated of the expression and of its empty output.
      my_type: the type of the expression and the dtype of its empty output.
      name: a name of the expression.
      source_expressions: the source expressions of the expression; the first
        one supplies the output when `calculate_is_identity` is true.
      calculate_is_identity: true iff this should say it is the identity.

    Returns:
      A MockExpression.
    """
    if not calculate_is_identity:
        # Default output: an empty leaf of the requested type.
        calculate_output = prensor.LeafNodeTensor(
            tf.constant([], dtype=tf.int64),
            tf.constant([], dtype=my_type),
            is_repeated)
    else:
        # An identity expression reuses the output of its first source.
        calculate_output = source_expressions[0].calculate_output
    return MockExpression(
        is_repeated,
        my_type,
        name=name,
        source_expressions=source_expressions,
        calculate_output=calculate_output,
        calculate_is_identity=calculate_is_identity)
def calculate(self, sources, destinations, options):
    """Broadcasts the values of a leaf onto the elements of a sibling node.

    Args:
      sources: exactly two node tensors, `[origin, sibling]`; the origin must
        be a LeafNodeTensor and the sibling a ChildNodeTensor.
      destinations: not used by this method.
      options: not used by this method.

    Returns:
      A LeafNodeTensor whose parent index points at sibling elements and whose
      values are gathered from the origin, flagged with `self.is_repeated`.

    Raises:
      ValueError: if either source has the wrong node type.
    """
    origin_leaf, sibling_node = sources
    if not isinstance(origin_leaf, prensor.LeafNodeTensor):
        raise ValueError("origin not a LeafNodeTensor")
    if not isinstance(sibling_node, prensor.ChildNodeTensor):
        raise ValueError("sibling value is not a ChildNodeTensor")

    # Contract of the join, as stated by the original author: for each i, if
    # there exist exactly n values j such that
    #   sibling_node.parent_index[i] == origin_leaf.parent_index[j]
    # then there exist exactly n values k such that
    #   new_parent_index[k] = i
    #   new_values[k] = origin_leaf.values[j]
    # (ordering is also preserved).
    joined = struct2tensor_ops.equi_join_indices(sibling_node.parent_index,
                                                 origin_leaf.parent_index)
    new_parent_index, origin_positions = joined
    broadcast_values = tf.gather(origin_leaf.values, origin_positions)
    return prensor.LeafNodeTensor(new_parent_index, broadcast_values,
                                  self.is_repeated)
def get_mock_broken_leaf(declared_is_repeated,
                         declared_type,
                         actual_is_repeated,
                         actual_type,
                         name=None,
                         source_expressions=None,
                         calculate_is_identity=False):
    """Gets a leaf expression flexible enough not to typecheck.

    If declared_is_repeated != actual_is_repeated, or
    declared_type != actual_type, then this will not typecheck when
    _ExpressionNode.calculate() is called.

    Args:
      declared_is_repeated: the is_repeated of the expression.
      declared_type: the type of the expression.
      actual_is_repeated: the is_repeated of the NodeTensor.
      actual_type: the type of the NodeTensor.
      name: a name of the expression.
      source_expressions: the source expressions of the expression; the first
        one supplies the output when `calculate_is_identity` is true.
      calculate_is_identity: true iff this should say it is the identity.

    Returns:
      An expression.
    """
    if not calculate_is_identity:
        # Output built from the *actual* flags, which may contradict the
        # declared ones passed to MockExpression below.
        calculate_output = prensor.LeafNodeTensor(
            tf.constant([], dtype=tf.int64),
            tf.constant([], dtype=actual_type),
            actual_is_repeated)
    else:
        # An identity expression reuses the output of its first source.
        calculate_output = source_expressions[0].calculate_output
    return MockExpression(
        declared_is_repeated,
        declared_type,
        name=name,
        source_expressions=source_expressions,
        calculate_output=calculate_output,
        calculate_is_identity=calculate_is_identity)
def calculate_from_parsed_field(self, parsed_field, destinations):
    """Builds this expression's leaf node from an already parsed field.

    Args:
      parsed_field: the parsed field; its `index` becomes the parent index and
        its `value` becomes the leaf values.
      destinations: not used by this method.

    Returns:
      A prensor.LeafNodeTensor carrying `self.is_repeated`.
    """
    parent_index = parsed_field.index
    leaf_values = parsed_field.value
    return prensor.LeafNodeTensor(parent_index, leaf_values, self.is_repeated)
def calculate_from_parsed_field(
    self,
    parsed_field: struct2tensor_ops._ParsedField,
    destinations: Sequence[expression.Expression]
) -> prensor.NodeTensor:
    """Builds this expression's leaf node from an already parsed field.

    Args:
      parsed_field: the parsed field; its `index` becomes the parent index and
        its `value` becomes the leaf values.
      destinations: not used by this method.

    Returns:
      A LeafNodeTensor carrying `self.is_repeated`.
    """
    parent_index = parsed_field.index
    leaf_values = parsed_field.value
    return prensor.LeafNodeTensor(parent_index, leaf_values, self.is_repeated)