Exemple #1
0
    def testMultipleColumnsTwoRowGroupsAndEqualBatchSize_OutputsPrensor(self):
        """Checks the first batch produced for three requested columns.

        The dataset is read from `self._rowgroup_test_filenames` with a batch
        size of 2. Every batch is iterated, but only the first one is compared
        against the expected prensor.
        """
        requested_paths = [
            "DocId", "Name.Language.Code", "Name.Language.Country"
        ]
        dataset = parquet.ParquetDataset(
            filenames=self._rowgroup_test_filenames,
            value_paths=requested_paths,
            batch_size=2)

        # Ordered (path, node) pairs; the order matches the mapping handed to
        # create_prensor_from_descendant_nodes.
        expected_nodes = [
            (path.Path([]),
             prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64))),
            (path.Path(["DocId"]),
             prensor.LeafNodeTensor(
                 tf.constant([0, 1], dtype=tf.int64),
                 tf.constant([10, 20], dtype=tf.int64), True)),
            (path.Path(["Name"]),
             prensor.ChildNodeTensor(
                 tf.constant([0, 0, 0, 1], dtype=tf.int64), True)),
            (path.Path(["Name", "Language"]),
             prensor.ChildNodeTensor(
                 tf.constant([0, 0, 2], dtype=tf.int64), True)),
            (path.Path(["Name", "Language", "Code"]),
             prensor.LeafNodeTensor(
                 tf.constant([0, 1, 2], dtype=tf.int64),
                 tf.constant([b"en-us", b"en", b"en-gb"]), True)),
            (path.Path(["Name", "Language", "Country"]),
             prensor.LeafNodeTensor(
                 tf.constant([0, 2], dtype=tf.int64),
                 tf.constant([b"us", b"gb"]), True)),
        ]
        expected = prensor.create_prensor_from_descendant_nodes(
            dict(expected_nodes))

        for batch_index, batch in enumerate(dataset):
            # Later batches are consumed but intentionally not compared.
            if batch_index != 0:
                continue
            self._assertPrensorEqual(batch, expected)
Exemple #2
0
    def calculate(
            self,
            sources: Sequence[prensor.NodeTensor],
            destinations: Sequence[expression.Expression],
            options: calculate_options.Options,
            side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
        """Recomputes the origin node against its already-broadcasted parent.

        Args:
          sources: Exactly two node tensors: the origin node followed by its
            parent node.
          destinations: Not read by this method.
          options: Not read by this method.
          side_info: Not read by this method.

        Returns:
          A LeafNodeTensor whose values are gathered from the origin's values
          when the origin is a leaf; otherwise a ChildNodeTensor that carries
          the join's second output as its third constructor argument.
        """
        origin_node, parent_node = sources

        # Recalculating a RootNodeTensor is never expected here.
        assert not isinstance(origin_node,
                              prensor.RootNodeTensor), origin_node

        # A leaf cannot own a submessage and a root cannot be broadcast into,
        # so the parent has to be a ChildNodeTensor.
        assert isinstance(parent_node, prensor.ChildNodeTensor), parent_node

        # The parent's `index_to_value` records which child nodes were
        # duplicated, hence which origin entries must be duplicated as well.
        new_parent_index, gather_indices = (
            struct2tensor_ops.equi_join_any_indices(
                parent_node.index_to_value, origin_node.parent_index))

        if not isinstance(origin_node, prensor.LeafNodeTensor):
            return prensor.ChildNodeTensor(new_parent_index,
                                           self.is_repeated, gather_indices)

        gathered_values = tf.gather(origin_node.values, gather_indices)
        return prensor.LeafNodeTensor(new_parent_index, gathered_values,
                                      self.is_repeated)
Exemple #3
0
 def calculate(
         self,
         sources: Sequence[prensor.NodeTensor],
         destinations: Sequence[expression.Expression],
         options: calculate_options.Options,
         side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
     """Re-parents a child node by looking through its parent's parent index.

     Args:
       sources: Exactly two node tensors: the origin node followed by the
         origin's parent node. Both must be ChildNodeTensor instances.
       destinations: Not read by this method.
       options: Not read by this method.
       side_info: Not read by this method.

     Returns:
       A ChildNodeTensor whose parent index is the parent's `parent_index`
       gathered at the origin's `parent_index`.

     Raises:
       ValueError: If either source is not a ChildNodeTensor.
     """
     child_node, parent_node = sources
     if not isinstance(child_node, prensor.ChildNodeTensor):
         raise ValueError("origin_value must be a child")
     if not isinstance(parent_node, prensor.ChildNodeTensor):
         raise ValueError("origin_parent_value must be a child node")
     return prensor.ChildNodeTensor(
         tf.gather(parent_node.parent_index, child_node.parent_index),
         self.is_repeated)
Exemple #4
0
    def calculate(
            self,
            sources: Sequence[prensor.NodeTensor],
            destinations: Sequence[expression.Expression],
            options: calculate_options.Options,
            side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
        """Joins the origin node against a sibling node on their parent index.

        Args:
          sources: Exactly two node tensors: the origin node followed by the
            sibling node. Both must be ChildNodeTensor instances.
          destinations: Not read by this method.
          options: Not read by this method.
          side_info: Not read by this method.

        Returns:
          A ChildNodeTensor built from the two outputs of
          `equi_join_any_indices`: the first becomes the parent index and the
          second is passed as `index_to_value`.

        Raises:
          ValueError: If either source is not a ChildNodeTensor.
        """
        origin_node, sibling_node = sources
        if not isinstance(origin_node, prensor.ChildNodeTensor):
            raise ValueError("origin not a ChildNodeTensor")
        if not isinstance(sibling_node, prensor.ChildNodeTensor):
            raise ValueError("sibling value is not a ChildNodeTensor")

        sibling_aligned_index, origin_positions = (
            struct2tensor_ops.equi_join_any_indices(
                sibling_node.parent_index, origin_node.parent_index))
        return prensor.ChildNodeTensor(
            sibling_aligned_index,
            self.is_repeated,
            index_to_value=origin_positions)
Exemple #5
0
    def testPromoteAndProjectExpression(self):
        """Checks promote + project results computed from a parquet file.

        Promotes Name.Language.Code to "new_code", projects that path and the
        DocId path separately, and compares every yielded pair of prensors
        against the expected values.
        """
        parquet_files = [
            "struct2tensor/testdata/parquet_testdata/dremel_example.parquet"
        ]
        rows_per_batch = 2
        root_expr = parquet.create_expression_from_parquet_file(parquet_files)
        promoted_expr = promote.promote(
            root_expr, path.Path(["Name", "Language", "Code"]), "new_code")
        projected_new_code = project.project(
            promoted_expr, [path.Path(["Name", "new_code"])])
        projected_docid = project.project(root_expr, [path.Path(["DocId"])])

        results = parquet.calculate_parquet_values(
            [projected_new_code, projected_docid], root_expr, parquet_files,
            rows_per_batch)

        expected_new_code_nodes = [
            (path.Path([]),
             prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64))),
            (path.Path(["Name"]),
             prensor.ChildNodeTensor(
                 tf.constant([0, 0, 0, 1], dtype=tf.int64), True)),
            (path.Path(["Name", "new_code"]),
             prensor.LeafNodeTensor(
                 tf.constant([0, 0, 2], dtype=tf.int64),
                 tf.constant([b"en-us", b"en", b"en-gb"]), True)),
        ]
        expected_new_code = prensor.create_prensor_from_descendant_nodes(
            dict(expected_new_code_nodes))

        expected_docid_nodes = [
            (path.Path([]),
             prensor.RootNodeTensor(tf.constant(2, dtype=tf.int64))),
            (path.Path(["DocId"]),
             prensor.LeafNodeTensor(
                 tf.constant([0, 1], dtype=tf.int64),
                 tf.constant([10, 20], dtype=tf.int64), False)),
        ]
        expected_docid = prensor.create_prensor_from_descendant_nodes(
            dict(expected_docid_nodes))

        for batch in results:
            # Position 0 holds the new_code projection, position 1 the DocId
            # projection, matching the order passed to
            # calculate_parquet_values.
            self._assertPrensorEqual(batch[0], expected_new_code)
            self._assertPrensorEqual(batch[1], expected_docid)
Exemple #6
0
def create_child_node(parent_index: Sequence[int],
                      is_repeated: bool) -> prensor.ChildNodeTensor:
  """Builds a ChildNodeTensor from a sequence of parent indices.

  Args:
    parent_index: Parent indices; converted to an int64 constant tensor.
    is_repeated: Forwarded unchanged to the ChildNodeTensor constructor.

  Returns:
    The newly constructed ChildNodeTensor.
  """
  parent_index_tensor = tf.constant(parent_index, dtype=tf.int64)
  return prensor.ChildNodeTensor(parent_index_tensor, is_repeated)
Exemple #7
0
def _row_partition_to_child_node_tensor(row_partition: RowPartition):
  """Turns a RowPartition into a repeated ChildNodeTensor.

  The partition is first converted with `with_row_splits_dtype(tf.int64)`;
  its `value_rowids()` then serve as the node's parent index.
  """
  int64_partition = row_partition.with_row_splits_dtype(tf.int64)
  parent_index = int64_partition.value_rowids()
  return prensor.ChildNodeTensor(parent_index, is_repeated=True)
def create_child_node(parent_index, is_repeated):
    """Builds a ChildNodeTensor from a sequence of parent indices.

    `parent_index` is converted to an int64 constant tensor and
    `is_repeated` is forwarded unchanged to the ChildNodeTensor constructor.
    """
    parent_index_tensor = tf.constant(parent_index, dtype=tf.int64)
    return prensor.ChildNodeTensor(parent_index_tensor, is_repeated)
def _row_partition_to_child_node_tensor(row_partition: RowPartition):
    """Turns a RowPartition into a repeated ChildNodeTensor.

    The partition's `value_rowids()` are cast to int64 and used as the
    node's parent index.
    """
    row_ids = row_partition.value_rowids()
    parent_index = tf.cast(row_ids, tf.int64)
    return prensor.ChildNodeTensor(parent_index, is_repeated=True)