def test_dash_in_name(self):
    """Paths whose steps contain a dash survive a parse/str round trip."""
    # One plain step, then a dash on either side of the dot inside a
    # parenthesised step.
    round_trip_inputs = (
        "foo-bar.baz",
        "(foo-bar.bak).baz",
        "(foo.bar-bak).baz",
    )
    for text in round_trip_inputs:
        self.assertEqual(str(create_path(text)), text)
def promote_and_broadcast(
        self,
        path_dictionary: Mapping[path.Step, CoercableToPath],
        dest_path_parent: CoercableToPath) -> "Expression":
    """Delegates to promote_and_broadcast.promote_and_broadcast.

    Each value of `path_dictionary`, and `dest_path_parent` itself, is first
    passed through path.create_path; the dictionary keys are forwarded
    unchanged.

    Args:
        path_dictionary: mapping from a step to a path-like value.
        dest_path_parent: path-like value forwarded as the destination parent.

    Returns:
        The result of promote_and_broadcast.promote_and_broadcast.
    """
    coerced_sources = {
        field: path.create_path(source)
        for field, source in path_dictionary.items()
    }
    destination = path.create_path(dest_path_parent)
    return promote_and_broadcast.promote_and_broadcast(
        self, coerced_sources, destination)
def test_is_ancestor(self):
    """Exercises is_ancestor for nested, identical and unrelated paths."""
    parent = create_path("foo.bar")
    child = create_path("foo.bar.baz")
    unrelated = create_path("fuzz")
    expectations = (
        (parent, child, True),
        (parent, parent, True),  # a path is expected to be its own ancestor
        (unrelated, child, False),
        (child, parent, False),
    )
    for candidate, target, expected in expectations:
        self.assertEqual(candidate.is_ancestor(target), expected)
def test_add(self):
    """Checks `+` with a path operand and with a string operand."""
    # Path + Path.
    path_plus_path = create_path("foo.bar") + create_path("baz.bax")
    self.assertEqual(path_plus_path, create_path("foo.bar.baz.bax"))

    # Path + str.
    path_plus_str = create_path("foo.bar") + "baz.bax"
    self.assertEqual(path_plus_str, create_path("foo.bar.baz.bax"))
def test_cmp(self):
    """Checks ordering and equality between Path objects."""
    # Operands built directly through the Path constructor.
    int_one = Path([1])
    int_zero = Path([0])
    raw_foo = Path("foo")
    self.assertGreater(int_one, raw_foo)
    self.assertLess(raw_foo, int_one)
    self.assertGreater(int_one, int_zero)

    # Operands parsed from dotted strings.
    foo = create_path("foo")
    foo_bar = create_path("foo.bar")
    foo_baz = create_path("foo.baz")
    self.assertGreater(foo_baz, foo)
    self.assertGreater(foo_baz, foo_bar)
    self.assertLess(foo, foo_bar)
    self.assertLess(foo_bar, foo_baz)
    # Equality is checked against a separately constructed instance.
    self.assertEqual(foo_baz, create_path("foo.baz"))
def test_valid_map_indexing_step(self):
    """Checks field_list for paths whose first step uses `[...]` indexing."""
    # (path text, expected field_list) pairs.
    cases = (
        ("my_map[some_key].some_value", ["my_map[some_key]", "some_value"]),
        ("my_map[].some_value", ["my_map[]", "some_value"]),
        ("my_map[key.1].some_value", ["my_map[key.1]", "some_value"]),
        ("my_map[(key)].some_value", ["my_map[(key)]", "some_value"]),
        ("my_map[[].some_value", ["my_map[[]", "some_value"]),
    )
    for text, expected_steps in cases:
        self.assertSequenceEqual(expected_steps, create_path(text).field_list)
def _create_prensor_spec(self) -> prensor._PrensorTypeSpec:  # pylint: disable=protected-access
    """Builds the root prensor type spec from the configured value paths.

    The schema is read from the first entry of self._filenames. As a side
    effect, self._value_paths and self._value_dtypes are re-bound to tuples
    ordered by the number of steps in each path.

    Returns:
        a root _PrensorTypeSpec.
    """
    file_metadata = pq.ParquetFile(self._filenames[0]).metadata
    arrow_schema = file_metadata.schema.to_arrow_schema()  # pylint: disable=protected-access

    # Order (parsed path, raw path, dtype) triples by path depth, then split
    # them back into three parallel tuples.
    parsed_paths = [path.create_path(raw) for raw in self._value_paths]
    by_depth = sorted(
        zip(parsed_paths, self._value_paths, self._value_dtypes),
        key=lambda triple: len(triple[0].field_list))
    parsed_paths, self._value_paths, self._value_dtypes = zip(*by_depth)

    # Group the remaining steps of every path under its first step, keeping
    # first-seen order, so that paths sharing a parent end up together.
    children_by_first_step = collections.OrderedDict()
    for index, parsed in enumerate(parsed_paths):
        head, tail = parsed.field_list[0], parsed.field_list[1:]
        children_by_first_step.setdefault(head, []).append((index, tail))

    child_specs = [
        self._create_children_spec(arrow_schema.field(head), entries)
        for head, entries in children_by_first_step.items()
    ]
    return prensor._PrensorTypeSpec(
        None, prensor._PrensorTypeSpec._NodeType.ROOT, tf.int64, child_specs)
def test_get_sparse_tensor(self):
    """Checks prensor._get_sparse_tensor for "foo" on the simple prensor."""
    simple_prensor = prensor_test_util.create_simple_prensor()
    foo_path = path.create_path("foo")
    result = prensor._get_sparse_tensor(simple_prensor, foo_path)
    self.assertAllEqual(result.indices, [[0], [1], [2]])
    self.assertAllEqual(result.dense_shape, [3])
    self.assertAllEqual(result.values, [9, 8, 7])
def test_prensor_to_ragged_tensor(self):
    """Checks Prensor.get_ragged_tensor on "doc.bar" for every option set."""
    for options in _OPTIONS_TO_TEST:
        # A fresh prensor is built for each option set.
        nested = prensor_test_util.create_nested_prensor()
        doc_bar = path.create_path("doc.bar")
        actual = nested.get_ragged_tensor(doc_bar, options)
        self.assertAllEqual(actual, [[[b"a"]], [[b"b", b"c"], [b"d"]], []])
def test_filter_by_child_create_nested_prensor_2(self):
    """Tests filter_by_child.

    In particular, it checks for the case where parent_index != self index.
    """
    source = create_expression.create_expression_from_prensor(
        _create_nested_prensor_2())
    filtered = filter_expression.filter_by_child(
        source, path.create_path("doc"), "keep_me", "new_doc")
    [result] = calculate.calculate_prensors([filtered])

    def node_at(*steps):
        # Looks up the node stored at the path made of `steps`.
        return result.get_descendant_or_error(path.Path(list(steps))).node

    self.assertAllEqual(node_at("new_doc").parent_index, [1])
    self.assertAllEqual(node_at("new_doc", "keep_me").parent_index, [0])
    self.assertAllEqual(node_at("new_doc", "keep_me").values, [True])
    self.assertAllEqual(node_at("new_doc", "bar").parent_index, [0, 0])
    self.assertAllEqual(node_at("new_doc", "bar").values, [b"b", b"c"])
def map_ragged_tensors(self, parent_path, source_fields, operator,
                       is_repeated, dtype, new_field_name):
    """Maps a set of primitive fields of a message to a new field.

    In contrast to map_field_values, the result may be reshaped to some
    degree: two optional fields can be combined into a repeated field, or a
    reduce_sum over the last dimension of a repeated field can yield an
    optional field. The operator has to return a sparse tensor of the right
    dimension (2D when is_repeated is true, 1D when it is false) whose first
    dimension equals the first dimension of the input tensor.

    NOTE(review): the wording above speaks of sparse tensors, yet the call
    below goes to map_prensor.map_ragged_tensor -- confirm which tensor kind
    the operator actually receives and must return.

    Args:
        parent_path: the parent of the input and output fields.
        source_fields: the nonempty list of names of the source fields.
        operator: an operator that takes len(source_fields) tensors and
            returns a tensor of the appropriate shape.
        is_repeated: whether the output is repeated.
        dtype: the dtype of the result.
        new_field_name: the name of the resulting field.

    Returns:
        A new query.
    """
    parent = path.create_path(parent_path)
    # Every source field name becomes its own single-step path.
    source_paths = [path.Path([field_name]) for field_name in source_fields]
    return map_prensor.map_ragged_tensor(
        self, parent, source_paths, operator, is_repeated, dtype,
        new_field_name)
def testDeepStructuredTensor(self):
    """Converts a re-partitioned structured tensor and checks three nodes."""
    ragged = tf.RaggedTensor.from_value_rowids(
        tf.constant([[1, 2], [3, 4], [5, 6]]), [0, 0, 1])
    flat_struct = _make_structured_tensor([2], {"r": ragged})
    # Adds an outer dimension on top of the two existing rows.
    outer_partition = RowPartition.from_row_splits([0, 1, 2])
    deep_struct = flat_struct.partition_outer_dimension(outer_partition)
    converted = structured_tensor_to_prensor.structured_tensor_to_prensor(
        deep_struct)

    def node_at(text):
        # Node of the descendant addressed by the dotted path `text`.
        return converted.get_descendant(path.create_path(text)).node

    leaf = node_at("data.r.data")
    self.assertAllEqual(leaf.parent_index, [0, 0, 1, 1, 2, 2])
    self.assertAllEqual(leaf.values, [1, 2, 3, 4, 5, 6])
    self.assertAllEqual(node_at("data").parent_index, [0, 1])
    self.assertAllEqual(node_at("data.r").parent_index, [0, 0, 1])
def test_filter_by_child(self):
    """Tests filter_by_child."""
    source = create_expression.create_expression_from_prensor(
        prensor_test_util.create_big_prensor())
    filtered = filter_expression.filter_by_child(
        source, path.create_path("doc"), "keep_me", "new_doc")
    [result] = calculate.calculate_prensors([filtered])

    def node_at(*steps):
        # Looks up the node stored at the path made of `steps`.
        return result.get_descendant_or_error(path.Path(list(steps))).node

    self.assertAllEqual(node_at("new_doc").parent_index, [1])
    self.assertAllEqual(node_at("new_doc", "keep_me").parent_index, [0])
    self.assertAllEqual(node_at("new_doc", "keep_me").values, [True])
    self.assertAllEqual(node_at("new_doc", "bar").parent_index, [0, 0])
    self.assertAllEqual(node_at("new_doc", "bar").values, [b"b", b"c"])
def test_filter_by_child_create_nested_prensor(self):
    """Tests filter_by_child."""
    with self.session(use_gpu=False) as sess:
        source = create_expression.create_expression_from_prensor(
            _create_nested_prensor())
        filtered = filter_expression.filter_by_child(
            source, path.create_path("doc"), "keep_me", "new_doc")
        # Unlike the sibling tests, the prensor is materialized in the
        # session before it is inspected.
        calculated = calculate.calculate_prensors([filtered])[0]
        result = prensor_value.materialize(calculated, sess)

        def node_at(*steps):
            # Looks up the node stored at the path made of `steps`.
            return result.get_descendant_or_error(
                path.Path(list(steps))).node

        self.assertAllEqual(node_at("new_doc").parent_index, [1])
        self.assertAllEqual(node_at("new_doc", "keep_me").parent_index, [0])
        self.assertAllEqual(node_at("new_doc", "keep_me").values, [True])
        self.assertAllEqual(node_at("new_doc", "bar").parent_index, [0, 0])
        self.assertAllEqual(node_at("new_doc", "bar").values, [b"b", b"c"])
def test_prensor_to_sparse_tensor(self):
    """Checks Prensor.get_sparse_tensor on "foo" for every option set."""
    for options in _OPTIONS_TO_TEST:
        # A fresh prensor is built for each option set.
        simple = prensor_test_util.create_simple_prensor()
        foo_path = path.create_path("foo")
        result = simple.get_sparse_tensor(foo_path, options=options)
        self.assertAllEqual(result.indices, [[0], [1], [2]])
        self.assertAllEqual(result.dense_shape, [3])
        self.assertAllEqual(result.values, [9, 8, 7])
def test_get_ragged_tensor(self):
    """Tests get_ragged_tensor on a deep field."""
    for options in _OPTIONS_TO_TEST:
        # A fresh prensor is built for each option set.
        nested = prensor_test_util.create_nested_prensor()
        doc_bar = path.create_path("doc.bar")
        actual = prensor._get_ragged_tensor(nested, doc_bar, options)
        self.assertAllEqual(actual, [[[b"a"]], [[b"b", b"c"], [b"d"]], []])
def test_prefix(self):
    """Checks Path.prefix for non-negative and negative lengths."""
    original = create_path("foo.bar.baz")
    # (argument to prefix, expected string form of the result)
    cases = (
        (0, ""),
        (1, "foo"),
        (2, "foo.bar"),
        (3, "foo.bar.baz"),
        (-1, "foo.bar"),
        (-2, "foo"),
    )
    for length, expected in cases:
        self.assertEqual(str(original.prefix(length)), expected)
def testStructuredTensorCreation(self):
    """Converts a structured tensor with one ragged field and checks it."""
    ragged = tf.RaggedTensor.from_value_rowids(
        tf.constant([[1, 2], [3, 4], [5, 6]]), [0, 0, 1])
    struct = _make_structured_tensor([2], {"r": ragged})
    converted = structured_tensor_to_prensor.structured_tensor_to_prensor(
        struct)
    leaf = converted.get_descendant(path.create_path("r.data")).node
    self.assertAllEqual(leaf.parent_index, [0, 0, 1, 1, 2, 2])
    self.assertAllEqual(leaf.values, [1, 2, 3, 4, 5, 6])
def test_filter_by_sibling(self):
    """Tests filter_by_sibling.

    Input structure (outline reconstructed from a flattened diagram --
    confirm against _create_nested_prensor):

      root0
        keep_my_sib: False
        doc0: bar "a", keep_me False
      root1
        keep_my_sib: True, False
        doc1: bar "b", bar "c", keep_me True
        doc2: bar "d"
      root2 (empty)

    Filtering doc with keep_my_sib should give (original doc suppressed):

      root0
        keep_my_sib: False
      root1
        keep_my_sib: True, False
        new_doc0: bar "b", bar "c", keep_me True
      root2 (empty)
    """
    source = create_expression.create_expression_from_prensor(
        _create_nested_prensor())
    filtered = filter_expression.filter_by_sibling(
        source, path.create_path("doc"), "keep_my_sib", "new_doc")
    [result] = calculate.calculate_prensors([filtered])

    def node_at(*steps):
        # Looks up the node stored at the path made of `steps`.
        return result.get_descendant_or_error(path.Path(list(steps))).node

    self.assertAllEqual(node_at("new_doc").parent_index, [1])
    self.assertAllEqual(node_at("new_doc", "keep_me").parent_index, [0])
    self.assertAllEqual(node_at("new_doc", "keep_me").values, [True])
    self.assertAllEqual(node_at("new_doc", "bar").parent_index, [0, 0])
    self.assertAllEqual(node_at("new_doc", "bar").values, [b"b", b"c"])
def map_field_values(self, source_path, operator, dtype, new_field_name):
    """Maps a primitive field to a new primitive field.

    Note: the dtype argument is added since the v1 API.

    Args:
        source_path: the origin path; coerced with path.create_path.
        operator: an element-wise operator that takes a 1-dimensional vector.
        dtype: the type of the output.
        new_field_name: the name of a new sibling of source_path.

    Returns:
        the resulting root expression.
    """
    origin = path.create_path(source_path)
    return map_values.map_values(
        self, origin, operator, dtype, new_field_name)
def slice(self, source_path, new_field_name, begin=None, end=None):
    """Creates a sliced copy of source_path named new_field_name.

    A negative begin or end is interpreted relative to the size of the
    array; for example slice(..., begin=-1) selects the last element of
    every array.

    Args:
        source_path: the source of the slice; coerced with path.create_path.
        new_field_name: the new field that is generated.
        begin: the beginning of the slice (inclusive).
        end: the end of the slice (exclusive).

    Returns:
        An Expression object representing the result of the operation.
    """
    origin = path.create_path(source_path)
    return slice_expression.slice_expression(
        self, origin, new_field_name, begin, end)
def testField(self, shape, fields, path_to_check=None, parent_indices=None,
              root_size=None, values=None):
    """Converts a structured tensor and checks the requested properties.

    Every expectation left as None is skipped.

    Args:
        shape: forwarded to _make_structured_tensor.
        fields: forwarded to _make_structured_tensor.
        path_to_check: path of the descendant to inspect, if any.
        parent_indices: expected parent_index of that descendant's node.
        root_size: expected size of the root node.
        values: expected values of that descendant's node.
    """
    struct = _make_structured_tensor(shape, fields)
    converted = structured_tensor_to_prensor.structured_tensor_to_prensor(
        struct)
    if root_size is not None:
        self.assertAllEqual(root_size, converted.node.size)

    # The remaining checks all need a descendant to look at.
    if path_to_check is None:
        return
    target = converted.get_descendant(path.create_path(path_to_check))
    self.assertIsNotNone(target)
    target_node = target.node
    if parent_indices is not None:
        self.assertAllEqual(target_node.parent_index, parent_indices)
    if values is not None:
        self.assertAllEqual(target_node.values, values)
def create_has_field(self, source_path, new_field_name):
    """Creates a field holding the presence of the source path.

    Delegates to size.has after coercing source_path with path.create_path.
    """
    origin = path.create_path(source_path)
    return size.has(self, origin, new_field_name)
def reroot(self, new_root):
    """Returns a new list of protocol buffers available at new_root.

    Delegates to reroot.reroot after coercing new_root with path.create_path.
    """
    root_path = path.create_path(new_root)
    return reroot.reroot(self, root_path)
def create_size_field(self, source_path, new_field_name):
    """Creates a field holding the size of the source path.

    Delegates to size.size after coercing source_path with path.create_path.
    """
    origin = path.create_path(source_path)
    return size.size(self, origin, new_field_name)
def promote_and_broadcast(self, path_dictionary, dest_path_parent):
    """Delegates to promote_and_broadcast.promote_and_broadcast.

    Each value of path_dictionary, and dest_path_parent itself, is coerced
    with path.create_path first; the dictionary keys are forwarded unchanged.
    """
    coerced_sources = {
        field: path.create_path(source)
        for field, source in path_dictionary.items()
    }
    destination = path.create_path(dest_path_parent)
    return promote_and_broadcast.promote_and_broadcast(
        self, coerced_sources, destination)
def project(self, path_list):
    """Constrains the paths to those listed.

    Every entry of path_list is coerced with path.create_path before the
    call is handed to project.project.
    """
    wanted_paths = [path.create_path(entry) for entry in path_list]
    return project.project(self, wanted_paths)
def broadcast(self, source_path, sibling_field, new_field_name):
    """Broadcasts the existing field at source_path to the sibling_field.

    Delegates to broadcast_module.broadcast after coercing source_path with
    path.create_path.
    """
    origin = path.create_path(source_path)
    return broadcast_module.broadcast(
        self, origin, sibling_field, new_field_name)
def promote(self, source_path, new_field_name):
    """Promotes source_path to be a field new_field_name in its grandparent.

    Delegates to promote.promote after coercing source_path with
    path.create_path.
    """
    origin = path.create_path(source_path)
    return promote.promote(self, origin, new_field_name)
def create_transformed_field(
        expr: expression.Expression,
        source_path: path.CoercableToPath,
        dest_field: StrStep,
        transform_fn: TransformFn) -> expression.Expression:
    """Creates an expression that transforms serialized proto tensors.

    The transform_fn argument should take the form:

        def transform_fn(parent_indices, values):
            ...
            return (transformed_parent_indices, transformed_values)

    Given:
    - parent_indices: an int64 vector of non-decreasing parent message
      indices.
    - values: a string vector of serialized protos having the same shape as
      `parent_indices`.

    `transform_fn` must return new parent indices and serialized values
    encoding the same proto message as the passed in `values`. The two
    returned vectors must have the same size as each other, but that size
    need not match the input arguments.

    Args:
        expr: a source expression containing `source_path`.
        source_path: the path to the proto message field to transform.
        dest_field: the name of the newly created field. This field will be
            a sibling of the field identified by `source_path`.
        transform_fn: a callable that accepts parent_indices and serialized
            proto values and returns possibly modified parent_indices and
            values.

    Returns:
        An expression.

    Raises:
        ValueError: if the expression found at `source_path` is not a
            _ProtoChildExpression.
    """
    source_path = path.create_path(source_path)
    source_expr = expr.get_descendant_or_error(source_path)
    if not isinstance(source_expr, _ProtoChildExpression):
        message = "Expected _ProtoChildExpression for field {}, but found {}."
        raise ValueError(message.format(str(source_path), source_expr))

    if not isinstance(source_expr, _TransformProtoChildExpression):
        final_transform = transform_fn
    else:
        # To propagate the fields needed for parsing, the source of a
        # _TransformProtoChildExpression must always be the original,
        # untransformed _ProtoChildExpression. Two transforms applied in
        # sequence would therefore share a source and each act on it
        # directly, instead of the second acting on the output of the first.
        # The user-supplied function is wrapped so that the source's own
        # transform runs first. The cost is that this first transform may be
        # applied redundantly when other expressions derive from it too.
        def final_transform(parent_indices: tf.Tensor,
                            values: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
            prior_indices, prior_values = source_expr.transform_fn(
                parent_indices, values)
            return transform_fn(prior_indices, prior_values)

    transformed_expr = _TransformProtoChildExpression(
        parent=source_expr._parent,  # pylint: disable=protected-access
        desc=source_expr._desc,  # pylint: disable=protected-access
        is_repeated=source_expr.is_repeated,
        name_as_field=source_expr.name_as_field,
        transform_fn=final_transform)
    # The new field is attached next to the source, under the same parent.
    sibling_path = source_path.get_parent().get_child(dest_field)
    return expression_add.add_paths(expr, {sibling_path: transformed_expr})