Example #1
0
 def test_text_zero_chunks_valid(self):
     """A string column built from zero record batches passes validation."""
     schema = pyarrow.schema([("A", pyarrow.string())])
     # No batches at all: the table has a schema but no data.
     empty_table = pyarrow.Table.from_batches([], schema)
     metadata = TableMetadata(0, [Text("A")])
     validate(empty_table, metadata)
Example #2
0
 def test_duplicate_column_name(self):
     """Two columns that are both named "A" raise DuplicateColumnName."""
     with self.assertRaises(DuplicateColumnName):
         columns = [pyarrow.array(["a"]), pyarrow.array(["b"])]
         table = pyarrow.Table.from_arrays(columns, ["A", "A"])
         metadata = TableMetadata(1, [Text("A"), Text("A")])
         validate(table, metadata)
Example #3
0
 def test_column_name_mismatch(self):
     """Metadata that names a column "B2" while the table names it "B"
     raises WrongColumnName.
     """
     with self.assertRaises(WrongColumnName):
         table = pyarrow.table({"A": ["a"], "B": ["b"]})
         metadata = TableMetadata(1, [Text("A"), Text("B2")])
         validate(table, metadata)
Example #4
0
    def test_column_name_not_utf8(self):
        """A column name that is not valid UTF-8 raises
        ColumnNameIsInvalidUtf8.
        """
        # A lone surrogate code point followed by "x"; strict UTF-8 cannot
        # represent a lone surrogate.
        bad_str = "\ud800x"
        # The bytes bad_str produces under the "surrogatepass" error handler.
        bad_bytes = b"\xed\xa0\x80x"

        with self.assertRaises(ColumnNameIsInvalidUtf8):
            column = pyarrow.array(["a"])
            table = pyarrow.Table.from_arrays([column], [bad_bytes])
            metadata = TableMetadata(1, [Text(bad_str)])
            validate(table, metadata)
Example #5
0
 def test_dictionary_column_has_unused_entry(self):
     """A sliced dictionary column raises DictionaryColumnHasUnusedEntry."""
     with self.assertRaises(DictionaryColumnHasUnusedEntry):
         encoded = pyarrow.array(["x", None, "y", "y",
                                  "z"]).dictionary_encode()
         # Keep only the first four rows, dropping the single "z" row.
         # NOTE(review): presumably the slice still carries the dictionary
         # built from all five values -- confirm against pyarrow.
         table = pyarrow.table({"A": encoded[0:4]})
         metadata = TableMetadata(4, [Text("A")])
         validate(table, metadata)
Example #6
0
 def test_text_dictionary_zero_chunks_is_valid(self):
     """A dictionary-of-strings column built from zero record batches
     passes validation.
     """
     dictionary_type = pyarrow.dictionary(pyarrow.int32(), pyarrow.string())
     schema = pyarrow.schema([("A", dictionary_type)])
     # No batches at all: the table has a schema but no data.
     empty_table = pyarrow.Table.from_batches([], schema)
     metadata = TableMetadata(0, [Text("A")])
     validate(empty_table, metadata)
Example #7
0
 def test_table_not_one_batch(self):
     """A table assembled from two record batches raises
     TableHasTooManyRecordBatches.
     """
     with self.assertRaises(TableHasTooManyRecordBatches):
         # One single-row batch per value, each with a lone column "A".
         batches = [
             pyarrow.RecordBatch.from_arrays([pyarrow.array([value])], ["A"])
             for value in ("a", "b")
         ]
         table = pyarrow.Table.from_batches(batches)
         metadata = TableMetadata(2, [Text("A")])
         validate(table, metadata)
Example #8
0
 def test_column_datetime_must_be_ns_resolution(self):
     """A microsecond-resolution timestamp column raises
     DatetimeUnitNotAllowed.
     """
     # [2019-09-17] Pandas only supports datetime64[ns]
     # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
     with self.assertRaises(DatetimeUnitNotAllowed):
         microsecond_column = pyarrow.array(
             [5298375234], type=pyarrow.timestamp("us", tz=None))
         table = pyarrow.table({"A": microsecond_column})
         metadata = TableMetadata(1, [Datetime("A")])
         validate(table, metadata)
Example #9
0
 def test_column_datetime_should_be_tz_naive(self):
     """A timestamp column that carries a timezone raises
     DatetimeTimezoneNotAllowed.
     """
     with self.assertRaises(DatetimeTimezoneNotAllowed):
         # Nanosecond unit, but with a timezone attached.
         tz_aware_column = pyarrow.array(
             [5298375234123],
             type=pyarrow.timestamp("ns", "America/New_York"),
         )
         table = pyarrow.table({"A": tz_aware_column})
         metadata = TableMetadata(1, [Datetime("A")])
         validate(table, metadata)
Example #10
0
 def test_text_invalid_utf8(self):
     """String values that are individually invalid UTF-8 raise
     ColumnDataIsInvalidUtf8, even when a NULL precedes them.
     """
     # Let's construct a particularly tricky case: two strings that are
     # invalid on their own but are valid when concatenated. (In the buffer
     # they're concatenated, so the buffer bytes are valid utf-8 even though
     # the values aren't.)
     #
     # We'll also throw in a NULL, so we don't get tempted to ignore them
     # when we optimize this validation.
     poop_bytes = "💩".encode("utf-8")
     binary_array = pyarrow.array([None, poop_bytes[:2], poop_bytes[2:]])
     validity, offsets, data = binary_array.buffers()
     with self.assertRaises(ColumnDataIsInvalidUtf8):
         validate(
             pyarrow.table({
                 # Pass the validity bitmap along: without it the first
                 # value is rebuilt as an empty string, not as a NULL.
                 "A":
                 pyarrow.StringArray.from_buffers(3, offsets, data,
                                                  validity)
             }),
             TableMetadata(3, [Text("A")]),
         )
Example #11
0
    def render(
        self,
        compiled_module: CompiledModule,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """Invoke the module's "render_thrift" function and validate its output.

        The arguments are converted to their Thrift forms and bundled into
        a ``ttypes.RenderRequest``, which is handed to ``self._run_in_child``
        inside a ``_chroot_dir_context``.

        Raises:
            ModuleExitedError: if the result names an output file other than
                ``output_filename``.

        Whatever ``validate`` raises propagates when the returned table does
        not agree with its metadata.
        """
        basedir_str = str(basedir)
        thrift_input_table = input_table.to_thrift()
        thrift_params = params.to_thrift()
        thrift_tab = tab.to_thrift()
        if fetch_result is None:
            thrift_fetch_result = None
        else:
            thrift_fetch_result = fetch_result.to_thrift()
        request = ttypes.RenderRequest(
            basedir_str,
            thrift_input_table,
            thrift_params,
            thrift_tab,
            thrift_fetch_result,
            output_filename,
        )

        output_path = basedir / output_filename
        with _chroot_dir_context(
                provide_paths=[basedir],
                extract_paths=[output_path],
        ) as chroot:
            # TODO nix networking
            allowed_paths = ([basedir] + DATA_PATHS + PARQUET_PATHS +
                             NETWORKING_PATHS)
            thrift_result = self._run_in_child(
                chroot=chroot,
                chroot_paths=allowed_paths,
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
            # An empty filename is accepted; any other name must be the one
            # we asked for.
            written_filename = thrift_result.table.filename
            if written_filename and written_filename != output_filename:
                raise ModuleExitedError(0, "Module wrote to wrong output file")

        # RenderResult.from_thrift() verifies all filenames passed by the
        # module are in the directory the module has access to.
        render_result = RenderResult.from_thrift(thrift_result, basedir)
        output_table = render_result.table
        if output_table.table is not None:
            validate(output_table.table, output_table.metadata)
        return render_result
Example #12
0
 def test_text_invalid_utf8_dictionary(self):
     """Dictionary entries that are individually invalid UTF-8 raise
     ColumnDataIsInvalidUtf8.
     """
     # The tricky case: each dictionary entry is invalid by itself, yet
     # the two are valid once concatenated. They sit back to back in the
     # data buffer, so the buffer's bytes are valid utf-8 even though the
     # values aren't.
     #
     # `pyarrow.array()` encodes text to UTF-8 itself, so it cannot be
     # handed bad UTF-8 directly. Writing the buffers by hand is how we
     # get invalid UTF-8 binary into a table.
     poop_bytes = "💩".encode("utf-8")
     first_half, second_half = poop_bytes[:2], poop_bytes[2:]
     binary_array = pyarrow.array([first_half, second_half])
     _validity, offsets, data = binary_array.buffers()
     with self.assertRaises(ColumnDataIsInvalidUtf8):
         indices = pyarrow.array([0, 1])
         dictionary = pyarrow.StringArray.from_buffers(2, offsets, data)
         column = pyarrow.DictionaryArray.from_arrays(indices, dictionary)
         table = pyarrow.table({"A": column})
         metadata = TableMetadata(2, [Text("A")])
         validate(table, metadata)
Example #13
0
 def test_table_none_when_should_be_set(self):
     """Passing no table while the metadata lists a column raises
     WrongColumnCount.
     """
     with self.assertRaises(WrongColumnCount):
         metadata = TableMetadata(2, [Text("A")])
         validate(None, metadata)
Example #14
0
 def test_column_str_should_be_datetime(self):
     """A string column declared as Datetime raises WrongColumnType."""
     with self.assertRaises(WrongColumnType):
         table = pyarrow.table({"A": ["x"]})
         metadata = TableMetadata(1, [Datetime("A")])
         validate(table, metadata)
Example #15
0
 def test_column_int_should_be_text(self):
     """An integer column declared as Text raises WrongColumnType."""
     with self.assertRaises(WrongColumnType):
         table = pyarrow.table({"A": [1]})
         metadata = TableMetadata(1, [Text("A")])
         validate(table, metadata)
Example #16
0
 def test_empty_dictionary_is_valid(self):
     """A dictionary-encoded column sliced down to zero rows passes
     validation.
     """
     encoded = pyarrow.array(["x"]).dictionary_encode()
     # An empty slice: no rows remain.
     table = pyarrow.table({"A": encoded[0:0]})
     metadata = TableMetadata(0, [Text("A")])
     validate(table, metadata)
Example #17
0
 def test_happy_path_table_is_none(self):
     """Passing no table is accepted when the metadata lists no columns."""
     columnless_metadata = TableMetadata(2, [])
     validate(None, columnless_metadata)
Example #18
0
 def test_table_not_none_when_should_be_none(self):
     """Passing a table while the metadata lists no columns raises
     TableShouldBeNone.
     """
     with self.assertRaises(TableShouldBeNone):
         columnless_table = pyarrow.Table.from_arrays([])
         metadata = TableMetadata(2, [])
         validate(columnless_table, metadata)
Example #19
0
 def test_table_wrong_number_of_rows(self):
     """A one-row table checked against metadata built with 2 raises
     WrongRowCount.
     """
     with self.assertRaises(WrongRowCount):
         one_row_table = pyarrow.Table.from_pydict({"A": ["x"]})
         metadata = TableMetadata(2, [Text("A")])
         validate(one_row_table, metadata)