Ejemplo n.º 1
0
    def test_spark_io(self):
        # Round-trips a ks.DataFrame through to_spark_io / read_spark_io using the JSON
        # format, partitioned first by one column and then by two, and checks that what
        # is read back equals what was written.

        def sorted_pandas(kdf):
            # Sort by 'f' before converting so the comparison is independent of row order.
            return kdf.sort_values(by='f').to_spark().toPandas()

        with self.temp_dir() as out_dir:
            source = ks.DataFrame(self.test_pdf)

            # Case 1: a single partition column, given as a plain string.
            source.to_spark_io(out_dir, format='json', mode='overwrite', partition_cols='i32')
            # Reselect the columns in their original order; after a partitioned write the
            # partition columns can come back in a different position.
            loaded = ks.read_spark_io(out_dir, format='json')
            loaded = loaded[self.test_column_order]
            self.assert_eq(sorted_pandas(loaded), sorted_pandas(source))

            # Case 2: two partition columns, given as a list; the same directory is
            # overwritten. The path is passed by keyword on the read side this time.
            source.to_spark_io(
                out_dir, format='json', mode='overwrite', partition_cols=['i32', 'bhello'])
            loaded = ks.read_spark_io(path=out_dir, format='json')
            loaded = loaded[self.test_column_order]
            self.assert_eq(sorted_pandas(loaded), sorted_pandas(source))
Ejemplo n.º 2
0
    def test_spark_io(self):
        # Round-trips a ks.DataFrame through to_spark_io / read_spark_io using the JSON
        # format: partitioned by one column, partitioned by two columns, and finally read
        # back with an explicit index column.

        def sorted_pandas(kdf):
            # Sort by "f" before converting so the comparison is independent of row order.
            return kdf.sort_values(by="f").to_spark().toPandas()

        with self.temp_dir() as out_dir:
            source = ks.DataFrame(self.test_pdf)

            # Case 1: a single partition column, given as a plain string.
            source.to_spark_io(out_dir, format="json", mode="overwrite", partition_cols="i32")
            loaded = ks.read_spark_io(out_dir, format="json")
            # The freshly read frame must NOT already be in the original column order ...
            same_order = (loaded.columns == self.test_column_order).all()
            self.assertFalse(same_order)
            # ... so reselect the columns in that order before comparing contents.
            loaded = loaded[self.test_column_order]
            self.assert_eq(sorted_pandas(loaded), sorted_pandas(source))

            # Case 2: two partition columns, given as a list; the same directory is
            # overwritten. The path is passed by keyword on the read side this time.
            source.to_spark_io(
                out_dir, format="json", mode="overwrite", partition_cols=["i32", "bhello"]
            )
            loaded = ks.read_spark_io(path=out_dir, format="json")
            same_order = (loaded.columns == self.test_column_order).all()
            self.assertFalse(same_order)
            loaded = loaded[self.test_column_order]
            self.assert_eq(sorted_pandas(loaded), sorted_pandas(source))

            # Case 3: read the data written in case 2 with "bhello" as the index column and
            # compare against a fresh frame indexed the same way.
            source = ks.DataFrame(self.test_pdf)
            data_cols = ["f", "i32", "i64"]

            indexed_source = source.set_index("bhello")[data_cols]
            indexed_loaded = ks.read_spark_io(out_dir, format="json", index_col="bhello")
            indexed_loaded = indexed_loaded[data_cols]
            self.assert_eq(sorted_pandas(indexed_loaded), sorted_pandas(indexed_source))