예제 #1
0
 def test_coerce_infer_columns(self):
     """Coercing a bare DataFrame should yield NUMBER and TEXT columns for int and str data."""
     frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     coerced = ProcessResult.coerce(frame)
     actual_columns = coerced.columns
     expected_columns = [
         Column("A", ColumnType.NUMBER()),
         Column("B", ColumnType.TEXT()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #2
0
 def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
     """An explicit ``column_formats`` entry should win over the fallback column's format."""
     frame = pd.DataFrame({"A": [1, 2]})
     module_output = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
     # The fallback carries a different format for the same column; the
     # assertion below expects the explicit "{:,d}" to be the one kept.
     fallback = [Column("A", ColumnType.NUMBER("{:,.2f}"))]
     coerced = ProcessResult.coerce(module_output, try_fallback_columns=fallback)
     actual_columns = coerced.columns
     self.assertEqual(actual_columns, [Column("A", ColumnType.NUMBER("{:,d}"))])
예제 #3
0
 def test_arrow_uint8_column(self):
     """A uint8 Arrow column should convert to a ``np.uint8`` DataFrame column, format kept."""
     source = arrow_table(
         {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
         columns=[atypes.Column("A", ColumnType.Number("{:,d}"))],
     )
     dataframe, columns = arrow_table_to_dataframe(source)

     expected_frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
     assert_frame_equal(dataframe, expected_frame)

     expected_columns = [Column("A", ColumnType.Number("{:,d}"))]
     self.assertEqual(columns, expected_columns)
예제 #4
0
 def test_arrow_all_null_text_column(self):
     """An Arrow string column containing a null should convert to a text column with NaN.

     NOTE(review): despite the test name, the fixture holds three non-null
     strings and a single null -- confirm whether an all-null input was intended.
     """
     source = arrow_table(
         {"A": pyarrow.array(["a", "b", None, "c"])},
         columns=[atypes.Column("A", ColumnType.Text())],
     )
     dataframe, columns = arrow_table_to_dataframe(source)

     expected_frame = pd.DataFrame({"A": ["a", "b", np.nan, "c"]})
     assert_frame_equal(dataframe, expected_frame)
     self.assertEqual(columns, [Column("A", ColumnType.Text())])
예제 #5
0
 def test_coerce_infer_columns_with_format(self):
     """A ``column_formats`` entry should be applied to the inferred NUMBER column."""
     frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     module_output = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
     coerced = ProcessResult.coerce(module_output)
     actual_columns = coerced.columns
     # "B" has no format entry, so a plain TEXT column is expected for it.
     expected_columns = [
         Column("A", ColumnType.NUMBER(format="{:,d}")),
         Column("B", ColumnType.TEXT()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #6
0
 def test_dataframe_uint8_column(self):
     """A ``np.uint8`` DataFrame column should be written as a uint8 Arrow column."""
     frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
     frame_columns = [Column("A", ColumnType.Number("{:,d}"))]
     actual = dataframe_to_arrow_table(frame, frame_columns, self.path)

     expected = arrow_table(
         {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
         [atypes.Column("A", ColumnType.Number("{:,d}"))],
     )
     assert_arrow_table_equals(actual, expected)
예제 #7
0
 def test_coerce_infer_columns_try_fallback_columns(self):
     """Fallback columns whose types match the data should be used as given."""
     frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     fallback = [
         Column("A", ColumnType.Number("{:,d}")),
         Column("B", ColumnType.Text()),
     ]
     coerced = ProcessResult.coerce(frame, try_fallback_columns=fallback)
     actual_columns = coerced.columns
     # Built separately from ``fallback`` so the comparison is by value.
     expected_columns = [
         Column("A", ColumnType.Number("{:,d}")),
         Column("B", ColumnType.Text()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #8
0
 def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
     """Fallback columns whose types contradict the data should be ignored."""
     frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     # Deliberately swapped: "A" holds ints and "B" holds strings.
     mismatched_fallback = [
         Column("A", ColumnType.TEXT()),
         Column("B", ColumnType.NUMBER()),
     ]
     coerced = ProcessResult.coerce(frame, try_fallback_columns=mismatched_fallback)
     actual_columns = coerced.columns
     expected_columns = [
         Column("A", ColumnType.NUMBER()),
         Column("B", ColumnType.TEXT()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #9
0
 def test_coerce_infer_columns_with_unit(self):
     """For a Period column, the ``column_formats`` value should become the Date unit."""
     periods = [pd.Period("2021-01-01", freq="D"), None]
     frame = pd.DataFrame({"A": periods, "B": ["x", "y"]})
     module_output = {"dataframe": frame, "column_formats": {"A": "year"}}
     coerced = ProcessResult.coerce(module_output)
     actual_columns = coerced.columns
     expected_columns = [
         Column("A", ColumnType.Date(unit="year")),
         Column("B", ColumnType.Text()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #10
0
 def test_ctor_infer_columns(self):
     """The constructor should infer Number, Text and Timestamp column types."""
     frame = pd.DataFrame({
         "A": [1, 2],
         "B": ["x", "y"],
         "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
     })
     constructed = ProcessResult(frame)
     actual_columns = constructed.columns
     expected_columns = [
         Column("A", ColumnType.Number()),
         Column("B", ColumnType.Text()),
         Column("C", ColumnType.Timestamp()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #11
0
 def test_to_arrow_normal_dataframe(self):
     """to_arrow() on a one-column result should return the matching RenderResult."""
     # Reserve a unique temporary filename, then delete the file itself so
     # nothing exists at that path when to_arrow() is called.
     handle, filename = tempfile.mkstemp()
     os.close(handle)
     os.unlink(filename)
     try:
         process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
         result = process_result.to_arrow(Path(filename))

         expected_path = Path(filename)
         expected_data = pyarrow.table({"A": [1, 2]})
         # Reuse whatever .format ProcessResult.coerce() chose for the
         # column, so the test does not pin a particular default format.
         inferred_format = process_result.columns[0].type.format
         expected_metadata = atypes.TableMetadata(
             2,
             [atypes.Column("A", ColumnType.Number(inferred_format))],
         )
         expected = atypes.RenderResult(
             atypes.ArrowTable(expected_path, expected_data, expected_metadata),
             [],
             {},
         )
         self.assertEqual(result, expected)
     finally:
         # Unconditional cleanup: this raises if no file exists at the path.
         os.unlink(filename)
예제 #12
0
 def test_to_arrow(self):
     """TableShape.to_arrow() should give a TableMetadata with equivalent columns."""
     shape = TableShape(
         3,
         [
             Column("A", ColumnType.NUMBER("{:,d}")),
             Column("B", ColumnType.TEXT()),
         ],
     )
     actual = shape.to_arrow()
     expected = atypes.TableMetadata(
         3,
         [
             atypes.Column("A", atypes.ColumnType.Number("{:,d}")),
             atypes.Column("B", atypes.ColumnType.Text()),
         ],
     )
     self.assertEqual(actual, expected)
예제 #13
0
 def test_dataframe_uint8_column(self):
     """A ``np.uint8`` DataFrame column should map to a uint8 Arrow column, format kept."""
     frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
     frame_columns = [Column("A", ColumnType.Number("{:,d}"))]
     expected_table = make_table(
         make_column("A", [1, 2, 3, 253], type=pa.uint8(), format="{:,d}")
     )
     self._test_dataframe_to_arrow_table(frame, frame_columns, expected_table)
예제 #14
0
 def test_format_whole_float_as_int(self):
     """
     Whole-number floats should format without a fractional part.

     This mimics d3-format, which cannot differentiate between float and int.
     """
     values = pd.Series([1.1, 2.0, 123456789.0])
     formatted = ColumnType.NUMBER("{:,}").format_series(values)
     expected = pd.Series(["1.1", "2", "123,456,789"])
     assert_series_equal(formatted, expected)
예제 #15
0
 def test_arrow_timestamp_column(self):
     """A nanosecond Arrow timestamp column should become a ``datetime64[ns]`` column."""
     timestamps = pyarrow.array(
         [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
         type=pyarrow.timestamp(unit="ns", tz=None),
     )
     source = arrow_table(
         {"A": timestamps},
         [atypes.Column("A", ColumnType.Timestamp())],
     )
     dataframe, columns = arrow_table_to_dataframe(source)

     expected_frame = pd.DataFrame(
         {"A": ["2019-09-17T21:21:00.123456Z", None]},
         dtype="datetime64[ns]",
     )
     assert_frame_equal(dataframe, expected_frame)
     self.assertEqual(columns, [Column("A", ColumnType.Timestamp())])
예제 #16
0
 def test_columns(self):
     """``column_names`` and ``columns`` should cover number, str, datetime and category data."""
     frame = pd.DataFrame({
         "A": [1],  # numeric value
         "B": ["foo"],  # string value
         "C": dt(2018, 8, 20),  # datetime scalar
     })
     # The category column is attached after construction.
     frame["D"] = pd.Series(["cat"], dtype="category")
     constructed = ProcessResult(frame)

     self.assertEqual(constructed.column_names, ["A", "B", "C", "D"])

     actual_columns = constructed.columns
     # The category column "D" is expected to be reported as Text.
     expected_columns = [
         Column("A", ColumnType.Number()),
         Column("B", ColumnType.Text()),
         Column("C", ColumnType.Timestamp()),
         Column("D", ColumnType.Text()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #17
0
 def test_dataframe_all_null_text_column(self):
     """A text column holding only ``None`` should become an all-null Arrow string column."""
     frame = pd.DataFrame({"A": [None]}, dtype=str)
     frame_columns = [Column("A", ColumnType.TEXT())]
     actual = dataframe_to_arrow_table(frame, frame_columns, self.path)

     expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
     assert_arrow_table_equals(actual, expected)
예제 #18
0
 def test_ctor_infer_columns(self):
     """The constructor should infer NUMBER, TEXT and DATETIME column types."""
     frame = pd.DataFrame(
         {
             "A": [1, 2],
             "B": ["x", "y"],
             "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
         }
     )
     constructed = ProcessResult(frame)
     actual_columns = constructed.columns
     expected_columns = [
         Column("A", ColumnType.NUMBER()),
         Column("B", ColumnType.TEXT()),
         Column("C", ColumnType.DATETIME()),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #19
0
 def test_dataframe_datetime_column(self):
     """A ``datetime64[ns]`` column should be written as a nanosecond Arrow timestamp column."""
     frame = pd.DataFrame(
         {"A": ["2019-09-17T21:21:00.123456Z", None]},
         dtype="datetime64[ns]",
     )
     frame_columns = [Column("A", ColumnType.Timestamp())]
     actual = dataframe_to_arrow_table(frame, frame_columns, self.path)

     expected_timestamps = pyarrow.array(
         [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
         type=pyarrow.timestamp(unit="ns", tz=None),
     )
     expected = arrow_table(
         {"A": expected_timestamps},
         [atypes.Column("A", ColumnType.Timestamp())],
     )
     assert_arrow_table_equals(actual, expected)
예제 #20
0
 def test_dataframe_datetime_column(self):
     """A ``datetime64[ns]`` DataFrame column should map to an Arrow datetime column."""
     frame = pd.DataFrame(
         {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
     )
     frame_columns = [Column("A", ColumnType.Timestamp())]
     expected_values = [dt.fromisoformat("2019-09-17T21:21:00.123456"), None]
     expected_table = make_table(make_column("A", expected_values))
     self._test_dataframe_to_arrow_table(frame, frame_columns, expected_table)
예제 #21
0
 def test_ctor_infer_columns(self):
     """The constructor should infer Number, Text, Timestamp and Date("day") types."""
     frame = pd.DataFrame(
         {
             "A": [1, 2],
             "B": ["x", "y"],
             "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
             "D": [pd.Period("2021-01-01", freq="D"), pd.NaT],
         }
     )
     constructed = ProcessResult(frame)
     actual_columns = constructed.columns
     # The freq="D" Period column is expected to come back with unit "day".
     expected_columns = [
         Column("A", ColumnType.Number()),
         Column("B", ColumnType.Text()),
         Column("C", ColumnType.Timestamp()),
         Column("D", ColumnType.Date("day")),
     ]
     self.assertEqual(actual_columns, expected_columns)
예제 #22
0
 def test_format(self):
     """DATETIME formatting should give ISO 8601 strings ending in "Z", keeping NaN as NaN."""
     timestamps = pd.Series(
         [dt(1999, 2, 3, 4, 5, 6, 7), np.nan, dt(2000, 3, 4, 5, 6, 7, 8)]
     )
     formatted = ColumnType.DATETIME().format_series(timestamps)
     expected = pd.Series(
         ["1999-02-03T04:05:06.000007Z", np.nan, "2000-03-04T05:06:07.000008Z"]
     )
     assert_series_equal(formatted, expected)
예제 #23
0
 def test_arrow_category_column(self):
     """An Arrow dictionary column should become a pandas category column typed as Text."""
     dictionary_array = pyarrow.DictionaryArray.from_arrays(
         pyarrow.array([0, 1, None, 0], type=pyarrow.int8()),
         pyarrow.array(["A", "B"], type=pyarrow.string()),
     )
     source = arrow_table({"A": dictionary_array})
     dataframe, columns = arrow_table_to_dataframe(source)

     self.assertEqual(columns, [Column("A", ColumnType.Text())])

     expected_frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
     assert_frame_equal(dataframe, expected_frame)
예제 #24
0
 def test_dataframe_category_column(self):
     """A pandas category column should map to an Arrow dictionary column."""
     frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
     frame_columns = [Column("A", ColumnType.Text())]
     # int8 indices into the ["A", "B"] dictionary; the null stays null.
     expected_array = pa.DictionaryArray.from_arrays(
         pa.array([0, 1, None, 0], pa.int8()),
         pa.array(["A", "B"], pa.string()),
     )
     expected_table = pa.table({"A": expected_array})
     self._test_dataframe_to_arrow_table(frame, frame_columns, expected_table)
예제 #25
0
 def test_dataframe_all_null_category_column(self):
     """An all-null category column should map to a dictionary column with no entries."""
     frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
     frame_columns = [Column("A", ColumnType.Text())]
     # One null index and an empty string dictionary.
     expected_array = pa.DictionaryArray.from_arrays(
         pa.array([None], pa.int8()),
         pa.array([], pa.string()),
     )
     expected_table = pa.table({"A": expected_array})
     self._test_dataframe_to_arrow_table(frame, frame_columns, expected_table)
예제 #26
0
 def test_arrow_all_null_category_column(self):
     """An all-null Arrow dictionary column should become an all-null category column."""
     dictionary_array = pyarrow.DictionaryArray.from_arrays(
         pyarrow.array([None], type=pyarrow.int8()),
         pyarrow.array([], type=pyarrow.string()),
     )
     source = arrow_table({"A": dictionary_array})
     dataframe, columns = arrow_table_to_dataframe(source)

     self.assertEqual(columns, [Column("A", ColumnType.TEXT())])

     expected_frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
     assert_frame_equal(dataframe, expected_frame)
예제 #27
0
 def test_dataframe_category_column(self):
     """A pandas category column should be written as an Arrow dictionary column."""
     frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
     frame_columns = [Column("A", ColumnType.Text())]
     actual = dataframe_to_arrow_table(frame, frame_columns, self.path)

     expected_array = pyarrow.DictionaryArray.from_arrays(
         pyarrow.array([0, 1, None, 0], type=pyarrow.int8()),
         pyarrow.array(["A", "B"], type=pyarrow.string()),
     )
     expected = arrow_table({"A": expected_array})
     assert_arrow_table_equals(actual, expected)
예제 #28
0
 def test_dataframe_all_null_category_column(self):
     """An all-null category column should be written as a dictionary column with no entries."""
     frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
     frame_columns = [Column("A", ColumnType.TEXT())]
     actual = dataframe_to_arrow_table(frame, frame_columns, self.path)

     expected_array = pyarrow.DictionaryArray.from_arrays(
         pyarrow.array([None], type=pyarrow.int8()),
         pyarrow.array([], type=pyarrow.string()),
     )
     expected = arrow_table({"A": expected_array})
     assert_arrow_table_equals(actual, expected)
예제 #29
0
 def test_format_nulls_becomes_str(self):
     """Formatting an all-NaN float64 series should give an object-dtype series of NaN."""
     all_null = pd.Series([np.nan, np.nan], dtype=np.float64)
     formatted = ColumnType.NUMBER().format_series(all_null)
     expected = pd.Series([np.nan, np.nan], dtype=object)
     assert_series_equal(formatted, expected)
예제 #30
0
 def test_format_zero_length_becomes_str(self):
     """Formatting an empty int64 series should give an empty object-dtype series."""
     # With zero rows pandas has no values from which to detect the result
     # type; luckily, it defaults to `object`.
     empty = pd.Series([], dtype=np.int64)
     formatted = ColumnType.NUMBER().format_series(empty)
     expected = pd.Series([], dtype=object)
     assert_series_equal(formatted, expected)