def test_liveness(self):
    """Check which tables remain retainable once a liveness scope has closed.

    One table is created before the scope is opened, one is created inside
    it and left alone, and one is created inside it and passed to
    ``preserve``. After the scope exits, the first and the preserved table
    must still accept ``tryRetainReference()`` while the unpreserved one
    must refuse it.
    """
    outside_scope = create_table()
    with liveness_scope() as scope:
        dropped = create_table()
        frame = to_pandas(dropped)
        kept = create_table()
        frame = to_pandas(kept)
        scope.preserve(kept)

    j_outside = outside_scope.j_table
    self.assertTrue(j_outside.tryRetainReference())
    j_kept = kept.j_table
    self.assertTrue(j_kept.tryRetainReference())
    j_dropped = dropped.j_table
    self.assertFalse(j_dropped.tryRetainReference())
def test_to_pandas(self):
    """Convert the fixture table to a DataFrame and compare column metadata.

    Verifies the column count, the total cell count (two per column), and,
    column by column, that the pandas dtype and series name match the
    table column's ``data_type.np_type`` and ``name``.
    """
    table = self.test_table
    df = to_pandas(table)

    column_count = len(table.columns)
    self.assertEqual(len(df.columns), column_count)
    self.assertEqual(df.size, 2 * len(table.columns))

    series_in_order = [df[label] for label in df.columns]
    for col, series in zip(table.columns, series_in_order):
        with self.subTest(col):
            self.assertEqual(col.data_type.np_type, series.dtype)
            self.assertEqual(col.name, series.name)
def test_to_table_boolean_with_none(self):
    """Round-trip a Boolean column holding a null through pandas.

    The column is first rewritten by a formula that maps null to
    ``NULL_BYTE`` and true/false to 1/0; the rewritten table is then
    converted to a DataFrame and back and compared with itself.
    """
    source = new_table(cols=[bool_col(name="Boolean", data=[True, None])])
    remap_formula = "Boolean = isNull(Boolean) ? NULL_BYTE : (Boolean == true ? 1: 0)"
    prepared_table = source.update(formulas=[remap_formula])

    round_tripped = to_table(to_pandas(prepared_table))
    self.assert_table_equals(round_tripped, prepared_table)
def test_to_pandas_remaps(self):
    """Check the pandas dtypes of a NaN-remapped long column and a boolean column.

    A ``Long`` column is derived from ``Long_`` by replacing nulls with
    ``Double.NaN``; only ``Boolean`` and ``Long`` are exported. The test
    asserts that ``Long`` arrives as ``float64`` and ``Boolean`` as
    ``bool_``.
    """
    prepared_table = self.test_table.update(
        formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])
    df = to_pandas(prepared_table, cols=["Boolean", "Long"])

    self.assertEqual(df['Long'].dtype, np.float64)
    self.assertEqual(df['Boolean'].values.dtype, np.bool_)
    # NOTE(review): an earlier version ended by building a DataFrame and
    # calling ``.equals(df)`` without asserting the result, so it checked
    # nothing. That frame also used integer column labels and row-wise data,
    # so it could not have matched ``df``. A value-level assertion should be
    # added once the expected contents of the fixture are confirmed.
def test_liveness_nested(self):
    """Check table retainability across two nested liveness scopes.

    Each scope creates one table it leaves alone and one it passes to
    ``preserve``. After the inner scope closes (still inside the outer
    one), the inner preserved table must be retainable and the inner
    unpreserved one must not. After the outer scope closes, only the
    table preserved by the outer scope must remain retainable.
    """
    with liveness_scope() as outer_scope:
        outer_dropped = create_table()
        frame = to_pandas(outer_dropped)
        outer_kept = create_table()
        frame = to_pandas(outer_kept)
        outer_scope.preserve(outer_kept)

        with liveness_scope() as inner_scope:
            inner_dropped = create_table()
            frame = to_pandas(inner_dropped)
            inner_kept = create_table()
            frame = to_pandas(inner_kept)
            inner_scope.preserve(inner_kept)

        # Inner scope has exited; outer scope is still open.
        self.assertTrue(inner_kept.j_table.tryRetainReference())
        # Release the extra reference taken by the tryRetainReference()
        # call in the assertion above, so later checks are not skewed.
        inner_kept.j_table.dropReference()
        self.assertFalse(inner_dropped.j_table.tryRetainReference())

    # Both scopes have exited.
    self.assertTrue(outer_kept.j_table.tryRetainReference())
    self.assertFalse(outer_dropped.j_table.tryRetainReference())
    self.assertFalse(inner_kept.j_table.tryRetainReference())
    self.assertFalse(inner_dropped.j_table.tryRetainReference())
def test_vector_column(self):
    """Export a grouped table and inspect the resulting object columns.

    After grouping by ``String``, both exported columns must have object
    dtype, and the first two ``Doubles`` cells must expose, via
    ``toArray()``, the values that were grouped under each key.
    """
    keys = ["Str1", "Str1", "Str2", "Str2", "Str2"]
    values = [1.0, 2.0, 4.0, 8.0, 16.0]
    flat_table = new_table(
        [string_col("String", keys), double_col("Doubles", values)])
    grouped_table = flat_table.group_by(["String"])

    df = to_pandas(grouped_table, cols=["String", "Doubles"])
    for column_name in ('String', 'Doubles'):
        self.assertEqual(df[column_name].dtype, np.object_)

    grouped_doubles = df['Doubles']
    expected_groups = ([1.0, 2.0], [4.0, 8.0, 16.0])
    for position, expected in enumerate(expected_groups):
        self.assertEqual(expected, list(grouped_doubles[position].toArray()))
def test_to_table(self):
    """Round-trip a table of primitive-typed columns through pandas.

    The table is exported with ``to_pandas``, re-imported with
    ``to_table``, and the result is compared with the original.
    """
    np_int8_values = np.array([1, -1], dtype=np.int8)
    columns = [
        bool_col(name="Boolean", data=[True, False]),
        byte_col(name="Byte", data=(1, -1)),
        char_col(name="Char", data='-1'),
        short_col(name="Short", data=[1, -1]),
        int_col(name="Int", data=[1, -1]),
        long_col(name="Long", data=[1, NULL_LONG]),
        long_col(name="NPLong", data=np_int8_values),
        float_col(name="Float", data=[1.01, -1.01]),
        double_col(name="Double", data=[1.01, -1.01]),
    ]
    original = new_table(cols=columns)

    round_tripped = to_table(to_pandas(original))
    self.assert_table_equals(round_tripped, original)
def test_to_table_datetime_with_none(self):
    """Round-trip a datetime column that includes a ``None`` entry.

    The column mixes a ``dtypes.DateTime(1)`` value, ``None``, and two
    datetimes parsed from strings with the ``NY`` and ``HI`` zone suffixes.
    """
    ny_datetime = to_datetime("2021-12-10T23:59:59 NY")
    hi_datetime = to_datetime("2021-12-10T23:59:59 HI")
    values = [dtypes.DateTime(1), None, ny_datetime, hi_datetime]

    original = new_table(cols=[datetime_col(name="Datetime", data=values)])
    round_tripped = to_table(to_pandas(original))
    self.assert_table_equals(round_tripped, original)
def base_test(self, source, model, np_dtype):
    """Compare ``gather.table_to_numpy_2d`` output with the pandas export.

    The ``X``, ``Y`` and ``Z`` column sources of ``source`` are gathered in
    both row-major and column-major layout and checked against
    ``to_pandas(source).values`` for shape, values, dtype and contiguity.

    Args:
        source: table whose ``j_table`` supplies the row set and column
            sources.
        model: not used by this method; kept so existing callers keep
            working.
        np_dtype: dtype requested from the gatherer and expected on every
            resulting array.
    """
    j_table = source.j_table
    rows = j_table.getRowSet()
    cols = [j_table.getColumnSource(col) for col in ["X", "Y", "Z"]]

    def gather_2d(layout):
        # Both layouts share the same rows, columns and dtype.
        return gather.table_to_numpy_2d(rows, cols, layout, np_dtype)

    array_from_table = to_pandas(source).values
    gathered_rowmajor = gather_2d(gather.MemoryLayout.ROW_MAJOR)
    gathered_colmajor = gather_2d(gather.MemoryLayout.COLUMN_MAJOR)

    # assertEqual is used instead of assertTrue(a == b) so that a failure
    # reports both compared values.
    with self.subTest(msg="Array shape"):
        self.assertEqual(gathered_rowmajor.shape, array_from_table.shape)
        print("Row major gathered shape: {}".format(
            gathered_rowmajor.shape))
        self.assertEqual(gathered_colmajor.shape, array_from_table.shape)
        print("Column major gathered shape: {}".format(
            gathered_colmajor.shape))

    with self.subTest(msg="Values in array"):
        self.assertTrue(np.allclose(gathered_rowmajor, array_from_table))
        print("All row-major array values are equal")
        self.assertTrue(np.allclose(gathered_colmajor, array_from_table))
        print("All column-major array values are equal")

    with self.subTest(msg="Array data type"):
        self.assertEqual(gathered_rowmajor.dtype, np_dtype)
        self.assertEqual(gathered_rowmajor.dtype, array_from_table.dtype)
        self.assertEqual(gathered_colmajor.dtype, np_dtype)
        self.assertEqual(gathered_colmajor.dtype, array_from_table.dtype)
        self.assertEqual(gathered_rowmajor.dtype, gathered_colmajor.dtype)
        print("Array dtype: {}".format(np_dtype))

    with self.subTest(msg="Contiguity"):
        self.assertTrue(gathered_rowmajor.flags["C_CONTIGUOUS"]
                        or gathered_rowmajor.flags["F_CONTIGUOUS"])
        self.assertTrue(gathered_colmajor.flags["C_CONTIGUOUS"]
                        or gathered_colmajor.flags["F_CONTIGUOUS"])
        print("Array contiguity checked")
def test_round_trip_with_nulls(self):
    """Round-trip a table whose columns each contain a null-like value.

    Checks the exported DataFrame's column count and cell count (two per
    column), then converts it back and compares with the original table.
    """
    # Per the original author's note, these have no two-way conversion and
    # are therefore left out of the round trip:
    #   - bool_col with a None entry
    #   - string_col with a None entry
    #   - jobj_col holding a dtypes.ArrayList and None
    columns = [
        byte_col(name="Byte", data=(1, NULL_BYTE)),
        char_col(name="Char", data='-1'),
        short_col(name="Short", data=[1, NULL_SHORT]),
        int_col(name="Int_", data=[1, NULL_INT]),
        long_col(name="Long_", data=[1, NULL_LONG]),
        float_col(name="Float_", data=[1.01, np.nan]),
        double_col(name="Double_", data=[1.01, np.nan]),
        datetime_col(name="Datetime", data=[dtypes.DateTime(1), None]),
        pyobj_col(name="PyObj", data=[CustomClass(1, "1"), None]),
    ]
    original = new_table(cols=columns)

    df = to_pandas(original)
    self.assertEqual(len(df.columns), len(original.columns))
    self.assertEqual(df.size, 2 * len(original.columns))

    restored = to_table(df)
    self.assert_table_equals(restored, original)
def test_invalid_col_name(self):
    """Requesting the column name ``boolean`` must raise ``DHError``.

    The raised error's text is also required to mention the requested
    name.
    """
    requested = ["boolean", "Long"]
    with self.assertRaises(DHError) as raised:
        to_pandas(self.test_table, cols=requested)

    error_text = str(raised.exception)
    self.assertIn("boolean", error_text)
def test_to_table_category(self):
    """A categorical column must round-trip to the same values as its source.

    Column ``B`` is the ``category``-typed copy of string column ``A``;
    after a trip through ``to_table`` and ``to_pandas`` the two columns'
    value arrays must be equal.
    """
    source_df = pd.DataFrame({"A": ["a", "b", "a", "d"]})
    source_df["B"] = source_df["A"].astype("category")

    restored_df = to_pandas(to_table(source_df))
    plain_values = restored_df["A"].values
    category_values = restored_df["B"].values
    self.assertTrue(np.array_equal(plain_values, category_values))