def test_integer_with_nulls(self):
    # Round-trip integer columns that contain nulls through a Feather file.
    # pandas requires upcast to float dtype, so the frame read back is
    # expected to hold float64 values with NaN in the masked slots.
    path = random_path()
    # Register the file on the test instance (presumably removed during
    # teardown -- confirm against the fixture code).
    self.test_files.append(path)

    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    num_values = 100

    writer = FeatherWriter()
    writer.open(path)

    # A single mask (True == null) is shared by every column.
    null_mask = np.random.randint(0, 10, size=num_values) < 3
    expected_cols = []
    for name in int_dtypes:
        # Cast to the dtype under test: without the cast every column would
        # carry the platform-default integer dtype and the narrower/unsigned
        # dtypes would never be exercised. Values in [0, 100) fit all of
        # the listed dtypes, including int8.
        values = np.random.randint(0, 100, size=num_values).astype(name)
        writer.write_array(name, values, null_mask)

        expected = values.astype('f8')
        expected[null_mask] = np.nan
        expected_cols.append(expected)

    ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                            columns=int_dtypes)

    writer.close()

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
def test_float_nulls(self):
    # Write float32 and float64 columns that share one null mask, then check
    # both the values read back (NaN where masked) and the per-column null
    # counts reported for the file.
    n_rows = 100
    path = random_path()
    self.test_files.append(path)

    writer = FeatherWriter()
    writer.open(path)

    null_mask = np.random.randint(0, 10, size=n_rows) < 3
    dtypes = ['f4', 'f8']
    columns = []
    counts = []
    for dtype_name in dtypes:
        column = np.random.randn(n_rows).astype(dtype_name)
        writer.write_array(dtype_name, column, null_mask)

        # Only after the write: overwrite masked slots with NaN so the
        # array doubles as the expected column.
        column[null_mask] = np.nan
        columns.append(column)
        counts.append(null_mask.sum())

    writer.close()

    expected_frame = pd.DataFrame(
        {dtype_name: column for dtype_name, column in zip(dtypes, columns)},
        columns=dtypes)

    actual_frame = read_feather(path)
    assert_frame_equal(actual_frame, expected_frame)
    assert_array_equal(self._get_null_counts(path), counts)
def test_buffer_bounds_error(self):
    # ARROW-1676
    # For each i in [16, 256), write a float64 array made of one leading
    # null followed by 0..i-1, read it back, and compare with the pandas
    # conversion of the same array.
    path = random_path()
    self.test_files.append(path)

    for count in range(16, 256):
        payload = [None]
        payload.extend(range(count))
        arr = pa.array(payload, type=pa.float64())

        # The same path is rewritten on every iteration.
        writer = FeatherWriter()
        writer.open(path)
        writer.write_array('arr', arr)
        writer.close()

        expected = pd.DataFrame({'arr': arr.to_pandas()})
        assert_frame_equal(read_feather(path), expected)

        # Exactly one null (the leading None) is expected in the column.
        self._check_pandas_roundtrip(expected, null_counts=[1])
def test_read_table(self):
    # Write a 100x100 integer matrix column by column, then check that
    # read_table() yields the same data as a Table built from the
    # equivalent DataFrame.
    shape = (100, 100)
    path = random_path()
    self.test_files.append(path)

    writer = FeatherWriter()
    writer.open(path)

    matrix = np.random.randint(0, 100, size=shape)
    col_names = ['col_' + str(i) for i in range(100)]
    for position, col_name in enumerate(col_names):
        writer.write_array(col_name, matrix[:, position])

    writer.close()

    frame = pd.DataFrame(matrix, columns=col_names)
    expected_table = pa.Table.from_pandas(frame)

    actual_table = read_table(path)
    assert_frame_equal(expected_table.to_pandas(), actual_table.to_pandas())
def test_dataset(self):
    # Split one random frame row-wise across several Feather files and check
    # that FeatherDataset(paths).read_pandas() reassembles the original
    # frame.
    #
    # This method used to be defined twice with identical bodies; the second
    # definition silently replaced the first, so only one copy is kept.
    num_values = (100, 100)
    num_files = 5
    paths = [random_path() for i in range(num_files)]
    df = pd.DataFrame(
        np.random.randn(*num_values),
        columns=['col_' + str(i) for i in range(num_values[1])])

    self.test_files.extend(paths)

    # Each file receives an equally sized, contiguous slice of rows.
    rows_per_file = num_values[0] // num_files
    for index, path in enumerate(paths):
        start = index * rows_per_file
        stop = start + rows_per_file

        writer = FeatherWriter()
        writer.open(path)
        for col in range(num_values[1]):
            writer.write_array(df.columns[col], df.iloc[start:stop, col])
        writer.close()

    data = FeatherDataset(paths).read_pandas()
    assert_frame_equal(data, df)
def test_boolean_nulls(self):
    # Round-trip a boolean column containing nulls.
    # pandas requires upcast to object dtype, so the expected column holds
    # Python objects with None in the masked slots.
    path = random_path()
    self.test_files.append(path)
    n_rows = 100

    # Fixed seed; the mask is drawn before the values, and that order must
    # be kept for the generated data to stay the same.
    np.random.seed(0)

    writer = FeatherWriter()
    writer.open(path)

    null_mask = np.random.randint(0, 10, size=n_rows) < 3
    flags = np.random.randint(0, 10, size=n_rows) < 5
    writer.write_array('bools', flags, null_mask)

    expected_col = flags.astype(object)
    expected_col[null_mask] = None

    writer.close()

    expected_frame = pd.DataFrame({'bools': expected_col})
    actual_frame = read_feather(path)
    assert_frame_equal(actual_frame, expected_frame)