def test_fast_csv_reader_column_vals_full(self):
    """fast_csv_reader must report is_values_full (with the offending column
    index) when a column's data overflows its reserved span in column_vals,
    while the index buffer still has room.

    Column f1 holds short 1-byte values for the first half of the rows and
    1000-byte values for the second half, so its reserved span of
    10 * chunk_row_size bytes overflows.
    """
    def _make_column_val_full_data(chunk_row_size):
        # Half short values, half oversized values, to blow the value buffer.
        count_row = chunk_row_size
        half = count_row // 2
        columns = {'f1': ['a'] * half + ['b' * 1000] * (count_row - half)}
        # Single column with 10 bytes/row reserved — far less than 1000.
        column_offsets = np.array([0, 10], dtype=np.int64) * chunk_row_size
        csv_buffer = StringIO()
        pd.DataFrame(columns).to_csv(csv_buffer, index=False)
        return csv_buffer, column_offsets

    chunk_row_size = 10
    csv_buffer, column_offsets = _make_column_val_full_data(chunk_row_size)
    content = np.frombuffer(csv_buffer.getvalue().encode(), dtype=np.uint8)
    _, count_columns, count_rows, _ = get_file_stat(
        csv_buffer, chunk_row_size)
    # One extra slot per column for the initial index 0.
    column_inds = np.zeros((count_columns, count_rows + 1), dtype=np.int64)
    column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)
    _, _, is_indices_full, is_values_full, val_full_col_idx = fast_csv_reader(
        content, 0, column_inds, column_vals, column_offsets, True,
        ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
    self.assertFalse(is_indices_full)
    self.assertTrue(is_values_full)
    # Column 0 (f1) is the column whose value buffer filled up.
    self.assertEqual(val_full_col_idx, 0)
def test_csv_fast_correctness(self):
    """fast_csv_reader parses a small generated CSV and fills the index and
    value buffers with the expected cumulative offsets and raw bytes."""
    file_lines, chunk_row_size = 3, 100
    csv_buffer, _, _, _, column_offsets = _make_test_data(
        TEST_SCHEMA, file_lines, chunk_row_size)
    content = np.frombuffer(csv_buffer.getvalue().encode(), dtype=np.uint8)
    _, count_columns, count_rows, _ = get_file_stat(
        csv_buffer, chunk_row_size)
    # One extra slot per column for the initial index 0.
    column_inds = np.zeros((count_columns, count_rows + 1), dtype=np.int64)
    column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)
    _, written_row_count, _, _, _ = fast_csv_reader(
        content, 0, column_inds, column_vals, column_offsets, True,
        ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
    self.assertEqual(written_row_count, 3)
    # column_inds holds cumulative end offsets of each row's value.
    self.assertListEqual(list(column_inds[0][:written_row_count + 1]),
                         [0, 3, 5, 9])
    self.assertListEqual(list(column_inds[1][:written_row_count + 1]),
                         [0, 1, 2, 3])
    # Raw value bytes by ASCII code: 99='c', 98='b', 100='d'; 49='1', 48='0'.
    self.assertListEqual(
        list(column_vals[column_offsets[0]:column_offsets[0] + 9]),
        [99, 99, 99, 98, 98, 100, 100, 100, 100])
    self.assertListEqual(
        list(column_vals[column_offsets[1]:column_offsets[1] + 3]),
        [49, 48, 49])
def test_escape_bad_formed_csv_2(self):
    """A quoted field with trailing unquoted characters ("abc"de) must make
    fast_csv_reader raise with the message 'invalid double quote'."""
    bad_csv = StringIO('id,f1\n1,"abc"de\n')
    chunk_row_size = 10
    _, count_columns, count_rows, _ = get_file_stat(bad_csv, chunk_row_size)
    content = np.frombuffer(bad_csv.getvalue().encode(), dtype=np.uint8)
    column_offsets = np.array([0, 1, 11], dtype=np.int64) * chunk_row_size
    # One extra slot per column for the initial index 0.
    column_inds = np.zeros((count_columns, count_rows + 1), dtype=np.int64)
    column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)
    with self.assertRaises(Exception) as ctx:
        fast_csv_reader(content, 0, column_inds, column_vals,
                        column_offsets, True, ESCAPE_VALUE, SEPARATOR_VALUE,
                        NEWLINE_VALUE, WHITE_SPACE_VALUE)
    self.assertEqual(str(ctx.exception), 'invalid double quote')
def test_escape_well_formed_csv(self):
    """Doubled quotes inside quoted fields collapse to single quotes; the
    per-row index values reflect the unescaped value lengths."""
    well_formed = StringIO('id,f1,f2,f3\n1,"abc","a""b""c","""ab"""\n')
    chunk_row_size = 10
    _, count_columns, count_rows, _ = get_file_stat(
        well_formed, chunk_row_size)
    content = np.frombuffer(well_formed.getvalue().encode(), dtype=np.uint8)
    column_offsets = (np.array([0, 1, 11, 21, 31], dtype=np.int64)
                      * chunk_row_size)
    # One extra slot per column for the initial index 0.
    column_inds = np.zeros((count_columns, count_rows + 1), dtype=np.int64)
    column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)
    _, written_row_count, _, _, _ = fast_csv_reader(
        content, 0, column_inds, column_vals, column_offsets, True,
        ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
    self.assertEqual(written_row_count, 1)
    self.assertEqual(column_inds[1, 1], 3)  # abc
    self.assertEqual(column_inds[2, 1], 5)  # a"b"c
    self.assertEqual(column_inds[3, 1], 4)  # "ab"
def test_fast_csv_reader_column_inds_full(self):
    """fast_csv_reader must report is_indices_full (but not is_values_full)
    when the column_inds buffer cannot index every row in the chunk.

    NOTE(review): relies on get_file_stat's row-count estimate for this
    content being smaller than the actual row count — confirm against
    get_file_stat's implementation.
    """
    def _make_column_inds_full_data(chunk_row_size):
        # Three columns, each with one 4-byte value and the rest empty;
        # every column gets a generous 10 * chunk_row_size value span so
        # only the index buffer can fill up.
        count_row = chunk_row_size
        count_col = 3
        column_offsets = np.zeros(count_col + 1, dtype=np.int64)
        columns = {}
        for i in range(count_col):
            columns['f' + str(i)] = ['abcd'] + [''] * (count_row - 1)
            column_offsets[i + 1] = column_offsets[i] + 10 * chunk_row_size
        csv_buffer = StringIO()
        pd.DataFrame(columns).to_csv(csv_buffer, index=False)
        return csv_buffer, column_offsets

    chunk_row_size = 10
    csv_buffer, column_offsets = _make_column_inds_full_data(chunk_row_size)
    content = np.frombuffer(csv_buffer.getvalue().encode(), dtype=np.uint8)
    _, count_columns, count_rows, _ = get_file_stat(
        csv_buffer, chunk_row_size)
    # One extra slot per column for the initial index 0.
    column_inds = np.zeros((count_columns, count_rows + 1), dtype=np.int64)
    column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)
    _, _, is_indices_full, is_values_full, _ = fast_csv_reader(
        content, 0, column_inds, column_vals, column_offsets, True,
        ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
    self.assertTrue(is_indices_full)
    self.assertFalse(is_values_full)