def do_hell_frame(self, dos, frame_encoding, out_encoding, include_null, allow_quoted_newlines, rows, cols, num_threads):
    """Round-trip a generated "hell frame" through a CSV file and compare.

    A frame is produced by ``paratext.testing.generate_hell_frame``, written
    to a temporary file with ``paratext.serial.save_frame``, read back with
    ``paratext.load_csv_to_pandas`` and compared against the generated frame.

    Args:
        dos: Forwarded to ``save_frame`` as ``dos``.
        frame_encoding: Forwarded to the generator as ``fmt``.
        out_encoding: Forwarded to both the writer and the loader.
        include_null: Forwarded to the generator; the loader receives its
            negation as ``convert_null_to_space``.
        allow_quoted_newlines: Forwarded to both the writer and the loader.
        rows: Row count handed to the generator.
        cols: Column count handed to the generator.
        num_threads: Forwarded to the loader.
    """
    reference = paratext.testing.generate_hell_frame(
        rows, cols, include_null=include_null, fmt=frame_encoding)
    with generate_tempfilename() as csv_path:
        logging.debug("filename: %s" % csv_path)
        paratext.serial.save_frame(
            csv_path,
            reference,
            allow_quoted_newlines,
            out_encoding=out_encoding,
            dos=dos,
        )
        loader_options = dict(
            allow_quoted_newlines=allow_quoted_newlines,
            out_encoding=out_encoding,
            num_threads=num_threads,
            convert_null_to_space=not include_null,
        )
        loaded = paratext.load_csv_to_pandas(csv_path, **loader_options)
        assert_dictframe_almost_equal(loaded, reference)
# Checks that empty cells in a numeric CSV load as zeros.
#
# The fixture has a header A..F followed by ten data rows in which `#` marks
# a cell that should be empty; every `#` is stripped with bytes.replace before
# the data is handed to generate_tempfile.  The file is loaded with
# number_only=True and compared against `expected`, which holds 0 at every
# position that was a `#` placeholder and the literal number elsewhere.
#
# NOTE(review): the rows of the triple-quoted fixture appear separated by
# spaces in this flattened view; presumably they are newline-separated in the
# real file -- confirm before reformatting this literal.
def test_basic_empty_cells_num(self): filedata = b"""A,B,C,D,E,F #,1,#,#,2,# 3,#,#,4,5,# 6,#,#,#,#,# #,7,#,#,#,# #,#,8,#,#,# #,#,#,9,#,# #,#,#,#,10,# #,#,#,#,#,11 #,#,12,#,#,13 14,#,#,15,16,17 """ filedata = filedata.replace(b"#", b"") with generate_tempfile(filedata) as fn: expected = { "A": [0, 3, 6, 0, 0, 0, 0, 0, 0, 14], "B": [1, 0, 0, 7, 0, 0, 0, 0, 0, 0], "C": [0, 0, 0, 0, 8, 0, 0, 0, 12, 0], "D": [0, 4, 0, 0, 0, 9, 0, 0, 0, 15], "E": [2, 5, 0, 0, 0, 0, 10, 0, 0, 16], "F": [0, 0, 0, 0, 0, 0, 0, 11, 13, 17] } logging.debug("filename: %s" % fn) actual = paratext.load_csv_to_pandas(fn, number_only=True) assert_dictframe_almost_equal(actual, expected)
def do_basic_nums(self, dtype, num_rows, num_columns, num_threads, number_only, no_header):
    """Write a purely numeric CSV and check that paratext loads it back.

    Args:
        dtype: Selects the cell values.  For an integer dtype (per
            ``np.issubdtype(dtype, np.integer)``) the cell in row ``r`` and
            column ``i`` is ``r * i``; otherwise each row is drawn from
            ``np.random.random``.
        num_rows: Number of data rows to generate.
        num_columns: Number of values written per data row.
        num_threads: Forwarded to ``paratext.load_csv_to_pandas``.
        number_only: Forwarded to ``paratext.load_csv_to_pandas``.
        no_header: When true no header line is written and the expected
            column names are ``col0``, ``col1``, ...; otherwise the header
            uses the first ``num_columns`` letters of ``A``..``J``.

    Note:
        Only ten header letters are available, so with a header and
        ``num_columns`` above ten the data rows are wider than the header
        and only the first ten columns are recorded in the expectation.
    """
    if no_header:
        keys = ["col%d" % k for k in range(num_columns)]
        lines = []
    else:
        keys = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"][:num_columns]
        lines = [",".join(keys) + "\n"]

    expected = {key: [] for key in keys}
    # The dtype does not change between rows, so classify it once.
    integer_data = np.issubdtype(dtype, np.integer)
    for row in range(num_rows):
        if integer_data:
            row_data = [row * i for i in range(num_columns)]
        else:
            row_data = np.random.random((num_columns, ))
        lines.append(",".join(str(v) for v in row_data) + "\n")
        # zip stops at the shorter sequence, so values without a column
        # name are left out of the expectation.
        for key, value in zip(keys, row_data):
            expected[key].append(value)

    # Join once at the end instead of growing a string inside the loop.
    filedata = "".join(lines)
    with generate_tempfile(filedata.encode("utf-8")) as fn:
        logging.debug("filename: %s", fn)
        actual = paratext.load_csv_to_pandas(
            fn,
            num_threads=num_threads,
            number_only=number_only,
            no_header=no_header,
        )
        assert_dictframe_almost_equal(actual, expected)
# Checks loading a CSV that has a header (A,B,C) but no data rows: the
# result is compared against three columns that are all empty lists.
#
# NOTE(review): the whitespace after the header inside the triple-quoted
# fixture is presumably a line break that was flattened in this view --
# confirm before reformatting this literal.
def test_basic_3x0x(self): filedata = b"""A,B,C """ with generate_tempfile(filedata) as fn: expected = {"A": [], "B": [], "C": []} logging.debug("filename: %s" % fn) actual = paratext.load_csv_to_pandas(fn) assert_dictframe_almost_equal(actual, expected)
def do_simple_file_suite(self, dataset_metadata, frame_encoding, num_threads, expected_df):
    """Load an existing dataset file and compare it with a reference frame.

    Args:
        dataset_metadata: Mapping whose ``"filename"`` entry is the path
            passed to the loader.
        frame_encoding: Forwarded to the loader as ``out_encoding``.
        num_threads: Forwarded to the loader.
        expected_df: Reference the loaded frame is compared against.
    """
    loaded_df = paratext.load_csv_to_pandas(
        dataset_metadata["filename"],
        allow_quoted_newlines=True,
        out_encoding=frame_encoding,
        num_threads=num_threads,
    )
    assert_dictframe_almost_equal(loaded_df, expected_df)
# Checks a two-column CSV (header A,B) with a single data row holding the
# cells A.1 and 3ABC; both cells are expected to come back as the strings
# "A.1" and "3ABC".
#
# NOTE(review): the rows of the triple-quoted fixture appear separated by
# spaces in this flattened view; presumably they are newline-separated in the
# real file -- confirm before reformatting this literal.
def test_edge_case1(self): filedata = b"""A,B A.1,3ABC """ with generate_tempfile(filedata) as fn: expected = {"A": ["A.1"], "B": ["3ABC"]} logging.debug("filename: %s" % fn) actual = paratext.load_csv_to_pandas(fn) assert_dictframe_almost_equal(actual, expected)
# Checks quoted cells that contain escape sequences.  The fixture has a
# header A,B,C and three data rows whose quoted cells hold backslash-escaped
# quotes, escaped backslashes, an empty string and a newline escape, plus a
# numeric column C.  It is loaded with allow_quoted_newlines=True and
# out_encoding="utf-8" and compared against `expected`, where column A holds
# a double quote, a backslash and a newline, column B holds an empty string,
# "X" and a backslash followed by a double quote, and column C holds 7, 8, 9.
#
# NOTE(review): the rows of the triple-quoted fixture appear separated by
# spaces in this flattened view; presumably they are newline-separated in the
# real file.  The escaping here is byte-sensitive -- confirm against the real
# file before touching this literal.
def test_basic_strange1(self): filedata = b"""A,B,C "\\\"","",7 "\\\\","X",8 "\n","\\\\\\"",9""" with generate_tempfile(filedata) as fn: expected = {"A": ["\"","\\","\n"], "B": ["","X","\\\""], "C": [7,8,9]} logging.debug("filename: %s" % fn) actual = paratext.load_csv_to_pandas(fn, allow_quoted_newlines=True, out_encoding="utf-8") assert_dictframe_almost_equal(actual, expected)
def run_case(self, num_rows, num_cats, num_floats, num_ints, num_threads):
    """Round-trip a generated mixed-type frame through a CSV file.

    The frame from ``paratext.testing.generate_mixed_frame`` is saved to a
    temporary file, loaded back and compared with the generated frame.  When
    ``num_rows`` is 0 each loaded column is first cast to the dtype of the
    matching generated column.

    Args:
        num_rows: Row count handed to the generator.
        num_cats: Passed to the generator as its third argument.
        num_floats: Passed to the generator as its second argument.
        num_ints: Passed to the generator as its fourth argument.
        num_threads: Forwarded to the loader.
    """
    reference, _types = paratext.testing.generate_mixed_frame(
        num_rows, num_floats, num_cats, num_ints)
    with generate_tempfilename() as csv_path:
        logging.debug("filename: %s" % csv_path)
        paratext.serial.save_frame(
            csv_path,
            reference,
            allow_quoted_newlines=True,
            out_encoding='utf-8',
        )
        loaded = paratext.load_csv_to_pandas(
            csv_path,
            allow_quoted_newlines=True,
            out_encoding='utf-8',
            num_threads=num_threads,
        )
        if num_rows == 0:
            # Not fair to expect us to guess this correctly with no data.
            # In particular we'll probably guess uint8 for the numeric cols.
            for column in loaded:
                wanted_dtype = reference[column].dtype
                loaded[column] = loaded[column].astype(wanted_dtype)
        assert_dictframe_almost_equal(loaded, reference)
def run_case(self, num_rows, num_cats, num_floats, num_ints, num_threads):
    """Round-trip a generated mixed-type frame through a CSV file.

    The frame from ``paratext.testing.generate_mixed_frame`` is saved to a
    temporary file, loaded back and compared with the generated frame.

    Args:
        num_rows: Row count handed to the generator.
        num_cats: Passed to the generator as its third argument.
        num_floats: Passed to the generator as its second argument.
        num_ints: Passed to the generator as its fourth argument.
        num_threads: Forwarded to the loader.
    """
    reference, _types = paratext.testing.generate_mixed_frame(
        num_rows, num_floats, num_cats, num_ints)
    with generate_tempfilename() as csv_path:
        logging.debug("filename: %s" % csv_path)
        paratext.serial.save_frame(
            csv_path,
            reference,
            allow_quoted_newlines=True,
            out_encoding='utf-8',
        )
        loaded = paratext.load_csv_to_pandas(
            csv_path,
            allow_quoted_newlines=True,
            out_encoding='utf-8',
            num_threads=num_threads,
        )
        assert_dictframe_almost_equal(loaded, reference)
def do_basic_empty(self, file_body, num_threads):
    """Check that loading ``file_body`` produces an empty DataFrame.

    Args:
        file_body: Content handed to ``generate_tempfile`` for the input file.
        num_threads: Forwarded to ``paratext.load_csv_to_pandas``.
    """
    empty_frame = pandas.DataFrame()
    with generate_tempfile(file_body) as csv_path:
        logging.debug("filename: %s" % csv_path)
        loaded = paratext.load_csv_to_pandas(csv_path, num_threads=num_threads)
        assert_dictframe_almost_equal(loaded, empty_frame)