def test_too_many_rows(self):
    # The fixture holds header "A" plus seven data rows ("a".."g"). Only the
    # first four rows are expected back, along with a SkippedRows warning.
    # NOTE(review): the row limit is configured outside this method -- the
    # (3, 5) arguments presumably mean "3 rows skipped, limit 5"; confirm.
    with _temp_csv("A\na\nb\nc\nd\ne\nf\ng") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        kept_rows = ["a", "b", "c", "d"]
        expected = ParseCsvResult(
            pa.table({"A": kept_rows}),
            [ParseCsvWarning.SkippedRows(3, 5)],
        )
        assert_csv_result_equals(actual, expected)
def test_truncate_values(self):
    # Each fixture line is a single-column value of varying UTF-8 byte length.
    # The expectation is that over-long values lose whole characters, never a
    # partial multi-byte sequence.
    # NOTE(review): the 4-byte-per-value limit is presumably set elsewhere
    # (test settings); it is not visible in this method.
    # Examples from https://en.wikipedia.org/wiki/UTF-8
    csv_lines = [
        "AAAAxxxx",  # header row
        "AAAA",
        "AA\u00A2",  # ¢ (2 bytes) -- keep
        "AAA\u00A2",  # ¢ (2 bytes) -- drop both bytes
        "A\u0939",  # ह (3 bytes) -- keep
        "AA\u0939",  # ह (3 bytes) -- drop all three bytes
        "AAA\u0939",  # ह (3 bytes) -- drop all three bytes
        "\U00010348",  # 𐍈 (4 bytes) -- keep
        "A\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
        "AA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
        "AAA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
    ]
    with _temp_csv("\n".join(csv_lines)) as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        truncated_column = [
            "AAAA",
            "AA\u00A2",
            "AAA",
            "A\u0939",
            "AA",
            "AAA",
            "\U00010348",
            "A",
            "AA",
            "AAA",
        ]
        # NOTE(review): 7 presumably counts the six shortened data values
        # plus the shortened header; confirm against TruncatedValues' fields.
        expected_warnings = [ParseCsvWarning.TruncatedValues(7, 4, 0, 0)]
        expected = ParseCsvResult(
            pa.table({"AAAA": truncated_column}),
            expected_warnings,
        )
        assert_csv_result_equals(actual, expected)
def test_truncate_column_names(self):
    # Headers "ABCDE" and "BCDEF" are expected to come back shortened. The
    # shortened "ABCDE" would collide with the existing "ABCD" column, so it
    # is expected to be renamed "AB 2". Both a truncation warning and a
    # numbering warning are expected, each citing "AB 2".
    with _temp_csv("ABC,ABCD,ABCDE,BCDEF\na,b,c,d") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        expected_columns = {
            "ABC": ["a"],
            "ABCD": ["b"],
            "AB 2": ["c"],
            "BCDE": ["d"],
        }
        expected_warnings = [
            ParseCsvWarning.TruncatedColumnNames(2, "AB 2"),
            ParseCsvWarning.NumberedColumnNames(1, "AB 2"),
        ]
        expected = ParseCsvResult(pa.table(expected_columns), expected_warnings)
        assert_csv_result_equals(actual, expected)
def test_truncate_csv_repair_utf8(self):
    # The fixture is 20 bytes long as UTF-8 ("é" takes two bytes). The
    # expected result corresponds to a cut after 13 bytes, which splits "é":
    # its lead byte 0xC3 (195) at offset 12 is reported as invalid and shows
    # up as U+FFFD, and the unfinished row's "B" cell is null.
    # NOTE(review): the 13-byte limit is configured outside this method.
    with _temp_csv("A,B\na,b\nc,d\né,f\ng,h") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        expected_columns = {
            "A": ["a", "c", "�"],
            "B": ["b", "d", None],
        }
        truncation = ParseCsvWarning.TruncatedFile(20, 13)
        repair = ParseCsvWarning.RepairedEncoding(
            encoding="utf-8",
            first_invalid_byte=195,
            first_invalid_byte_position=12,
        )
        expected = ParseCsvResult(
            pa.table(expected_columns),
            [truncation, repair],
        )
        assert_csv_result_equals(actual, expected)
def test_omit_ascii_control_characters_from_column_names(self):
    # A tab inside the header "A\tB" is expected to be removed from the
    # column name ("AB"), while the same tab inside a data value is expected
    # to survive unchanged ("a\tb").
    with _temp_csv("A\tB,C\na\tb,c") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        expected_table = pa.table({"AB": ["a\tb"], "C": ["c"]})
        cleaned_warning = ParseCsvWarning.CleanedAsciiColumnNames(1, "AB")
        expected = ParseCsvResult(expected_table, [cleaned_warning])
        assert_csv_result_equals(actual, expected)
def test_truncate_csv(self):
    # The fixture is 19 bytes long. The expected result corresponds to a cut
    # after 13 bytes: the third data row keeps "e" in column A but has a null
    # in column B, and the fourth row is absent.
    # NOTE(review): the 13-byte limit is configured outside this method.
    with _temp_csv("A,B\na,b\nc,d\ne,f\ng,h") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        expected_columns = {
            "A": ["a", "c", "e"],
            "B": ["b", "d", None],
        }
        expected = ParseCsvResult(
            pa.table(expected_columns),
            [ParseCsvWarning.TruncatedFile(19, 13)],
        )
        assert_csv_result_equals(actual, expected)
def test_too_many_columns(self):
    # Six columns go in; only "A" and "B" are expected back, with a
    # SkippedColumns(4, 2) warning for the rest.
    # NOTE(review): the column limit is configured outside this method.
    with _temp_csv("A,B,C,D,E,F\na,b,c,d,e,f") as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        kept_columns = {"A": ["a"], "B": ["b"]}
        skipped_warning = ParseCsvWarning.SkippedColumns(4, 2)
        expected = ParseCsvResult(pa.table(kept_columns), [skipped_warning])
        assert_csv_result_equals(actual, expected)
def test_repair_unexpected_eof(self):
    # The last value opens a quote that is never closed before end of file.
    # The expected result keeps everything after the quote (newline included)
    # as the value and reports RepairedEndOfFile.
    with _temp_csv('A,B\nx,"y\nz') as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        expected_table = pa.table({"A": ["x"], "B": ["y\nz"]})
        expected = ParseCsvResult(
            expected_table,
            [ParseCsvWarning.RepairedEndOfFile()],
        )
        assert_csv_result_equals(actual, expected)
def test_repair_missing_quote(self):
    # Both data values have stray text after their closing quote. The
    # expected values append that text to the unquoted content ('x y' and
    # 'z" a'), and a single RepairedValues warning covers both.
    with _temp_csv('A,B\n"x" y,"z""" a') as csv_path:
        actual = _internal_parse_csv(csv_path, has_header=True)
        repaired_columns = {
            "A": ["x y"],
            "B": ['z" a'],
        }
        repaired_warning = ParseCsvWarning.RepairedValues(2, 1, 0)
        expected = ParseCsvResult(pa.table(repaired_columns), [repaired_warning])
        assert_csv_result_equals(actual, expected)
def test_rewrite_conflicting_column_headers(self):
    # The header has a duplicate "A" and an empty fourth name alongside an
    # explicit "Column 4". Expected names: the second "A" becomes "A 2" and
    # the empty name becomes "Column 5". Only a NumberedColumnNames warning
    # is expected.
    with _temp_csv("A,A,Column 4,\na,b,c,d") as csv_path:
        parsed = _internal_parse_csv(csv_path, has_header=True)
        expected_columns = {
            "A": ["a"],
            "A 2": ["b"],  # rewritten
            "Column 4": ["c"],
            "Column 5": ["d"],  # rewritten
        }
        expected = ParseCsvResult(
            pa.table(expected_columns),
            [ParseCsvWarning.NumberedColumnNames(2, "A 2")],
        )
        assert_csv_result_equals(parsed, expected)
def test_warn_and_replace_on_invalid_encoding(self):
    # tests that `chardet` is invoked
    # NOTE(review): encoding="utf-8" is passed explicitly below, so it is
    # unclear that detection actually runs here -- confirm or drop the line
    # above.
    #
    # The fixture is windows-1252 bytes parsed as UTF-8. "ô" encodes to byte
    # 0xF4 (244) at offset 3, which is the first invalid byte reported; both
    # accented characters are expected to come back as U+FFFD.
    raw_bytes = "A\nfôo\ncafé".encode("windows-1252")
    with _temp_csv(raw_bytes) as csv_path:
        actual = _internal_parse_csv(csv_path, encoding="utf-8", has_header=True)
        repair = ParseCsvWarning.RepairedEncoding(
            encoding="utf-8",
            first_invalid_byte=244,
            first_invalid_byte_position=3,
        )
        expected = ParseCsvResult(
            pa.table({"A": ["f�o", "caf�"]}),
            [repair],
        )
        assert_csv_result_equals(actual, expected)