コード例 #1
0
 def test_too_many_rows(self):
     """Parse seven data rows and expect only the first four plus a warning.

     NOTE(review): the row limit that causes rows to be skipped is
     configured outside this view -- confirm where it is set.
     """
     csv_text = "A\na\nb\nc\nd\ne\nf\ng"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         kept_rows = pa.table({"A": ["a", "b", "c", "d"]})
         warnings = [ParseCsvWarning.SkippedRows(3, 5)]
         assert_csv_result_equals(actual, ParseCsvResult(kept_rows, warnings))
コード例 #2
0
 def test_truncate_values(self):
     """Expect over-long values to be cut without splitting a character.

     The first input line is the header; the remaining lines pair an ASCII
     prefix with a 2-, 3- or 4-byte character.

     NOTE(review): the per-value size limit (apparently 4 bytes) is
     configured outside this view -- confirm.
     """
     csv_lines = [
         # Examples from https://en.wikipedia.org/wiki/UTF-8
         "AAAAxxxx",
         "AAAA",
         "AA\u00A2",  # ¢ (2 bytes) -- keep
         "AAA\u00A2",  # ¢ (2 bytes) -- drop both bytes
         "A\u0939",  # ह (3 bytes) -- keep
         "AA\u0939",  # ह (3 bytes) -- drop all three bytes
         "AAA\u0939",  # ह (3 bytes) -- drop all three bytes
         "\U00010348",  # 𐍈 (4 bytes) -- keep
         "A\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
         "AA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
         "AAA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
     ]
     with _temp_csv("\n".join(csv_lines)) as path:
         actual = _internal_parse_csv(path, has_header=True)
         # One expected cell per data line above, in the same order.
         expected_values = [
             "AAAA",
             "AA\u00A2",
             "AAA",
             "A\u0939",
             "AA",
             "AAA",
             "\U00010348",
             "A",
             "AA",
             "AAA",
         ]
         expected = ParseCsvResult(
             pa.table({"AAAA": expected_values}),
             [ParseCsvWarning.TruncatedValues(7, 4, 0, 0)],
         )
         assert_csv_result_equals(actual, expected)
コード例 #3
0
 def test_truncate_column_names(self):
     """Expect long header names to be shortened, with both warnings raised.

     "ABCDE" is expected to become "AB 2" and "BCDEF" to become "BCDE".

     NOTE(review): the name-length limit is configured outside this view,
     and "AB 2" presumably arises because the shortened "ABCDE" would clash
     with the existing "ABCD" column -- confirm.
     """
     csv_text = "ABC,ABCD,ABCDE,BCDEF\na,b,c,d"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table(
             {"ABC": ["a"], "ABCD": ["b"], "AB 2": ["c"], "BCDE": ["d"]}
         )
         expected_warnings = [
             ParseCsvWarning.TruncatedColumnNames(2, "AB 2"),
             ParseCsvWarning.NumberedColumnNames(1, "AB 2"),
         ]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #4
0
 def test_truncate_csv_repair_utf8(self):
     """Expect truncation inside a multi-byte character to trigger a repair.

     The expected result keeps three rows, the last one holding a
     replacement character and a null, and carries both a TruncatedFile
     and a RepairedEncoding warning.

     NOTE(review): the byte limit is configured outside this view; the
     expected numbers match the text being written as UTF-8 (20 bytes,
     with byte 12 the first byte of "é", 195) -- confirm.
     """
     csv_text = "A,B\na,b\nc,d\né,f\ng,h"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table(
             {"A": ["a", "c", "�"], "B": ["b", "d", None]}
         )
         truncated = ParseCsvWarning.TruncatedFile(20, 13)
         repaired = ParseCsvWarning.RepairedEncoding(
             encoding="utf-8",
             first_invalid_byte=195,
             first_invalid_byte_position=12,
         )
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, [truncated, repaired])
         )
コード例 #5
0
 def test_omit_ascii_control_characters_from_column_names(self):
     """Expect a tab to be removed from a header but kept in a cell value.

     The header "A\\tB" is expected to become "AB", while the cell
     "a\\tb" is expected to come through unchanged.
     """
     csv_text = "A\tB,C\na\tb,c"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table({"AB": ["a\tb"], "C": ["c"]})
         expected_warnings = [ParseCsvWarning.CleanedAsciiColumnNames(1, "AB")]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #6
0
 def test_truncate_csv(self):
     """Expect an over-long file to be cut short, with a TruncatedFile warning.

     The expected table stops at the third row, whose second cell is null.

     NOTE(review): the byte limit is configured outside this view --
     confirm.
     """
     csv_text = "A,B\na,b\nc,d\ne,f\ng,h"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table(
             {"A": ["a", "c", "e"], "B": ["b", "d", None]}
         )
         expected_warnings = [ParseCsvWarning.TruncatedFile(19, 13)]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #7
0
 def test_too_many_columns(self):
     """Expect only the first two of six columns, plus a SkippedColumns warning.

     NOTE(review): the column limit is configured outside this view --
     confirm.
     """
     csv_text = "A,B,C,D,E,F\na,b,c,d,e,f"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table({"A": ["a"], "B": ["b"]})
         expected_warnings = [ParseCsvWarning.SkippedColumns(4, 2)]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #8
0
 def test_repair_unexpected_eof(self):
     """Expect a quoted value left open at end of file to be kept.

     The input ends inside a quoted value; the expected cell contains
     everything after the opening quote, including the newline.
     """
     csv_text = 'A,B\nx,"y\nz'
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table({"A": ["x"], "B": ["y\nz"]})
         expected_warnings = [ParseCsvWarning.RepairedEndOfFile()]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #9
0
 def test_repair_missing_quote(self):
     """Expect text that follows a closing quote to be joined onto the value.

     Both cells in the data row have extra characters after the quoted
     part; the expected result keeps that text and reports RepairedValues.
     """
     csv_text = 'A,B\n"x" y,"z""" a'
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_table = pa.table({"A": ["x y"], "B": ['z" a']})
         expected_warnings = [ParseCsvWarning.RepairedValues(2, 1, 0)]
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, expected_warnings)
         )
コード例 #10
0
 def test_rewrite_conflicting_column_headers(self):
     """Expect duplicate and empty headers to be given distinct names.

     The second "A" is expected to become "A 2", and the empty fourth
     header is expected to become "Column 5" because "Column 4" is taken.
     """
     csv_text = "A,A,Column 4,\na,b,c,d"
     with _temp_csv(csv_text) as path:
         actual = _internal_parse_csv(path, has_header=True)
         expected_columns = {
             "A": ["a"],
             "A 2": ["b"],  # rewritten
             "Column 4": ["c"],
             "Column 5": ["d"],  # rewritten
         }
         expected = ParseCsvResult(
             pa.table(expected_columns),
             [ParseCsvWarning.NumberedColumnNames(2, "A 2")],
         )
         assert_csv_result_equals(actual, expected)
コード例 #11
0
 def test_warn_and_replace_on_invalid_encoding(self):
     """Expect bytes that are invalid in the requested encoding to be replaced.

     The file holds Windows-1252 bytes but is parsed with an explicit
     encoding="utf-8"; each accented character is expected to come out as
     a replacement character, with one RepairedEncoding warning.
     """
     # NOTE(review): whether `chardet` runs at all when an encoding is passed
     # explicitly is not visible from this test -- confirm.
     raw_bytes = "A\nfôo\ncafé".encode("windows-1252")
     with _temp_csv(raw_bytes) as path:
         actual = _internal_parse_csv(path, encoding="utf-8", has_header=True)
         expected_table = pa.table({"A": ["f�o", "caf�"]})
         repaired = ParseCsvWarning.RepairedEncoding(
             encoding="utf-8",
             first_invalid_byte=244,
             first_invalid_byte_position=3,
         )
         assert_csv_result_equals(
             actual, ParseCsvResult(expected_table, [repaired])
         )