def test_stream_table(self):
    # Feeds several table/dialect pairings to `_stream_test`, each inside its
    # own subTest, and then passes raw row strings to `_stream_test_rows`.
    table_cases = [
        (
            "simple",
            [["A", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ";", "quotechar": "", "escapechar": ""},
        ),
        (
            "escaped",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": "", "escapechar": "\\"},
        ),
        (
            "quoted",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
        ),
        (
            "double",
            [['a"A,0"b', "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
        ),
    ]
    for label, cells, dialect_kwargs in table_cases:
        # The dialect is built outside the subTest, as in the sequential
        # form of this test.
        dialect = SimpleDialect(**dialect_kwargs)
        with self.subTest(name=label):
            self._stream_test(cells, dialect)

    plain_rows = ['1,"AA"', '2,"BB"', '3,"CC"']
    plain_expected = [["1", "AA"], ["2", "BB"], ["3", "CC"]]
    with self.subTest(name="rowtest"):
        self._stream_test_rows(plain_rows, plain_expected)

    # The space after each delimiter confuses the detection algorithm, so a
    # NoDetectionResult is expected here. Support for detecting
    # 'skipinitialspace' should fix this problem.
    spaced_rows = ['1, "AA"', '2, "BB"', '3, "CC"']
    spaced_expected = [["1", "AA"], ["2", "BB"], ["3", "CC"]]
    with self.subTest(name="raises2"), self.assertRaises(NoDetectionResult):
        self._stream_test_rows(spaced_rows, spaced_expected)
def test_form_3(self):
    # Checks `is_form_3` against inputs paired with either a single-quote or
    # a double-quote dialect; cases run in the listed order.
    single_quoted = SimpleDialect(delimiter=",", quotechar="'", escapechar="")
    double_quoted = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

    checks = [
        ('A,B\nC,"D"', double_quoted, True),
        ('A,B\nC,"d,e"', double_quoted, True),
        ('A,\nC,"d,e"', double_quoted, False),
        ("3;4,B\nC,D", double_quoted, False),
        ('A,B\n"C",D\n', single_quoted, False),
        ('A,B\n"C",D\n', double_quoted, True),
    ]
    for text, dialect, should_match in checks:
        verdict = is_form_3(text, dialect)
        if should_match:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_get_best_set_2(self):
    # Three dialects with "Q" scores None, 1.0 and 2.0; the result of
    # `get_best_set` must be the set holding only the "|" dialect.
    semicolon = SimpleDialect(";", None, None)
    comma = SimpleDialect(",", None, None)
    pipe = SimpleDialect("|", None, None)
    scores = {
        semicolon: {"Q": None},
        comma: {"Q": 1.0},
        pipe: {"Q": 2.0},
    }

    best = get_best_set(scores)

    # A freshly built dialect is used on the expected side, so the
    # comparison goes through equality/hashing rather than object identity.
    self.assertEqual(best, {SimpleDialect("|", None, None)})
def test_abstraction_8(self):
    # `make_abstraction` on ',"",,\r\n' with a comma/double-quote dialect
    # must yield "CDCDCDC".
    dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
    abstraction = detect_pattern.make_abstraction(',"",,\r\n', dialect)
    self.assertEqual("CDCDCDC", abstraction)
def test_abstraction_5(self):
    # Input mixes doubled quote characters with a "|" escape character;
    # the expected abstraction is "CDC".
    dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
    abstraction = detect_pattern.make_abstraction('a,"bc""d"",|"f|""', dialect)
    self.assertEqual("CDC", abstraction)
def test_abstraction_3(self):
    # Input uses "\n", "\r" and "\r\n" as row endings in turn, with no quote
    # or escape character configured.
    dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
    abstraction = detect_pattern.make_abstraction(
        "a,a,\n,a,a\ra,a,a\r\n", dialect
    )
    self.assertEqual("CDCDCRCDCDCRCDCDC", abstraction)
def test_form_4(self):
    # Checks `is_form_4` with two delimiter-less dialects: one with a
    # double-quote quotechar, one without any quotechar.
    with_quotes = SimpleDialect(delimiter="", quotechar='"', escapechar="")
    without_quotes = SimpleDialect(delimiter="", quotechar="", escapechar="")

    accepted = [
        ("A\nB\nC", without_quotes),
        ("1\n2\n3", without_quotes),
        ("A_B\n1\n2", without_quotes),
        ("A&B\n1\n2", without_quotes),
        ("A&B\n-1\n2", without_quotes),
        ('"A"\n"B"\n"C"\n', with_quotes),
    ]
    rejected = [
        ('"A", "B"\n"B"\n"C"\n', with_quotes),
        ('"A","B"\n"B"\n"C"\n', with_quotes),
        ('"A@b"\n"B"\n"C"\n', with_quotes),
        ('A\n"-1"\n2', without_quotes),
        ("A B\n-1 3\n2 4", without_quotes),
    ]

    for text, dialect in accepted:
        self.assertTrue(is_form_4(text, dialect))
    for text, dialect in rejected:
        self.assertFalse(is_form_4(text, dialect))
def test_abstraction_9(self):
    # "A,B|,C" with "|" as escape character: expected abstraction is "CDC".
    dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
    abstraction = detect_pattern.make_abstraction("A,B|,C", dialect)
    self.assertEqual("CDC", abstraction)
def test_abstraction_10(self):
    # A quoted cell containing the delimiter and a "|"-escaped quote
    # character: expected abstraction is "CDC".
    dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
    abstraction = detect_pattern.make_abstraction('A,"B,C|"D"', dialect)
    self.assertEqual("CDC", abstraction)
def test_code_5(self):
    # Builds a tab-delimited file via `_build_file`, runs the "code" command
    # on it and compares the command output with an expected code listing.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter="\t", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)
    # NOTE(review): the command is executed before the try block below, so
    # the file at `tmpfname` is not removed if any of these calls raises.
    application = build_application()
    command = application.find("code")
    tester = CommandTester(command)
    tester.execute(tmpfname)
    # NOTE(review): the expected listing is shown here on a single line; it
    # looks like the line breaks inside this literal were lost (a "#" header
    # and a `with` block cannot share one line in valid Python). Restore the
    # original layout from the command's real output before relying on it.
    exp = f"""\ # Code generated with CleverCSV version {__version__} import clevercsv with open("{tmpfname}", "r", newline="", encoding="ascii") as fp: reader = clevercsv.reader(fp, delimiter="\\t", quotechar="", escapechar="") rows = list(reader) """
    try:
        output = tester.io.fetch_output()
        self.assertEqual(exp, output)
    finally:
        # Remove the input file whether or not the comparison passed.
        os.unlink(tmpfname)
def test_write(self):
    # Drives `_write_test` through: default settings, an explicit dialect,
    # transposition, a ragged table (ValueError) and two text encodings.
    rows = [["A", "B,C", "D"], [1, 2, 3], [4, 5, 6]]

    with self.subTest(name="default"):
        self._write_test(rows, 'A,"B,C",D\r\n1,2,3\r\n4,5,6\r\n')

    semicolon = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    with self.subTest(name="dialect"):
        self._write_test(rows, "A;B,C;D\n1;2;3\n4;5;6\n", dialect=semicolon)

    with self.subTest(name="transposed"):
        self._write_test(
            rows,
            "A;1;4\nB,C;2;5\nD;3;6\n",
            dialect=semicolon,
            transpose=True,
        )

    # Give the last row an extra cell; writing the ragged table must raise.
    rows[2].append(8)
    with self.assertRaises(ValueError):
        self._write_test(rows, "")

    accented_rows = [["Å", "B", "C"], [1, 2, 3], [4, 5, 6]]
    accented_text = "Å,B,C\r\n1,2,3\r\n4,5,6\r\n"
    # The encoding is always passed explicitly: leaving it out could
    # potentially fail on Windows, where open() defaults to
    # locale.getpreferredencoding() (see gh-27).
    for label, codec in (("encoding_1", "utf-8"), ("encoding_2", "cp1252")):
        with self.subTest(name=label):
            self._write_test(accented_rows, accented_text, encoding=codec)
def test_form_5(self):
    # Checks `is_form_5` with a comma/double-quote dialect: first the inputs
    # that must be accepted, then the ones that must be rejected.
    dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

    accepted = (
        '"A,B"\n"1,2"\n"3,4"',
        '"A,B"\n"1,"\n"2,3"',
    )
    rejected = (
        "A,B\n1,2\n3,4",
        "A,B\n1,\n2,3",
        '"A,""B"""\n"1,"\n"2,3"',
    )

    for text in accepted:
        self.assertTrue(is_form_5(text, dialect))
    for text in rejected:
        self.assertFalse(is_form_5(text, dialect))
def test_pattern_score_3(self):
    # theta_3 from paper
    lines = [
        "7,5; Mon, Jan 12;6,40",
        "100; Fri, Mar 21;8,23",
        "8,2; Thu, Sep 17;2,71",
        "538,0;;7,26",
        '"NA"; Wed, Oct 4;6,93',
    ]
    # Rows are joined with "\n" and there is no trailing newline.
    text = "\n".join(lines)
    dialect = SimpleDialect(delimiter=";", quotechar='"', escapechar="")

    score = detect_pattern.pattern_score(text, dialect)

    self.assertAlmostEqual(10 / 3, score)
def test_detect_base(self):
    # Passes four table/dialect pairings to `_detect_test_wrap`, each inside
    # its own named subTest.
    cases = [
        (
            "simple",
            [["A", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ";", "quotechar": "", "escapechar": ""},
        ),
        (
            "escaped",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": "", "escapechar": "\\"},
        ),
        (
            "quoted",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
        ),
        (
            "double",
            [['a"A,0"b', "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
        ),
    ]
    for label, cells, dialect_kwargs in cases:
        # The dialect is built outside the subTest, as in the sequential
        # form of this test.
        dialect = SimpleDialect(**dialect_kwargs)
        with self.subTest(name=label):
            self._detect_test_wrap(cells, dialect)
def test_read_dataframe(self):
    # Passes table/dialect pairings to `_df_test`, each inside its own named
    # subTest; the last two cases also forward extra keyword arguments.
    cases = [
        (
            "simple",
            [["A", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ";", "quotechar": "", "escapechar": ""},
            {},
        ),
        (
            "escaped",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": "", "escapechar": "\\"},
            {},
        ),
        (
            "quoted",
            [["A,0", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
            {},
        ),
        (
            "double",
            [['a"A,0"b', "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ",", "quotechar": '"', "escapechar": ""},
            {},
        ),
        (
            "simple_nchar",
            [["A", "B", "C"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ";", "quotechar": "", "escapechar": ""},
            {"num_char": 10},
        ),
        (
            "simple_encoding",
            [["Ä", "Ð", "Ç"], [1, 2, 3], [4, 5, 6]],
            {"delimiter": ";", "quotechar": "", "escapechar": ""},
            {"num_char": 10, "encoding": "latin1"},
        ),
    ]
    for label, cells, dialect_kwargs, extra in cases:
        # The dialect is built outside the subTest, as in the sequential
        # form of this test.
        dialect = SimpleDialect(**dialect_kwargs)
        with self.subTest(name=label):
            self._df_test(cells, dialect, **extra)
def test_form_1(self):
    # Checks `is_form_1` with a comma/double-quote dialect: first the inputs
    # that must be accepted, then the ones that must be rejected.
    dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

    accepted = (
        '"A","B","C"',
        '"A","B"\n"C","D"\n',
        '"A","","C"',
    )
    rejected = (
        '"A","B"\n"A"',
        '"A"\n"B"',
        '"A"\n"A","B"',
        '"A",,"C"',
        '"A",C',
        '"A"\n"b""A""c","B"',
    )

    for text in accepted:
        self.assertTrue(is_form_1(text, dialect))
    for text in rejected:
        self.assertFalse(is_form_1(text, dialect))
def test_type_score_1(self):
    # theta_1 from paper
    grid = [
        ["7", "5; Mon", " Jan 12;6", "40"],
        ["100; Fri", " Mar 21;8", "23"],
        ["8", "2; Thu", " Sep 17; 2", "71"],
        ["538", "0;;7", "26"],
        ['"NA"; Wed', " Oct 4;6", "93"],
    ]
    # Cells are joined with "," and rows with "\n" to form the input text.
    text = "\n".join(",".join(row) for row in grid)
    dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")

    score = type_score(text, dialect)

    self.assertAlmostEqual(8 / 17, score)
def test_type_score_3(self):
    # theta_3 from paper
    grid = [
        ["7,5", " Mon, Jan 12", "6,40"],
        ["100", " Fri, Mar 21", "8,23"],
        ["8,2", " Thu, Sep 17", "2,71"],
        ["538,0", "", "7,26"],
        ["N/A", " Wed, Oct 4", "6,93"],
    ]
    # Cells are joined with ";" and rows with "\r" to form the input text.
    text = "\r".join(";".join(row) for row in grid)
    dialect = SimpleDialect(delimiter=";", quotechar='"', escapechar="")

    score = type_score(text, dialect)

    self.assertAlmostEqual(11 / 15, score)
def test_form_2(self):
    # Checks `is_form_2` with a comma dialect that has no quote or escape
    # character: first the accepted inputs, then the rejected ones.
    dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")

    accepted = (
        "1,2,3",
        "1,2,3\na,b,c\n",
        # NOTE(review): "[email protected]" looks like an e-mail obfuscation
        # placeholder rather than deliberate test data; confirm against the
        # upstream test before relying on this case.
        "[email protected],3",
        "a,,3\n1,2,3",
    )
    rejected = (
        "1,2,3\n1,2\n4,5,6",
        "1",
        '1,"a"',
        "a;b,3",
        '"a,3,3\n1,2,3',
        'a,"",3\n1,2,3',
    )

    for text in accepted:
        self.assertTrue(is_form_2(text, dialect))
    for text in rejected:
        self.assertFalse(is_form_2(text, dialect))
def test_standardize_1(self):
    # Runs the "standardize" command on a semicolon-delimited file and
    # compares the stripped command output with the comma-delimited table.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # Everything that uses the file runs under try/finally, so the file is
    # removed even when building or executing the command raises.
    try:
        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(tmpfname)

        exp = "A,B,C\n1,2,3\n4,5,6"
        output = tester.io.fetch_output().strip()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
def test_detect_opts_2(self):
    # Runs the "detect" command with `--num-chars 5` and expects the output
    # to be "Detected: " followed by the string form of the dialect used to
    # build the file.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # Everything that uses the file runs under try/finally, so the file is
    # removed even when building or executing the command raises.
    try:
        application = build_application()
        command = application.find("detect")
        tester = CommandTester(command)
        tester.execute(f"--num-chars 5 {tmpfname}")

        exp = "Detected: " + str(dialect)
        output = tester.io.fetch_output().strip()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
def test_write(self):
    # Drives `_write_test` through: default settings, an explicit dialect,
    # transposition, and a ragged table that must raise ValueError.
    rows = [["A", "B,C", "D"], [1, 2, 3], [4, 5, 6]]

    with self.subTest(name="default"):
        self._write_test(rows, 'A,"B,C",D\n1,2,3\n4,5,6\n')

    semicolon = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    with self.subTest(name="dialect"):
        self._write_test(rows, "A;B,C;D\n1;2;3\n4;5;6\n", dialect=semicolon)

    with self.subTest(name="transposed"):
        self._write_test(
            rows,
            "A;1;4\nB,C;2;5\nD;3;6\n",
            dialect=semicolon,
            transpose=True,
        )

    # Give the last row an extra cell; writing the ragged table must raise.
    rows[2].append(8)
    with self.assertRaises(ValueError):
        self._write_test(rows, "")
def test_standardize_3(self):
    # Runs the "standardize" command with `-t` and compares the stripped
    # command output with the transposed, comma-delimited table.
    #
    # The temporary output file that used to be created here was never
    # passed to the command and never deleted, so it has been dropped.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # Everything that uses the file runs under try/finally, so the file is
    # removed even when building or executing the command raises.
    try:
        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(f"-t {tmpfname}")

        exp = "A,1,4\nB,2,5\nC,3,6"
        output = tester.io.fetch_output().strip()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
def test_standardize_1(self):
    # Runs the "standardize" command on a semicolon-delimited file and
    # compares the raw command output with the CRLF-terminated,
    # comma-delimited table.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # Everything that uses the file runs under try/finally, so the file is
    # removed even when building or executing the command raises.
    try:
        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(tmpfname)

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        exp = crlf.join(["A,B,C", "1,2,3", "4,5,6"])
        # add line terminator of last row
        exp += crlf

        output = tester.io.fetch_output()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
def test_standardize_2(self):
    # Runs the "standardize" command with `-o <file>` and compares the
    # contents of that output file with the comma-delimited table.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)
    tmpoutname = None

    # Both temporary files are cleaned up in the finally clause, including
    # when creating the output file, running the command or reading the
    # result raises.
    try:
        tmpfd, tmpoutname = tempfile.mkstemp(suffix=".csv")
        # Only the path is needed; the command reopens the file by name.
        os.close(tmpfd)

        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(f"-o {tmpoutname} {tmpfname}")

        exp = "A,B,C\n1,2,3\n4,5,6\n"
        with open(tmpoutname, "r") as fp:
            output = fp.read()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
        # The output path is unset only if mkstemp itself failed.
        if tmpoutname is not None:
            os.unlink(tmpoutname)
def test_standardize_3(self):
    # Runs the "standardize" command with `-t` and compares the raw command
    # output with the transposed, CRLF-terminated, comma-delimited table.
    #
    # The temporary output file that used to be created here was never
    # passed to the command and never deleted, so it has been dropped.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # Everything that uses the file runs under try/finally, so the file is
    # removed even when building or executing the command raises.
    try:
        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(f"-t {tmpfname}")

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        exp = crlf.join(["A,1,4", "B,2,5", "C,3,6"])
        # add line terminator of last row
        exp += crlf

        output = tester.io.fetch_output()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
def test_standardize_in_place(self):
    # Runs the "standardize" command with `-i`, expects return code 2, and
    # then checks that the input file itself now holds the CRLF-terminated,
    # comma-delimited table.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)

    # The command run and the return-code assertion sit inside the try
    # block, so the file is removed even when either of them fails.
    try:
        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        retcode = tester.execute(f"-i {tmpfname}")
        self.assertEqual(retcode, 2)

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        exp = crlf.join(["A,B,C", "1,2,3", "4,5,6"])
        # add line terminator of last row
        exp += crlf

        # newline="" turns off newline translation so CRLF is compared as-is.
        with open(tmpfname, "r", newline="") as fp:
            contents = fp.read()
        self.assertEqual(exp, contents)
    finally:
        os.unlink(tmpfname)
def test_code_2(self):
    # Builds a semicolon-delimited file via `_build_file`, runs the "code"
    # command with `-p` and compares the command output with an expected
    # code listing that calls `clevercsv.csv2df`.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)
    # NOTE(review): the command is executed before the try block below, so
    # the file at `tmpfname` is not removed if any of these calls raises.
    application = build_application()
    command = application.find("code")
    tester = CommandTester(command)
    tester.execute(f"-p {tmpfname}")
    # NOTE(review): the expected listing is shown here on a single line; it
    # looks like the line breaks inside this literal were lost (a "#" header
    # followed by statements cannot share one line in valid Python). Restore
    # the original layout from the command's real output before relying on it.
    exp = f"""\ # Code generated with CleverCSV version {__version__} import clevercsv df = clevercsv.csv2df("{tmpfname}", delimiter=";", quotechar="", escapechar="") """
    try:
        output = tester.io.fetch_output()
        self.assertEqual(exp, output)
    finally:
        # Remove the input file whether or not the comparison passed.
        os.unlink(tmpfname)
def test_standardize_2(self):
    # Runs the "standardize" command with `-o <file>` and compares the
    # contents of that output file with the CRLF-terminated, comma-delimited
    # table.
    table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    tmpfname = self._build_file(table, dialect)
    tmpoutname = None

    # Both temporary files are cleaned up in the finally clause, including
    # when creating the output file, running the command or reading the
    # result raises.
    try:
        tmpfd, tmpoutname = tempfile.mkstemp(suffix=".csv")
        # Only the path is needed; the command reopens the file by name.
        os.close(tmpfd)

        application = build_application()
        command = application.find("standardize")
        tester = CommandTester(command)
        tester.execute(f"-o {tmpoutname} {tmpfname}")

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        # The trailing empty item adds the line terminator of the last row.
        exp = crlf.join(["A,B,C", "1,2,3", "4,5,6", ""])

        # newline="" turns off newline translation so CRLF is compared as-is.
        with open(tmpoutname, "r", newline="") as fp:
            output = fp.read()
        self.assertEqual(exp, output)
    finally:
        os.unlink(tmpfname)
        # The output path is unset only if mkstemp itself failed.
        if tmpoutname is not None:
            os.unlink(tmpoutname)
def test_write_simpledialect(self):
    # Passes a single flat row to `_write_test` together with a dialect that
    # uses "|" as quote character; the cell containing the delimiter is
    # expected to come out wrapped in "|".
    pipe_quoted = SimpleDialect(delimiter=",", quotechar="|", escapechar="")
    row = ["a", 1, "p,q"]
    expected = "a,1,|p,q|"
    self._write_test(row, expected, dialect=pipe_quoted)