Example #1
0
    def test_stream_table(self):
        # Stream several small tables, one subtest per dialect flavour, then
        # check row-level streaming with and without a space after the
        # delimiter.
        table_cases = (
            ("simple", ["A", "B", "C"], (";", "", "")),
            ("escaped", ["A,0", "B", "C"], (",", "", "\\")),
            ("quoted", ["A,0", "B", "C"], (",", '"', "")),
            ("double", ['a"A,0"b', "B", "C"], (",", '"', "")),
        )
        for label, header, (delim, quote, escape) in table_cases:
            table = [header, [1, 2, 3], [4, 5, 6]]
            dialect = SimpleDialect(
                delimiter=delim, quotechar=quote, escapechar=escape
            )
            with self.subTest(name=label):
                self._stream_test(table, dialect)

        compact_rows = ['1,"AA"', '2,"BB"', '3,"CC"']
        compact_expected = [["1", "AA"], ["2", "BB"], ["3", "CC"]]
        with self.subTest(name="rowtest"):
            self._stream_test_rows(compact_rows, compact_expected)

        # The space after each delimiter confuses the detection algorithm,
        # so a NoDetectionResult is raised here. Support for detecting
        # 'skipinitialspace' should fix this problem.
        spaced_rows = ['1, "AA"', '2, "BB"', '3, "CC"']
        spaced_expected = [["1", "AA"], ["2", "BB"], ["3", "CC"]]
        with self.subTest(name="raises2"):
            with self.assertRaises(NoDetectionResult):
                self._stream_test_rows(spaced_rows, spaced_expected)
Example #2
0
    def test_form_3(self):
        # Check is_form_3 with a single-quote and a double-quote dialect.
        squote = SimpleDialect(delimiter=",", quotechar="'", escapechar="")
        dquote = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

        for text in ('A,B\nC,"D"', 'A,B\nC,"d,e"'):
            self.assertTrue(is_form_3(text, dquote))

        for text in ('A,\nC,"d,e"', "3;4,B\nC,D"):
            self.assertFalse(is_form_3(text, dquote))

        # The same text is rejected under the single-quote dialect and
        # accepted under the double-quote dialect.
        shared_text = 'A,B\n"C",D\n'
        self.assertFalse(is_form_3(shared_text, squote))
        self.assertTrue(is_form_3(shared_text, dquote))
Example #3
0
 def test_get_best_set_2(self):
     # Given "Q" scores of None, 1.0 and 2.0, only the dialect scoring 2.0
     # is expected in the result.
     semicolon = SimpleDialect(";", None, None)
     comma = SimpleDialect(",", None, None)
     pipe = SimpleDialect("|", None, None)
     scores = {
         semicolon: {"Q": None},
         comma: {"Q": 1.0},
         pipe: {"Q": 2.0},
     }
     best = get_best_set(scores)
     self.assertEqual(best, {SimpleDialect("|", None, None)})
Example #4
0
 def test_abstraction_8(self):
     # Abstraction of a CRLF-terminated row containing an empty quoted cell.
     dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
     abstraction = detect_pattern.make_abstraction(',"",,\r\n', dialect)
     self.assertEqual("CDCDCDC", abstraction)
Example #5
0
 def test_abstraction_5(self):
     # Abstraction of a row mixing doubled quotes with "|"-escaped quotes.
     text = 'a,"bc""d"",|"f|""'
     dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
     abstraction = detect_pattern.make_abstraction(text, dialect)
     self.assertEqual("CDC", abstraction)
Example #6
0
 def test_abstraction_3(self):
     # Abstraction of three rows separated by "\n", "\r" and "\r\n".
     text = "a,a,\n,a,a\ra,a,a\r\n"
     dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
     abstraction = detect_pattern.make_abstraction(text, dialect)
     self.assertEqual("CDCDCRCDCDCRCDCDC", abstraction)
Example #7
0
    def test_form_4(self):
        # Check is_form_4 on single-column inputs, using dialects with an
        # empty delimiter, with and without a quote character.
        with_quotes = SimpleDialect(delimiter="", quotechar='"', escapechar="")
        no_quotes = SimpleDialect(delimiter="", quotechar="", escapechar="")

        accepted = (
            ("A\nB\nC", no_quotes),
            ("1\n2\n3", no_quotes),
            ("A_B\n1\n2", no_quotes),
            ("A&B\n1\n2", no_quotes),
            ("A&B\n-1\n2", no_quotes),
            ('"A"\n"B"\n"C"\n', with_quotes),
        )
        for text, dialect in accepted:
            self.assertTrue(is_form_4(text, dialect))

        rejected = (
            ('"A", "B"\n"B"\n"C"\n', with_quotes),
            ('"A","B"\n"B"\n"C"\n', with_quotes),
            ('"A@b"\n"B"\n"C"\n', with_quotes),
            ('A\n"-1"\n2', no_quotes),
            ("A B\n-1 3\n2 4", no_quotes),
        )
        for text, dialect in rejected:
            self.assertFalse(is_form_4(text, dialect))
Example #8
0
 def test_abstraction_9(self):
     # Abstraction of a row whose second delimiter is preceded by the "|"
     # escape character; two cells are expected.
     dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
     abstraction = detect_pattern.make_abstraction("A,B|,C", dialect)
     self.assertEqual("CDC", abstraction)
Example #9
0
 def test_abstraction_10(self):
     # Abstraction of a quoted cell that contains a delimiter and a
     # "|"-escaped quote character.
     text = 'A,"B,C|"D"'
     dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
     abstraction = detect_pattern.make_abstraction(text, dialect)
     self.assertEqual("CDC", abstraction)
Example #10
0
    def test_code_5(self):
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter="\t", quotechar="", escapechar="")

        tmpfname = self._build_file(table, dialect)

        application = build_application()
        command = application.find("code")
        tester = CommandTester(command)
        tester.execute(tmpfname)

        exp = f"""\

# Code generated with CleverCSV version {__version__}

import clevercsv

with open("{tmpfname}", "r", newline="", encoding="ascii") as fp:
    reader = clevercsv.reader(fp, delimiter="\\t", quotechar="", escapechar="")
    rows = list(reader)

"""
        try:
            output = tester.io.fetch_output()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #11
0
    def test_write(self):
        # Exercise the write helper with the default dialect, an explicit
        # dialect, transposition, a ragged table, and two file encodings.
        rows = [["A", "B,C", "D"], [1, 2, 3], [4, 5, 6]]
        with self.subTest(name="default"):
            self._write_test(rows, 'A,"B,C",D\r\n1,2,3\r\n4,5,6\r\n')

        semicolon = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        with self.subTest(name="dialect"):
            self._write_test(
                rows, "A;B,C;D\n1;2;3\n4;5;6\n", dialect=semicolon
            )

        with self.subTest(name="transposed"):
            self._write_test(
                rows,
                "A;1;4\nB,C;2;5\nD;3;6\n",
                dialect=semicolon,
                transpose=True,
            )

        # After appending an extra value to the last row, writing is
        # expected to raise ValueError.
        rows[2].append(8)
        with self.assertRaises(ValueError):
            self._write_test(rows, "")

        accented = [["Å", "B", "C"], [1, 2, 3], [4, 5, 6]]
        accented_csv = "Å,B,C\r\n1,2,3\r\n4,5,6\r\n"
        # An explicit encoding is passed because open() otherwise defaults
        # to locale.getpreferredencoding(), which could potentially fail on
        # Windows (see gh-27).
        for label, codec in (("encoding_1", "utf-8"), ("encoding_2", "cp1252")):
            with self.subTest(name=label):
                self._write_test(accented, accented_csv, encoding=codec)
Example #12
0
    def test_form_5(self):
        # Check is_form_5 on inputs where each whole row is (or is not)
        # wrapped in quotes.
        dquote = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

        for text in ('"A,B"\n"1,2"\n"3,4"', '"A,B"\n"1,"\n"2,3"'):
            self.assertTrue(is_form_5(text, dquote))

        rejected = (
            "A,B\n1,2\n3,4",
            "A,B\n1,\n2,3",
            '"A,""B"""\n"1,"\n"2,3"',
        )
        for text in rejected:
            self.assertFalse(is_form_5(text, dquote))
Example #13
0
 def test_pattern_score_3(self):
     # theta_3 from paper
     lines = (
         "7,5; Mon, Jan 12;6,40",
         "100; Fri, Mar 21;8,23",
         "8,2; Thu, Sep 17;2,71",
         "538,0;;7,26",
         '"NA"; Wed, Oct 4;6,93',
     )
     data = "\n".join(lines)
     dialect = SimpleDialect(delimiter=";", quotechar='"', escapechar="")
     score = detect_pattern.pattern_score(data, dialect)
     self.assertAlmostEqual(10 / 3, score)
Example #14
0
    def test_detect_base(self):
        # Run the detection wrapper on one table per dialect flavour, each
        # in its own subtest.
        cases = (
            ("simple", ["A", "B", "C"], (";", "", "")),
            ("escaped", ["A,0", "B", "C"], (",", "", "\\")),
            ("quoted", ["A,0", "B", "C"], (",", '"', "")),
            ("double", ['a"A,0"b', "B", "C"], (",", '"', "")),
        )
        for label, header, (delim, quote, escape) in cases:
            table = [header, [1, 2, 3], [4, 5, 6]]
            dialect = SimpleDialect(
                delimiter=delim, quotechar=quote, escapechar=escape
            )
            with self.subTest(name=label):
                self._detect_test_wrap(table, dialect)
Example #15
0
    def test_read_dataframe(self):
        # Run the dataframe helper on one table per dialect flavour, plus
        # two cases that pass num_char (and an encoding) through to it.
        cases = (
            ("simple", ["A", "B", "C"], (";", "", ""), {}),
            ("escaped", ["A,0", "B", "C"], (",", "", "\\"), {}),
            ("quoted", ["A,0", "B", "C"], (",", '"', ""), {}),
            ("double", ['a"A,0"b', "B", "C"], (",", '"', ""), {}),
            ("simple_nchar", ["A", "B", "C"], (";", "", ""), {"num_char": 10}),
            (
                "simple_encoding",
                ["Ä", "Ð", "Ç"],
                (";", "", ""),
                {"num_char": 10, "encoding": "latin1"},
            ),
        )
        for label, header, (delim, quote, escape), options in cases:
            table = [header, [1, 2, 3], [4, 5, 6]]
            dialect = SimpleDialect(
                delimiter=delim, quotechar=quote, escapechar=escape
            )
            with self.subTest(name=label):
                self._df_test(table, dialect, **options)
Example #16
0
    def test_form_1(self):
        # Check is_form_1 on fully quoted, comma-delimited inputs and on
        # several inputs that it is expected to reject.
        dquote = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

        accepted = (
            '"A","B","C"',
            '"A","B"\n"C","D"\n',
            '"A","","C"',
        )
        for text in accepted:
            self.assertTrue(is_form_1(text, dquote))

        rejected = (
            '"A","B"\n"A"',
            '"A"\n"B"',
            '"A"\n"A","B"',
            '"A",,"C"',
            '"A",C',
            '"A"\n"b""A""c","B"',
        )
        for text in rejected:
            self.assertFalse(is_form_1(text, dquote))
Example #17
0
 def test_type_score_1(self):
     # theta_1 from paper
     rows = [
         ["7", "5; Mon", " Jan 12;6", "40"],
         ["100; Fri", " Mar 21;8", "23"],
         ["8", "2; Thu", " Sep 17; 2", "71"],
         ["538", "0;;7", "26"],
         ['"NA"; Wed', " Oct 4;6", "93"],
     ]
     # Rebuild the raw text: cells joined by ",", rows joined by "\n".
     data = "\n".join(",".join(row) for row in rows)
     comma = SimpleDialect(delimiter=",", quotechar="", escapechar="")
     score = type_score(data, comma)
     self.assertAlmostEqual(8 / 17, score)
Example #18
0
 def test_type_score_3(self):
     # theta_3 from paper
     rows = [
         ["7,5", " Mon, Jan 12", "6,40"],
         ["100", " Fri, Mar 21", "8,23"],
         ["8,2", " Thu, Sep 17", "2,71"],
         ["538,0", "", "7,26"],
         ["N/A", " Wed, Oct 4", "6,93"],
     ]
     # Rebuild the raw text: cells joined by ";", rows joined by "\r".
     data = "\r".join(";".join(row) for row in rows)
     semicolon = SimpleDialect(delimiter=";", quotechar='"', escapechar="")
     score = type_score(data, semicolon)
     self.assertAlmostEqual(11 / 15, score)
Example #19
0
    def test_form_2(self):
        # Check is_form_2 on unquoted, comma-delimited inputs and on several
        # inputs that it is expected to reject.
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")

        self.assertTrue(is_form_2("1,2,3", dialect))
        self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect))
        # An e-mail-like cell. The literal here had been replaced by the
        # "[email protected]" placeholder that e-mail obfuscation leaves
        # behind, so a plain address is restored.
        # NOTE(review): confirm the exact address against the upstream test.
        self.assertTrue(is_form_2("a@b.com,3", dialect))
        self.assertTrue(is_form_2("a,,3\n1,2,3", dialect))

        self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect))
        self.assertFalse(is_form_2("1", dialect))
        self.assertFalse(is_form_2('1,"a"', dialect))
        self.assertFalse(is_form_2("a;b,3", dialect))
        self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect))
        self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect))
Example #20
0
    def test_standardize_1(self):
        # Standardize a semicolon-delimited file and compare the stripped
        # command output with the comma-delimited equivalent.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # The command runs inside try/finally so the temporary input file is
        # removed even if building or executing the command raises.
        try:
            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(tmpfname)

            exp = "A,B,C\n1,2,3\n4,5,6"
            output = tester.io.fetch_output().strip()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #21
0
    def test_detect_opts_2(self):
        # Run the "detect" command with "--num-chars 5" and compare its
        # stripped output with the dialect used to build the file.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # The command runs inside try/finally so the temporary input file is
        # removed even if building or executing the command raises.
        try:
            application = build_application()
            command = application.find("detect")
            tester = CommandTester(command)
            tester.execute(f"--num-chars 5 {tmpfname}")

            exp = "Detected: " + str(dialect)
            output = tester.io.fetch_output().strip()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #22
0
    def test_write(self):
        # Exercise the write helper with the default dialect, an explicit
        # dialect, transposition, and a ragged table.
        rows = [["A", "B,C", "D"], [1, 2, 3], [4, 5, 6]]
        with self.subTest(name="default"):
            self._write_test(rows, 'A,"B,C",D\n1,2,3\n4,5,6\n')

        semicolon = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        with self.subTest(name="dialect"):
            self._write_test(
                rows, "A;B,C;D\n1;2;3\n4;5;6\n", dialect=semicolon
            )

        with self.subTest(name="transposed"):
            self._write_test(
                rows,
                "A;1;4\nB,C;2;5\nD;3;6\n",
                dialect=semicolon,
                transpose=True,
            )

        # After appending an extra value to the last row, writing is
        # expected to raise ValueError.
        rows[2].append(8)
        with self.assertRaises(ValueError):
            self._write_test(rows, "")
Example #23
0
    def test_standardize_3(self):
        # Standardize a semicolon-delimited file with the "-t" option and
        # compare the stripped command output with the transposed table.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # No separate output file is created: the command writes to the
        # tester's output, which is what the assertion reads. The command
        # runs inside try/finally so the temporary input file is removed
        # even if building or executing the command raises.
        try:
            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(f"-t {tmpfname}")

            exp = "A,1,4\nB,2,5\nC,3,6"
            output = tester.io.fetch_output().strip()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #24
0
    def test_standardize_1(self):
        # Standardize a semicolon-delimited file and compare the raw command
        # output, including line terminators, with the expected text.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # The command runs inside try/finally so the temporary input file is
        # removed even if building or executing the command raises.
        try:
            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(tmpfname)

            # Excel format (i.e. RFC4180) *requires* CRLF
            crlf = "\r\n"
            exp = crlf.join(["A,B,C", "1,2,3", "4,5,6"])
            # add line terminator of last row
            exp += crlf

            output = tester.io.fetch_output()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #25
0
    def test_standardize_2(self):
        # Standardize a semicolon-delimited file into a separate output file
        # (the "-o" option) and compare that file's contents.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # Track the output path so the finally clause only removes it once
        # it has actually been created.
        tmpoutname = None
        # Creating the output file, running the command and reading the
        # result all happen inside try/finally so both temporary files are
        # removed even if any of those steps raises.
        try:
            tmpfd, tmpoutname = tempfile.mkstemp(suffix=".csv")
            os.close(tmpfd)

            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(f"-o {tmpoutname} {tmpfname}")

            exp = "A,B,C\n1,2,3\n4,5,6\n"
            with open(tmpoutname, "r") as fp:
                output = fp.read()

            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
            if tmpoutname is not None:
                os.unlink(tmpoutname)
Example #26
0
    def test_standardize_3(self):
        # Standardize a semicolon-delimited file with the "-t" option and
        # compare the raw command output, including line terminators, with
        # the transposed table.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # No separate output file is created: the command writes to the
        # tester's output, which is what the assertion reads. The command
        # runs inside try/finally so the temporary input file is removed
        # even if building or executing the command raises.
        try:
            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(f"-t {tmpfname}")

            # Excel format (i.e. RFC4180) *requires* CRLF
            crlf = "\r\n"
            exp = crlf.join(["A,1,4", "B,2,5", "C,3,6"])
            # add line terminator of last row
            exp += crlf

            output = tester.io.fetch_output()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #27
0
    def test_standardize_in_place(self):
        # Standardize a semicolon-delimited file in place (the "-i" option),
        # then check the command's return code and the rewritten file.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # Running the command and checking its return code happen inside
        # try/finally so the temporary file is removed even if the command
        # raises or the return-code assertion fails.
        try:
            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            retcode = tester.execute(f"-i {tmpfname}")

            self.assertEqual(retcode, 2)

            # Excel format (i.e. RFC4180) *requires* CRLF
            crlf = "\r\n"
            exp = crlf.join(["A,B,C", "1,2,3", "4,5,6"])
            # add line terminator of last row
            exp += crlf

            # newline="" keeps the line terminators as written in the file.
            with open(tmpfname, "r", newline="") as fp:
                contents = fp.read()
            self.assertEqual(exp, contents)
        finally:
            os.unlink(tmpfname)
Example #28
0
    def test_code_2(self):
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        application = build_application()
        command = application.find("code")
        tester = CommandTester(command)
        tester.execute(f"-p {tmpfname}")

        exp = f"""\

# Code generated with CleverCSV version {__version__}

import clevercsv

df = clevercsv.csv2df("{tmpfname}", delimiter=";", quotechar="", escapechar="")

"""
        try:
            output = tester.io.fetch_output()
            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
Example #29
0
    def test_standardize_2(self):
        # Standardize a semicolon-delimited file into a separate output file
        # (the "-o" option) and compare that file's raw contents, including
        # line terminators.
        table = [["A", "B", "C"], [1, 2, 3], [4, 5, 6]]
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        tmpfname = self._build_file(table, dialect)

        # Track the output path so the finally clause only removes it once
        # it has actually been created.
        tmpoutname = None
        # Creating the output file, running the command and reading the
        # result all happen inside try/finally so both temporary files are
        # removed even if any of those steps raises.
        try:
            tmpfd, tmpoutname = tempfile.mkstemp(suffix=".csv")
            os.close(tmpfd)

            application = build_application()
            command = application.find("standardize")
            tester = CommandTester(command)
            tester.execute(f"-o {tmpoutname} {tmpfname}")

            # Excel format (i.e. RFC4180) *requires* CRLF
            crlf = "\r\n"
            # The trailing empty item adds the terminator of the last row.
            exp = crlf.join(["A,B,C", "1,2,3", "4,5,6", ""])
            # newline="" keeps the line terminators as written in the file.
            with open(tmpoutname, "r", newline="") as fp:
                output = fp.read()

            self.assertEqual(exp, output)
        finally:
            os.unlink(tmpfname)
            if tmpoutname is not None:
                os.unlink(tmpoutname)
Example #30
0
 def test_write_simpledialect(self):
     # Write one row with "|" as the quote character; the cell containing
     # the delimiter is expected to come out wrapped in "|".
     pipe_quoted = SimpleDialect(delimiter=",", quotechar="|", escapechar="")
     self._write_test(["a", 1, "p,q"], "a,1,|p,q|", dialect=pipe_quoted)