def test_normal_excel(self, tmpdir):
    file_path = "/tmp/valid/test/data/validdata.xlsx"
    p_file_path = Path(str(tmpdir.join(file_path)))
    p_file_path.parent.makedirs_p()

    tabledata_list = [
        TableData(
            "testsheet1",
            ["a1", "b1", "c1"],
            [["aa1", "ab1", "ac1"], [1.0, 1.1, "a"], [2.0, 2.2, "bb"], [3.0, 3.3, 'cc"dd"']],
        ),
        TableData(
            "testsheet3",
            ["a3", "b3", "c3"],
            [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
        ),
    ]

    writer = ExcelXlsxTableWriter()
    writer.open(p_file_path)
    for tabledata in tabledata_list:
        writer.from_tabledata(tabledata)
        writer.write_table()
    writer.close()

    loader = ptr.TableFileLoader(p_file_path)

    assert loader.format_name == "excel"

    for tabledata in loader.load():
        print(dump_tabledata(tabledata))

        assert tabledata in tabledata_list
def test_normal_default_key(self):
    headers = ["a", "b"]

    assert TableData(None, headers, []).as_dict() == {"table": []}
    assert TableData("", headers, []).as_dict(default_key="dummy") == {"dummy": []}
def test_normal_multi_sheet(self, tmpdir):
    for writer_class in table_writer_class_list:
        test_filepath = str(tmpdir.join("test.xlsx"))
        data_list = [
            TableData(
                table_name="first",
                header_list=["ha1", "hb1", "hc1"],
                row_list=[[1.0, 2.0, 3.0], [11.0, 12.0, 13.0]],
            ),
            TableData(
                table_name="second",
                header_list=["ha2", "hb2", "hc2"],
                row_list=[[11.0, 12.0, 13.0], [1.0, 2.0, 3.0]],
            ),
        ]

        writer = writer_class()
        for data in data_list:
            writer.from_tabledata(data)
            writer.dump(test_filepath, close_after_write=False)
        writer.close()

        for data, expected in zip(data_list, ExcelTableFileLoader(test_filepath).load()):
            assert data == expected
def test_normal_add_primary_key_column(self, tmpdir):
    p = tmpdir.join("tmp.db")
    con = SimpleSQLite(str(p), "w")

    table_name = "table1"
    con.create_table_from_data_matrix(
        table_name=table_name,
        attr_names=["AA", "BB"],
        data_matrix=[["a", 11], ["bb", 12]],
        add_primary_key_column=True,
    )
    assert con.select_as_tabledata(table_name) == TableData(
        table_name=table_name, headers=["id", "AA", "BB"], rows=[[1, "a", 11], [2, "bb", 12]]
    )
    assert con.schema_extractor.fetch_table_schema(table_name).primary_key == "id"

    table_name = "table2"
    con.create_table_from_data_matrix(
        table_name=table_name,
        attr_names=["AA", "BB"],
        data_matrix=[["a", 11], ["bb", 12]],
        primary_key="pkey",
        add_primary_key_column=True,
    )
    assert con.select_as_tabledata(table_name) == TableData(
        table_name=table_name, headers=["pkey", "AA", "BB"], rows=[[1, "a", 11], [2, "bb", 12]]
    )
    assert con.schema_extractor.fetch_table_schema(table_name).primary_key == "pkey"
def test_normal_multi_table(self, tmpdir):
    test_filepath = str(tmpdir.join("test.sqlite"))
    data_list = [
        TableData(
            table_name="first",
            header_list=["ha1", "hb1", "hc1"],
            row_list=[[1.0, 2.0, 3.0], [11.0, 12.0, 13.0]],
        ),
        TableData(
            table_name="second",
            header_list=["ha2", "hb2", "hc2"],
            row_list=[[11.0, 12.0, 13.0], [1.0, 2.0, 3.0]],
        ),
    ]

    writer = ptw.SqliteTableWriter()
    for data in data_list:
        writer.from_tabledata(data)
        writer.dump(test_filepath, close_after_write=False)
    writer.close()

    count = 0
    for data, expected in zip(data_list, SqliteFileLoader(test_filepath).load()):
        assert data == expected
        count += 1

    assert count == 2
def test_normal_excel(self):
    url = "https://github.com/thombashi/valid/test/data/validdata.xlsx"
    data_path = os.path.join(os.path.dirname(__file__), "data/validdata.xlsx")
    with open(data_path, "rb") as f:
        responses.add(
            responses.GET,
            url,
            body=f.read(),
            content_type="application/octet-stream",
            status=200,
        )

    expected_list = [
        TableData(
            "testsheet1",
            ["a1", "b1", "c1"],
            [["aa1", "ab1", "ac1"], [1.0, 1.1, "a"], [2.0, 2.2, "bb"], [3.0, 3.3, "cc"]],
        ),
        TableData(
            "testsheet3",
            ["a3", "b3", "c3"],
            [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
        ),
    ]

    loader = ptr.TableUrlLoader(url)

    assert loader.format_name == "excel"

    for table_data in loader.load():
        assert table_data.in_tabledata_list(expected_list)
class Test_TableData_equals:
    __LHS = TableData("tablename", ["a", "b"], [{"a": 1, "b": 2}, {"a": 11, "b": 12}])
    __RHS = TableData("tablename", ["a", "b"], [[1, 2], [11, 12]])

    @pytest.mark.parametrize(
        ["lhs", "rhs", "cmp_by_dp", "expected"],
        [[__LHS, __RHS, True, True], [__LHS, __RHS, False, False]],
    )
    def test_normal(self, lhs, rhs, cmp_by_dp, expected):
        empty_td = TableData("tablename", ["a", "b"], None)

        assert lhs.equals(rhs, cmp_by_dp=cmp_by_dp) == expected
        assert lhs.equals(empty_td, cmp_by_dp=cmp_by_dp) is False
        assert empty_td.equals(rhs, cmp_by_dp=cmp_by_dp) is False
        assert (lhs == rhs) is False
        assert (lhs != rhs) is True

        assert lhs.in_tabledata_list([rhs, empty_td], cmp_by_dp=cmp_by_dp) == expected
        assert lhs.in_tabledata_list([lhs, empty_td], cmp_by_dp=cmp_by_dp)
        assert lhs.in_tabledata_list([rhs, lhs, empty_td], cmp_by_dp=cmp_by_dp)
        assert empty_td.in_tabledata_list([rhs, lhs], cmp_by_dp=cmp_by_dp) is False
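# Illustrative standalone sketch (not part of the original suite; the tabledata
# package is assumed): plain equality compares raw row values, so dict rows and
# list rows differ even when their cell values match, while
# equals(cmp_by_dp=True) compares through DataProperty and treats them as equal.
from tabledata import TableData

_dict_rows = TableData("t", ["a", "b"], [{"a": 1, "b": 2}])
_list_rows = TableData("t", ["a", "b"], [[1, 2]])

assert _dict_rows != _list_rows
assert _dict_rows.equals(_list_rows, cmp_by_dp=True)
assert not _dict_rows.equals(_list_rows, cmp_by_dp=False)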
def test_normal_multi_table(self, con_a0, con_b0):
    out_db_path = "test.sqlite"
    runner = CliRunner()

    with runner.isolated_filesystem():
        result = runner.invoke(
            cmd, ["-o", out_db_path, "file", con_a0.database_path, con_b0.database_path]
        )
        print_traceback(result)

        assert result.exit_code == ExitCode.SUCCESS

        expected_list = [
            TableData(TEST_TABLE_NAME_A, ["attr_a", "attr_b"], [[1, 2], [3, 4]]),
            TableData(TEST_TABLE_NAME_B, ["ba", "bb"], [[101, 102], [103, 104]]),
        ]

        for tabledata in SqliteFileLoader(out_db_path).load():
            if tabledata.table_name == SourceInfo.get_table_name():
                continue

            print("[actual]\n{}".format(tabledata))
            for record in tabledata.value_matrix:
                print("  {}".format(record))

            assert tabledata in expected_list
class Test_TableData_transpose:
    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            [
                TableData("tablename", ["a", "b"], [[1, 2, 3], [1, 2, 3]]),
                TableData("tablename", ["a", "b"], [[1, 1], [2, 2], [3, 3]]),
            ]
        ],
    )
    def test_normal(self, value, expected):
        assert value.transpose() == expected
class Test_SQLiteTableDataSanitizer_dup_col_handler:
    @pytest.mark.parametrize(
        ["table_name", "headers", "dup_col_handler", "expected"],
        [
            [
                "all attrs are duplicated",
                ["A", "A", "A", "A", "A"],
                "rename",
                TableData("all_attrs_are_duplicated", ["A", "A_1", "A_2", "A_3", "A_4"], []),
            ],
            [
                "recursively duplicated attrs",
                ["A", "A", "A_1", "A_1", "A_2", "A_1_1", "A_1_1"],
                "rename",
                TableData(
                    "recursively_duplicated_attrs",
                    ["A", "A_3", "A_1", "A_1_2", "A_2", "A_1_1", "A_1_1_1"],
                    [],
                ),
            ],
        ],
    )
    def test_normal_(self, table_name, headers, dup_col_handler, expected):
        new_tabledata = SQLiteTableDataSanitizer(
            TableData(table_name, headers, []), dup_col_handler=dup_col_handler
        ).normalize()

        try:
            from pytablewriter import dumps_tabledata

            print_test_result(
                expected=dumps_tabledata(expected), actual=dumps_tabledata(new_tabledata)
            )
        except ImportError:
            pass

        assert new_tabledata.equals(expected)

    @pytest.mark.parametrize(
        ["table_name", "headers", "expected"],
        [
            ["duplicate columns", ["a", "a"], ValueError],
            ["duplicate columns", ["AA", "b", "AA"], ValueError],
        ],
    )
    def test_exception(self, table_name, headers, expected):
        with pytest.raises(expected):
            SQLiteTableDataSanitizer(
                TableData(table_name, headers, []), dup_col_handler="error"
            ).normalize()
def to_table_data(self):
    if typepy.is_empty_sequence(self._loader.header_list):
        header_list = self._source_data[0]

        if any([typepy.is_null_string(header) for header in header_list]):
            raise InvalidDataError(
                "the first line includes an empty string item. "
                "all of the items should contain a header name. "
                "actual={}".format(header_list)
            )

        data_matrix = self._source_data[1:]
    else:
        header_list = self._loader.header_list
        data_matrix = self._source_data

    if not data_matrix:
        raise InvalidDataError("the data source must contain at least one data row")

    self._loader.inc_table_count()

    yield TableData(
        self._loader.make_table_name(),
        header_list,
        data_matrix,
        quoting_flags=self._loader.quoting_flags,
    )
def normalize_table(self, table_data: TableData, dup_col_handler=None) -> TableData:
    from pathvalidate import replace_symbol, replace_unprintable_char
    from simplesqlite import SQLiteTableDataSanitizer

    if dup_col_handler is None:
        dup_col_handler = DEFAULT_DUP_COL_HANDLER

    normalized_table_data = SQLiteTableDataSanitizer(
        table_data,
        dup_col_handler=dup_col_handler,
        is_type_inference=self._is_type_inference,
        max_workers=self._max_workers,
    ).normalize()

    if self._symbol_replace_value is None:
        return normalized_table_data

    return TableData(
        normalized_table_data.table_name,
        [
            replace_symbol(
                replace_unprintable_char(header),
                self._symbol_replace_value,
                is_replace_consecutive_chars=True,
                is_strip=True,
            )
            for header in normalized_table_data.headers
        ],
        normalized_table_data.rows,
        dp_extractor=normalized_table_data.dp_extractor,
        type_hints=table_data.dp_extractor.column_type_hints,
    )
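# Hedged standalone sketch of the header cleanup used above (the pathvalidate
# package is assumed; the exact replacement behavior depends on its version):
# symbols and unprintable characters in a header collapse to a single
# replacement value, and leading/trailing replacements are stripped.
from pathvalidate import replace_symbol, replace_unprintable_char

header = "price ($)"
cleaned = replace_symbol(
    replace_unprintable_char(header),
    "_",
    is_replace_consecutive_chars=True,
    is_strip=True,
)
print(cleaned)  # expected something like "price"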
def test_normal_single_tabledata(self, capsys):
    writer = table_writer_class()
    writer.from_tabledata(
        TableData(
            "loader_mapping",
            ["Name", "Loader"],
            [
                ["csv", "CsvTableFileLoader"],
                ["excel", "ExcelTableFileLoader"],
                ["html", "HtmlTableFileLoader"],
                ["markdown", "MarkdownTableFileLoader"],
                ["mediawiki", "MediaWikiTableFileLoader"],
                ["json", "JsonTableFileLoader"],
                ["Long Format Name", "Loader"],
            ],
        )
    )
    writer.write_table()

    expected = dedent("""\
        # loader_mapping
        |      Name      |         Loader         |
        |----------------|------------------------|
        |csv             |CsvTableFileLoader      |
        |excel           |ExcelTableFileLoader    |
        |html            |HtmlTableFileLoader     |
        |markdown        |MarkdownTableFileLoader |
        |mediawiki       |MediaWikiTableFileLoader|
        |json            |JsonTableFileLoader     |
        |Long Format Name|Loader                  |
        """)

    out, err = capsys.readouterr()
    print_test_result(expected=expected, actual=out, error=err)

    assert out == expected
def test_normal_json(self):
    text = dedent(
        """\
        [
            {"attr_a": 1},
            {"attr_b": 2.1, "attr_c": "bb"}
        ]"""
    )
    expected_list = [
        TableData(
            "json1",
            ["attr_a", "attr_b", "attr_c"],
            [{"attr_a": 1}, {"attr_b": 2.1, "attr_c": "bb"}],
        )
    ]

    loader = ptr.TableTextLoader(text, format_name="json")

    assert loader.format_name == "json"

    for table_data, expected in zip(loader.load(), expected_list):
        print(dumps_tabledata(expected))
        print(dumps_tabledata(table_data))

        assert table_data.equals(expected)
def test_normal_csv(self):
    text = dedent(
        """\
        "attr_a","attr_b","attr_c"
        1,4,"a"
        2,2.1,"bb"
        3,120.9,"ccc"
        """
    )
    expected_list = [
        TableData(
            "csv1",
            ["attr_a", "attr_b", "attr_c"],
            [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
        )
    ]

    loader = ptr.TableTextLoader(text, format_name="csv")

    assert loader.format_name == "csv"

    for tabledata, expected in zip(loader.load(), expected_list):
        print(dumps_tabledata(expected))
        print(dumps_tabledata(tabledata))

        assert tabledata.equals(expected)
def test_normal(self, table_name, headers, rows, expected):
    for lhs, rhs in zip(TableData(table_name, headers, rows).as_tuple(), expected):
        print(f"lhs: {lhs}", file=sys.stderr)
        print(f"rhs: {rhs}", file=sys.stderr)

        assert tuple(lhs) == rhs
def to_table_data(self):
    if typepy.is_empty_sequence(self._loader.headers):
        headers = self._source_data[0]

        if any([typepy.is_null_string(header) for header in headers]):
            raise DataError(
                "the first line includes an empty string item. "
                "all of the items should contain a header name. "
                "actual={}".format(headers)
            )

        data_matrix = self._source_data[1:]
    else:
        headers = self._loader.headers
        data_matrix = self._source_data

    if not data_matrix:
        raise DataError("the data source must contain at least one data row")

    self._loader.inc_table_count()

    yield TableData(
        self._loader.make_table_name(),
        headers,
        data_matrix,
        dp_extractor=self._loader.dp_extractor,
        type_hints=self._extract_type_hints(headers),
    )
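# Illustrative sketch of the header-promotion rule implemented above, on plain
# data with no loader involved: when no headers are supplied, the first row of
# the source becomes the header row and the remaining rows form the data matrix.
source_data = [["a", "b"], [1, 2], [3, 4]]

headers, data_matrix = source_data[0], source_data[1:]
assert headers == ["a", "b"]
assert data_matrix == [[1, 2], [3, 4]]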
def test_normal_csv(self, tmpdir, file_path, format_name):
    filename = pv.replace_symbol(file_path, "")
    p_file_path = Path(six.text_type(tmpdir.join(filename + Path(file_path).ext)))
    p_file_path.parent.makedirs_p()
    with open(p_file_path, "w") as f:
        f.write(
            dedent(
                """\
                "attr_a","attr_b","attr_c"
                1,4,"a"
                2,2.1,"bb"
                3,120.9,"ccc"
                """
            )
        )

    expected_list = [
        TableData(
            filename,
            ["attr_a", "attr_b", "attr_c"],
            [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
        )
    ]

    loader = ptr.TableFileLoader(p_file_path, format_name=format_name)

    assert loader.format_name == "csv"

    for tabledata, expected in zip(loader.load(), expected_list):
        print(dump_tabledata(expected))
        print(dump_tabledata(tabledata))

        assert tabledata.equals(expected)
def test_normal_json(self, tmpdir, file_path, format_name):
    p_file_path = Path(str(tmpdir.join(file_path)))
    p_file_path.parent.makedirs_p()
    with open(p_file_path, "w") as f:
        f.write(
            dedent(
                """\
                [
                    {"attr_a": 1},
                    {"attr_b": 2.1, "attr_c": "bb"}
                ]"""
            )
        )

    expected_list = [
        TableData(
            "validdata",
            ["attr_a", "attr_b", "attr_c"],
            [{"attr_a": 1}, {"attr_b": 2.1, "attr_c": "bb"}],
        )
    ]

    loader = ptr.TableFileLoader(p_file_path, format_name=format_name)

    assert loader.format_name == "json"

    for table_data, expected in zip(loader.load(), expected_list):
        assert table_data.equals(expected)
def test_normal_single_sheet(self, tmpdir):
    for writer_class in table_writer_class_list:
        test_filepath = str(tmpdir.join("test.xlsx"))
        data = TableData(
            table_name="tablename",
            header_list=["ha", "hb", "hc"],
            row_list=[
                [1.0, 2.0, 3.0],
                [11.0, 12.0, 13.0],
                [1.0, 2.0, 3.0],
                [11.0, 12.0, 13.0],
                [101.0, 102.0, 103.0],
                [1001.0, 1002.0, 1003.0],
            ],
        )

        writer = writer_class()
        writer.from_tabledata(data)
        writer.dump(test_filepath)

        assert writer.first_data_row == 1
        assert writer.last_data_row == 7

        for expected in ExcelTableFileLoader(test_filepath).load():
            assert data == expected
def test_normal_single_tabledata(self, capsys):
    writer = table_writer_class()
    writer.from_tabledata(
        TableData(
            table_name="loader_mapping",
            header_list=['Name', 'Loader'],
            record_list=[
                ['csv', 'CsvTableFileLoader'],
                ['excel', 'ExcelTableFileLoader'],
                ['html', 'HtmlTableFileLoader'],
                ['markdown', 'MarkdownTableFileLoader'],
                ['mediawiki', 'MediaWikiTableFileLoader'],
                ['json', 'JsonTableFileLoader'],
                ['Long Format Name', 'Loader'],
            ],
        )
    )
    writer.write_table()

    expected = dedent("""\
        # loader_mapping
        |      Name      |         Loader         |
        |----------------|------------------------|
        |csv             |CsvTableFileLoader      |
        |excel           |ExcelTableFileLoader    |
        |html            |HtmlTableFileLoader     |
        |markdown        |MarkdownTableFileLoader |
        |mediawiki       |MediaWikiTableFileLoader|
        |json            |JsonTableFileLoader     |
        |Long Format Name|Loader                  |
        """)

    out, _err = capsys.readouterr()
    print_test_result(expected=expected, actual=out)

    assert out == expected
def generate(self, provider_list, rows, table_name=None, headers=None):
    """Generate fake data as tabular data.

    Args:
        provider_list (list):
            List of provider names used to generate the tabular data.
        rows (int):
            Number of rows in the tabular data.
        table_name (str):
            Name of the generated table.
        headers (list):
            List of header names.

    Returns:
        tabledata.TableData: Generated fake tabular data.
    """

    self.__validate_provider(provider_list)

    if rows < 0:
        raise ValueError("invalid rows")

    return TableData(
        table_name,
        headers if headers else provider_list,
        [
            [getattr(self.__fake, faker_name)() for faker_name in provider_list]
            for _row in range(rows)
        ],
    )
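# Hedged usage sketch of the same idea without the surrounding class, calling
# the Faker package directly ("name" and "address" are standard Faker
# providers; the class above and its internals are not assumed here):
from faker import Faker
from tabledata import TableData

fake = Faker()
providers = ["name", "address"]
table = TableData(
    "fake_table",
    providers,
    [[getattr(fake, provider)() for provider in providers] for _ in range(3)],
)
print(table)  # a 3-row TableData filled with fake names and addresses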
def __parse_html(self, table):
    header_list = []
    data_matrix = []

    self.__parse_tag_id(table)

    row_list = table.find_all("tr")
    re_table_val = re.compile("td|th")
    for row in row_list:
        td_list = row.find_all("td")
        if typepy.is_empty_sequence(td_list):
            if typepy.is_not_empty_sequence(header_list):
                continue

            th_list = row.find_all("th")
            if typepy.is_empty_sequence(th_list):
                continue

            header_list = [th.text.strip() for th in th_list]
            continue

        data_matrix.append([value.get_text().strip() for value in row.find_all(re_table_val)])

    if typepy.is_empty_sequence(data_matrix):
        raise ValueError("data matrix is empty")

    self._loader.inc_table_count()

    return TableData(
        self._make_table_name(),
        header_list,
        data_matrix,
        dp_extractor=self._loader.dp_extractor,
    )
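# Standalone sketch of the th/td extraction performed above, using
# BeautifulSoup directly (the bs4 package is assumed installed):
from bs4 import BeautifulSoup

html = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
table = BeautifulSoup(html, "html.parser").find("table")

headers = [th.text.strip() for th in table.find_all("th")]
rows = [
    [td.get_text().strip() for td in tr.find_all("td")]
    for tr in table.find_all("tr")
    if tr.find_all("td")  # skip header-only rows, as the parser above does
]

assert headers == ["a", "b"]
assert rows == [["1", "2"]]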
def test_normal(self):
    writer_rhs = table_writer_class()
    writer_rhs.from_tabledata(
        TableData(
            "loader_mapping",
            ["Name", "Loader"],
            [
                ["csv", "CsvTableFileLoader"],
                ["excel", "ExcelTableFileLoader"],
                ["html", "HtmlTableFileLoader"],
                ["markdown", "MarkdownTableFileLoader"],
                ["mediawiki", "MediaWikiTableFileLoader"],
                ["json", "JsonTableFileLoader"],
                ["Long Format Name", "Loader"],
            ],
        )
    )
    rhs = writer_rhs.dumps()

    writer_lhs = table_writer_class()
    writer_lhs.from_writer(writer_rhs)
    lhs = writer_lhs.dumps()

    print_test_result(expected=lhs, actual=rhs)

    assert lhs == rhs
def test_normal_margin_2(self, capsys):
    writer = table_writer_class()
    writer.from_tabledata(
        TableData(table_name="margin 2", header_list=header_list, row_list=value_matrix)
    )
    writer.margin = 2
    writer.write_table()

    expected = dedent("""\
        .. table:: margin 2

            +-----+---------+-------+-------+--------+
            |  a  |    b    |   c   |  dd   |   e    |
            +=====+=========+=======+=======+========+
            |  1  |  123.1  |  a    |  1.0  |  1     |
            +-----+---------+-------+-------+--------+
            |  2  |    2.2  |  bb   |  2.2  |  2.2   |
            +-----+---------+-------+-------+--------+
            |  3  |    3.3  |  ccc  |  3.0  |  cccc  |
            +-----+---------+-------+-------+--------+
        """)

    out, err = capsys.readouterr()
    print_test_result(expected=expected, actual=out, error=err)

    assert out == expected
def test_normal_csv(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body=dedent("""\
            "attr_a","attr_b","attr_c"
            1,4,"a"
            2,2.1,"bb"
            3,120.9,"ccc"
            """),
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        TableData(
            "csv1",
            ["attr_a", "attr_b", "attr_c"],
            [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
        )
    ]

    loader = ptr.TableUrlLoader(url, format_name)

    assert loader.format_name == "csv"

    for table_data in loader.load():
        assert table_data.in_tabledata_list(expected_list)
def tabledata(self):
    """
    :return: Table data.
    :rtype: tabledata.TableData
    """

    return TableData(self.table_name, self.header_list, self.value_matrix)
def test_normal_style_thousand_separator(self, capsys):
    writer = table_writer_class()
    writer.from_tabledata(
        TableData(
            "",
            ["none_format", "thousand_separator_i", "thousand_separator_f", "f", "wo_f"],
            [
                [1000, 1234567, 1234567.8, 1234.5678, 1234567.8],
                [1000, 1234567, 1234567.8, 1234.5678, 1234567.8],
            ],
        )
    )

    writer.column_styles = [
        Style(thousand_separator=ThousandSeparator.NONE),
        Style(thousand_separator=ThousandSeparator.COMMA),
        Style(thousand_separator=ThousandSeparator.UNDERSCORE),
        Style(thousand_separator=ThousandSeparator.SPACE),
    ]

    out = writer.dumps()
    expected = dedent("""\
        |none_format|thousand_separator_i|thousand_separator_f|   f   |  wo_f   |
        |----------:|-------------------:|-------------------:|------:|--------:|
        |       1000|           1,234,567|         1_234_567.8|1 234.6|1234567.8|
        |       1000|           1,234,567|         1_234_567.8|1 234.6|1234567.8|
        """)

    print_test_result(expected=expected, actual=out)

    assert out == expected
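# Hedged sketch of positional column styles (a recent pytablewriter is
# assumed): styles map to columns by index, so the fifth column in the test
# above ("wo_f") has no matching entry and keeps the default formatting.
from pytablewriter import MarkdownTableWriter
from pytablewriter.style import Style, ThousandSeparator

writer = MarkdownTableWriter()
writer.headers = ["plain", "comma"]
writer.value_matrix = [[1234567, 1234567]]
writer.column_styles = [
    Style(thousand_separator=ThousandSeparator.NONE),
    Style(thousand_separator=ThousandSeparator.COMMA),
]
print(writer.dumps())  # the second column should render as 1,234,567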
def test_normal_escape_html_tag_from_tabledata(self, capsys):
    writer = table_writer_class()
    writer.from_tabledata(
        TableData(
            table_name="",
            header_list=["no", "text"],
            row_list=[[1, "<caption>Table 'formatting for Jupyter Notebook.</caption>"]],
        )
    )
    writer.is_escape_html_tag = True

    writer.write_table()
    expected = dedent("""\
        |no |                                    text                                    |
        |--:|---------------------------------------------------------------------------|
        |  1|&lt;caption&gt;Table &#x27;formatting for Jupyter Notebook.&lt;/caption&gt;|
        """)

    out, err = capsys.readouterr()
    print_test_result(expected=expected, actual=out, error=err)

    assert out == expected
class Test_SimpleSQLite_select_as_tabledata:
    @pytest.mark.parametrize(
        ["value", "type_hints", "expected"],
        [
            [
                TableData(
                    "typehints",
                    ["aa", "ab", "ac"],
                    [[1, 4, "10"], [2, 2.1, "11"], [3, 120.9, "12"]],
                ),
                {"aa": typepy.String, "cc": typepy.Integer},
                [["1", 4, 10], ["2", Decimal("2.1"), 11], ["3", Decimal("120.9"), 12]],
            ]
        ],
    )
    def test_normal(self, tmpdir, value, type_hints, expected):
        p_db = tmpdir.join("tmp.db")
        con = SimpleSQLite(str(p_db), "w")
        con.create_table_from_tabledata(value)

        assert con.fetch_table_names() == [value.table_name]
        assert con.fetch_attr_names(value.table_name) == value.headers

        actual = con.select_as_tabledata(
            columns=value.headers, table_name=value.table_name, type_hints=type_hints
        )

        assert actual.value_matrix == expected