def test_save_thor_file_works_when_chunksize_is_zero(self):
    """Check that ``chunk_size=0`` makes ``save_thor_file`` raise.

    Runs ECL that outputs a two-row ``INTEGER`` dataset to a logical
    file, then expects ``ZeroDivisionError`` when that file is saved
    with a chunk size of zero.
    """
    file_name = "test_save_thor_file_works_when_chunksize_is_zero"
    ecl = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    with self.assertRaises(ZeroDivisionError):
        save_thor_file(
            connection=self.conn,
            thor_file=file_name,
            chunk_size=0,
        )
def test_save_thor_file_uses_max_sleep(self, mock):
    """Check that a custom ``max_sleep`` is passed on to the mocked call.

    ``mock`` is presumably injected by a patch decorator that is not
    visible in this chunk; it is configured to return a one-row frame.
    After saving with ``max_sleep=120`` the most recent call to ``mock``
    must have been ``(file_name, 0, 2, 3, 120)``.
    """
    file_name = "test_save_thor_file_uses_max_sleep"
    stub_frame = pd.DataFrame({'int': ['1'], '__fileposition__': ['0']})
    mock.return_value = stub_frame

    ecl = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, False, None)

    save_thor_file(self.conn, file_name, max_sleep=120)

    mock.assert_called_with(file_name, 0, 2, 3, 120)
def test_save_thor_file_uses_custom_max_workers(self, mock):
    """Check that ``max_workers`` is passed on as a keyword argument.

    ``mock`` is presumably injected by a patch decorator that is not
    visible in this chunk. Its return value is set to
    ``ThreadPoolExecutor(max_workers=15)``; after saving with
    ``max_workers=2`` the most recent call to ``mock`` must have been
    ``mock(max_workers=2)``.
    """
    file_name = "test_save_thor_file_uses_custom_max_workers"
    mock.return_value = ThreadPoolExecutor(max_workers=15)

    ecl = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    save_thor_file(self.conn, file_name, max_workers=2)

    mock.assert_called_with(max_workers=2)
def test_save_thor_file_chunks_when_num_rows_equal_to_chunksize(self, mock):
    """Check the mocked call when the row count equals ``chunk_size``.

    ``mock`` is presumably injected by a patch decorator that is not
    visible in this chunk. A two-row file is saved with ``chunk_size=2``
    and the most recent call to ``mock`` must have been
    ``(file_name, 0, 2, 3, 60)``.
    """
    file_name = "test_save_thor_file_chunks_when_num_rows_equal_to_chunksize"
    stub_frame = pd.DataFrame({'int': ['1'], '__fileposition__': ['0']})
    mock.return_value = stub_frame

    ecl = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    save_thor_file(connection=self.conn, thor_file=file_name, chunk_size=2)

    mock.assert_called_with(file_name, 0, 2, 3, 60)
def test_save_thor_file_parses_set_types_correctly(self):
    """Check ``SET OF <type>`` columns for a range of ECL types.

    Each case is ``(ecl_type, field_name, ecl_literal[, expected])``.
    When the optional fourth item is absent, the literal itself is the
    expected parsed value. For every case a one-row file with a
    single-element set is written and the result of ``save_thor_file``
    is compared with the CSV of the expected one-row frame.
    """
    int_lit = 1
    dec_lit = 1.5
    unicode_lit = "U'ABC'"
    string_lit = "'ABC'"
    bool_lit = "TRUE"
    data_lit = "x'ABC'"
    parsed_str = "ABC"
    rounded_dec = float(round(dec_lit))

    cases = [
        ("INTEGER", "int", int_lit),
        ("INTEGER1", "int1", int_lit),
        ("UNSIGNED INTEGER", "unsigned_int", int_lit),
        ("UNSIGNED INTEGER1", "unsigned_int_1", int_lit),
        ("UNSIGNED8", "is_unsigned_8", int_lit),
        ("UNSIGNED", "usigned", int_lit),
        ("DECIMAL10", "dec10", dec_lit, rounded_dec),
        ("DECIMAL5_3", "dec5_3", dec_lit),
        ("UNSIGNED DECIMAL10", "unsigned_dec10", dec_lit, rounded_dec),
        ("UNSIGNED DECIMAL5_3", "unsigned_decl5_3", dec_lit),
        ("UDECIMAL10", "udec10", dec_lit, rounded_dec),
        ("UDECIMAL5_3", "udec5_3", dec_lit),
        ("REAL", "is_real", dec_lit),
        ("REAL4", "is_real4", dec_lit),
        ("UNICODE", "ucode", unicode_lit, parsed_str),
        ("UNICODE_de", "ucode_de", unicode_lit, parsed_str),
        ("UNICODE3", "ucode4", unicode_lit, parsed_str),
        ("UNICODE_de3", "ucode_de4", unicode_lit, parsed_str),
        ("UTF8", "is_utf8", unicode_lit, parsed_str),
        ("UTF8_de", "is_utf8_de", unicode_lit, parsed_str),
        ("STRING", "str", string_lit, parsed_str),
        ("STRING3", "str1", string_lit, parsed_str),
        ("ASCII STRING", "ascii_str", string_lit, parsed_str),
        ("ASCII STRING3", "ascii_str1", string_lit, parsed_str),
        ("EBCDIC STRING", "ebcdic_str", string_lit, parsed_str),
        ("EBCDIC STRING3", "ebcdic_str1", string_lit, parsed_str),
        ("BOOLEAN", "bool", bool_lit, True),
        ("DATA", "is_data", data_lit, "0ABC"),
        ("DATA3", "is_data_16", data_lit, "0ABC00"),
        ("VARUNICODE", "varucode", unicode_lit, parsed_str),
        ("VARUNICODE_de", "varucode_de", unicode_lit, parsed_str),
        ("VARUNICODE3", "varucode4", unicode_lit, parsed_str),
        ("VARUNICODE_de3", "varucode_de4", unicode_lit, parsed_str),
        ("VARSTRING", "varstr", unicode_lit, parsed_str),
        ("VARSTRING3", "varstr3", unicode_lit, parsed_str),
        ("QSTRING", "qstr", string_lit, parsed_str),
        ("QSTRING3", "qstr8", string_lit, parsed_str),
    ]

    for ecl_type, field, literal, *override in cases:
        file_name = ("test_save_thor_file_parses_set_types_"
                     "correctly_{}").format(field)
        ecl = ("a := DATASET([{{[{}]}}], {{SET OF {} {};}}); "
               "OUTPUT(a,,'~{}');").format(literal, ecl_type, field, file_name)
        self.conn.run_ecl_string(ecl, True, False, None)

        # The optional fourth tuple item overrides the expected value.
        expected_val = override[0] if override else literal

        actual = save_thor_file(connection=self.conn, thor_file=file_name,
                                dtype=None)
        expected = pd.DataFrame(
            {field: [[expected_val]], "__fileposition__": 0},
            index=[0])
        self.assertEqual(expected.to_csv(), actual)
def test_save_thor_file_uses_default_chunk_size(self, mock):
    """Check the chunking used when ``chunk_size`` is not passed.

    ``mock`` is presumably injected by a patch decorator that is not
    visible in this chunk. A 300000-row file is saved with
    ``max_workers=2`` and the recorded calls must be exactly two chunks
    of 150000 rows, starting at rows 0 and 150000.
    """
    file_name = "test_save_thor_file_uses_default_chunk_size"
    mock.return_value = pd.DataFrame(
        {'int': ['1'], '__fileposition__': ['0']})

    rows = ",".join(["{1}"] * 300000)
    ecl = (
        "a := DATASET([{}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(rows, file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    save_thor_file(connection=self.conn, thor_file=file_name, max_workers=2)

    first_chunk = unittest.mock.call(file_name, 0, 150000, 3, 60)
    second_chunk = unittest.mock.call(file_name, 150000, 150000, 3, 60)
    self.assertEqual([first_chunk, second_chunk], mock.call_args_list)
def test_save_thor_file_returns_a_set(self):
    """Check that a ``SET OF INTEGER`` column comes back as ``[1, 2, 3]``.

    The result of ``save_thor_file`` is compared with the CSV of a
    one-row frame whose ``set`` cell holds the list ``[1, 2, 3]``.
    """
    file_name = "test_save_thor_file_returns_a_set"
    ecl_template = ("a := DATASET([{{[1, 2, 3]}}], {{SET OF INTEGER set;}}); "
                    "OUTPUT(a,,'~{}');")
    self.conn.run_ecl_string(ecl_template.format(file_name), True, True, None)

    actual = save_thor_file(self.conn, file_name)

    expected_frame = pd.DataFrame(
        {"set": [[1, 2, 3]], "__fileposition__": 0})
    self.assertEqual(expected_frame.to_csv(), actual)
def test_save_thor_file_chunks_when_num_rows_greater_than_chunksize(
        self, mock):
    """Check the mocked calls when there are more rows than ``chunk_size``.

    ``mock`` is presumably injected by a patch decorator that is not
    visible in this chunk. A two-row file is saved with ``chunk_size=1``
    and the recorded calls must be exactly two one-row chunks, starting
    at rows 0 and 1.
    """
    file_name = (
        "test_save_thor_file_chunks_when_num_rows_greater_than_chunksize")
    mock.return_value = pd.DataFrame(
        {'int': ['1'], '__fileposition__': ['0']})

    ecl = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    save_thor_file(connection=self.conn, thor_file=file_name, chunk_size=1)

    expected_calls = [
        unittest.mock.call(file_name, start, 1, 3, 60)
        for start in (0, 1)
    ]
    self.assertEqual(expected_calls, mock.call_args_list)
def test_save_thor_file_returns_empty_dataset(self):
    """Check the result for a file that contains no rows.

    The expected value is the CSV of an empty frame with the columns
    ``int`` and ``__fileposition__``.
    """
    ecl = (
        "a := DATASET([], {INTEGER int;}); "
        "OUTPUT(a,,'~test_save_thor_file_returns_empty_dataset');"
    )
    self.conn.run_ecl_string(ecl, True, True, None)

    actual = save_thor_file(
        connection=self.conn,
        thor_file="test_save_thor_file_returns_empty_dataset",
    )

    empty_frame = pd.DataFrame(columns=["int", "__fileposition__"])
    self.assertEqual(empty_frame.to_csv(), actual)
def test_save_thor_file_uses_single_dtype(self):
    """Check saving a ``STRING`` column with a single ``dtype=int``.

    The file holds the strings ``'1'`` and ``'2'``; the expected CSV is
    that of an integer frame with ``int`` values 1 and 2 at file
    positions 0 and 5.
    """
    file_name = "test_save_thor_file_uses_single_dtype"
    ecl = (
        "a := DATASET([{{'1'}}, {{'2'}}], {{STRING int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    actual = save_thor_file(self.conn, file_name, dtype=int)

    expected_frame = pd.DataFrame(
        {"int": [1, 2], "__fileposition__": [0, 5]},
        dtype=np.int32,
    )
    self.assertEqual(expected_frame.to_csv(), actual)
def test_save_thor_file_uses_dict_of_dtypes_with_extra_cols_raises(self):
    """Check that a dtype dict naming an unknown column raises ``KeyError``.

    The file has the columns ``str``, ``bool`` and ``int``; the dtype
    mapping also names ``made_up``, which is not in the file.
    """
    file_name = "test_save_thor_file_uses_dict_of_dtypes_with_extra_cols"
    self.conn.run_ecl_string(
        "a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
        "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
        "OUTPUT(a,,'~{}');".format(file_name),
        True, True, None
    )
    # The call is expected to raise, so its return value is not bound.
    with self.assertRaises(KeyError):
        save_thor_file(self.conn, file_name,
                       dtype={"bool": bool, "int": str, "made_up": str})
def test_save_thor_file_works_when_num_rows_less_than_chunksize(self):
    """Check saving a one-row file with ``chunk_size=2``.

    The expected value is the CSV of a one-row frame with ``int`` 1 at
    file position 0.
    """
    file_name = "test_save_thor_file_works_when_num_rows_less_than_chunksize"
    ecl = (
        "a := DATASET([{{1}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    actual = save_thor_file(
        connection=self.conn,
        thor_file=file_name,
        chunk_size=2,
    )

    expected_frame = pd.DataFrame(
        {"int": [1], "__fileposition__": [0]},
        dtype=np.int32,
    )
    self.assertEqual(expected_frame.to_csv(), actual)
def test_save_thor_file_works_when_num_rows_greater_than_chunksize(self):
    """Check that a two-row file saved with ``chunk_size=1`` has both rows.

    The CSV text returned by ``save_thor_file`` is parsed back into a
    frame. Both the parsed and the expected frame are sorted by ``int``
    and re-indexed, so the comparison does not depend on the row order
    of the returned CSV. Dtypes are not compared.
    """
    file_name = ("test_save_thor_file_works_when_num_rows_greater_than_"
                 "chunksize")
    self.conn.run_ecl_string(
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');".format(file_name),
        True, True, None
    )
    res = save_thor_file(connection=self.conn, thor_file=file_name,
                         chunk_size=1, index=False)
    res = pd.read_csv(StringIO(res)).sort_values('int').reset_index(drop=True)
    expected = pd.DataFrame({"int": [2, 1], "__fileposition__": [8, 0]},
                            dtype=np.int32)
    # A single sort suffices; sorting twice by the same key was redundant.
    expected = expected.sort_values('int').reset_index(drop=True)
    pd.testing.assert_frame_equal(expected, res, check_dtype=False)
def test_save_thor_file_returns_100_row_dataset(self):
    """Check the result for a file of 100 identical ``INTEGER`` rows.

    The expected CSV has ``int`` equal to 1 in every row and file
    positions stepping by 8 from 0.
    """
    row_count = 100
    rows_literal = "[" + ",".join(["{1}"] * row_count) + "]"
    ecl = (
        "a := DATASET({}, {{INTEGER int;}}); "
        "OUTPUT(a,,'~test_save_thor_file_returns_100_row_dataset');"
    ).format(rows_literal)
    self.conn.run_ecl_string(ecl, True, True, None)

    actual = save_thor_file(
        connection=self.conn,
        thor_file="test_save_thor_file_returns_100_row_dataset",
    )

    expected_frame = pd.DataFrame(
        {
            "int": [1] * row_count,
            "__fileposition__": list(range(0, 8 * row_count, 8)),
        },
        dtype=np.int64,
    )
    self.assertEqual(expected_frame.to_csv(), actual)
def test_save_thor_file_uses_dict_of_dtypes_with_missing_cols(self):
    """Check a dtype dict that covers only some of the file's columns.

    The file has the columns ``str``, ``bool`` and ``int``; the dtype
    mapping names only ``bool`` and ``int``. The expected CSV has
    ``int`` as the strings ``"1"`` and ``"2"``.
    """
    file_name = "test_save_thor_file_uses_dict_of_dtypes_with_missing__cols"
    ecl = (
        "a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
        "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(ecl, True, True, None)

    partial_dtypes = {"bool": bool, "int": str}
    actual = save_thor_file(self.conn, file_name, dtype=partial_dtypes)

    expected_frame = pd.DataFrame({
        "str": ["1", "2"],
        "bool": [True, False],
        "int": ["1", "2"],
        "__fileposition__": [0, 14],
    })
    self.assertEqual(expected_frame.to_csv(), actual)
def _get_a_save(connection, thor_file, path_or_buf=None, max_workers=15,
                chunk_size=10000, max_attempts=3, max_sleep=10, dtype=None,
                **kwargs):
    """Call ``save_thor_file`` using this helper's own default settings.

    ``connection``, ``thor_file`` and ``path_or_buf`` are forwarded
    positionally; the tuning options and any extra ``kwargs`` are
    forwarded as keyword arguments. Returns whatever ``save_thor_file``
    returns.
    """
    options = dict(
        max_workers=max_workers,
        chunk_size=chunk_size,
        max_attempts=max_attempts,
        max_sleep=max_sleep,
        dtype=dtype,
    )
    # Extra keywords can never collide with the named options above,
    # because Python binds those names to the explicit parameters first.
    options.update(kwargs)
    return save_thor_file(connection, thor_file, path_or_buf, **options)