def left_join_files(lpath='', rpath='', out='', debug=False):
    """Left-join two CSV files on the first column of each.

    When lpath/rpath are given, both sides are loaded from those files;
    otherwise the left side is taken from gl.ar_in and the right side
    from gl.OUT_SQL. The joined array is saved to `out`
    (defaults to gl.OUT_PATH).
    """
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    start_time = time()
    if debug:
        gl.DEBUG_JOIN = True

    if lpath or rpath:
        # Explicit input paths: (re)initialise globals and load both sides.
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        right_array = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        # Left side already sits in gl.ar_in; right side comes from the
        # SQL output file.
        u.log("Loading right arrays...")
        right_array = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")

    left_join_arrays(gl.ar_in, right_array)

    out = out or gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    u.log(f"Output file saved in {out}")

    dstr = u.get_duration_string(start_time)
    u.log(f"[rl] left_join_files: end ({dstr})")
    u.log_print('|')
def group_by():
    """Aggregate the volume column of the output file.

    If the output header contains a volume field, rows are grouped on
    every non-volume column and the volume values are summed (descending
    order). Without any group-by column, the per-file counts are summed
    directly. No-op when no volume field exists or when the merge/range
    preconditions are not met.
    """
    out_path = gl.OUT_PATH
    header = u.get_header(out_path, True)

    volume_cols = [name for name in header if is_vol_field(name)]
    if not volume_cols:
        return
    # A volume field exists: mark the run as a count query.
    gl.COUNT = True
    volume_col = volume_cols[0]

    if not (gl.MERGE_OK and gl.range_query):
        return

    u.log('Group by on output file...')
    rows = u.load_csv(out_path)
    key_cols = [name for name in header if not is_vol_field(name)]
    if not key_cols:
        # Simple count result without a group-by statement: the results
        # of the different files are summed directly (pandas not needed).
        counts = [int(row[0]) for row in rows[1:]]
        u.save_csv([rows[0], [str(sum(counts))]], gl.OUT_PATH)
    else:
        import pandas as pd
        df = pd.DataFrame(data=rows[1:], columns=header)
        df[volume_col] = df[volume_col].astype(int)
        grouped = df.groupby(by=key_cols).sum()
        grouped = grouped.sort_values(by=volume_col, ascending=False)
        grouped.to_csv(path_or_buf=gl.OUT_PATH, sep=';', encoding='UTF-8')
    u.log('Group by over')
def init(kwargs):
    """Initialise the module: apply keyword overrides, then load the input CSV.

    kwargs -- dict of overrides applied onto the module globals `gl`
              via u.init_kwargs before anything else runs.

    Side effects: refreshes globals (init_globals), validates the input
    file's header, and stores the loaded array in gl.ar_in.
    """
    u.init_kwargs(gl, kwargs)
    init_globals()
    # Fail fast on a malformed input file before loading it fully.
    u.check_header(gl.IN_PATH)
    u.log(f"Loading input array from '{gl.IN_PATH}'...")
    gl.ar_in = u.load_csv(gl.IN_PATH)
    u.log("Input array loaded")
    u.log_print('|')
def test_rl():
    """End-to-end test of the rl (request-list) pipeline.

    Covers: file joins against references, DB preparation, the expected
    error paths (no sql output / missing variable / missing header), a
    standard run with output comparison, and interruption + recovery.
    Skipped entirely when no test DB is configured.
    """
    u.init_log('test_rl', True)
    if not ts.is_test_db_defined():
        return

    # Fresh working directories for this run.
    u.mkdirs(gl.TMP_DIR, True)
    u.mkdirs(ts.gl.TMP_DIR, True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print('Test join', dashes=100)
    tr.left_join_files(gl.LEFT_1, gl.RIGHT_1, gl.OUT_JOIN_REF_1)
    tr.left_join_files(gl.LEFT_2, gl.RIGHT_2, gl.OUT_JOIN_REF_2)
    tr.left_join_files(gl.LEFT_3, gl.RIGHT_3, gl.OUT_JOIN_REF_3)

    u.log_print('Preparing DB', dashes=100)
    ts.upload(ts.gl.IN)
    # Keep only the first column of the uploaded file as rl input.
    first_col = u.load_csv(ts.gl.IN)
    first_col = [row[0] for row in first_col]
    u.save_csv(first_col, gl.IN_1)

    # Expected-failure scenarios: each ttry asserts the matching error.
    u.log_print('Test rl - no sql output', dashes=100)
    t.ttry(tr.reqlist, u.g.E_VA, gl.IN_1, gl.OUT_1, gl.QUERY_NO)
    u.log_print('Test rl - no var in query', dashes=100)
    t.ttry(tr.reqlist, u.g.E_MV, gl.IN_1, gl.OUT_1, gl.QUERY_MV)
    u.log_print('Test rl - missing header', dashes=100)
    u.save_csv(first_col[1:], gl.IN_MH)
    t.ttry(tr.reqlist, u.g.E_MH, gl.IN_MH, gl.OUT_1, gl.QUERY_1)

    u.log_print('Test rl - standard', dashes=100)
    tr.reqlist(gl.IN_1, gl.OUT_1, gl.QUERY_1, cnx=1)
    tr.reqlist(gl.OUT_1, gl.OUT_2, gl.QUERY_2)
    dq.file_match(ts.gl.IN, gl.OUT_2, del_dup=True)
    dq.file_match(t.gl.OUT_DUP_TMP, gl.OUT_DUP_REF)

    u.log_print('Test rl - interuption and recovery', dashes=100)
    u.mkdirs(gl.TMP_DIR, True)
    u.log_print()
    tr.reqlist_interrupted(gl.OUT_1, gl.OUT_3, gl.QUERY_2, cnx=6)
    # Resume the interrupted run and check it converges to the same output.
    tr.reqlist(gl.OUT_1, gl.OUT_3, gl.QUERY_2, True, cnx=6)
    dq.file_match(gl.OUT_2, gl.OUT_3)

    ts.clean_db([ts.gl.T_TEST])
    u.check_log(tr.CL)
def test_tools():
    """Integration test for the tool helpers.

    Exercises XML parsing, file splitting, the duplicate-finding variants
    (simple, by column, list-based, and delete-after-shuffle), filtering,
    and the big-file read/search/sort utilities; each result is compared
    against its reference file.
    """
    u.init_log('test_tools', True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print("Test tools.xml", dashes=100)
    tt.parse_xml()
    dq.file_match(gl.XML_OUT, gl.XML_OUT_REF)

    u.log_print("Test toolSplit", dashes=100)
    tt.split()

    u.log_print("Test toolDup - to.find_dup simple", dashes=100)
    to.find_dup(gl.DUP_IN, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup col", dashes=100)
    to.find_dup(gl.DUP_COL_IN, col=1)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.del_dup + shuffle", dashes=100)
    # Shuffle first so that del_dup is exercised on unsorted input.
    to.shuffle_file(gl.DUP_IN, gl.SHUF_OUT)
    u.log_print()
    to.del_dup(gl.SHUF_OUT, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DEL_DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup_list", dashes=100)
    rows = u.load_csv(gl.DUP_IN)
    duplicates = to.find_dup_list(rows)
    u.save_csv(duplicates, gl.DUP_OUT)
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolFilter", dashes=100)
    tt.flt()

    u.log_print("Test BF", dashes=100)
    tt.read_big_file()
    tt.search_big_file()
    bf.sort_big_file(ts.gl.IN, gl.SORT_BF_OUT)
    dq.file_match(ts.gl.IN, gl.SORT_BF_OUT, del_dup=True)

    u.check_log(tt.CL)
def init_find_dup(in_path, out_path, col):
    """Prepare the duplicate search: load the values and resolve the output path.

    in_path  -- file to inspect.
    out_path -- where the duplicates will be written; when empty, a path
                inside the tool's temporary folder is created and used.
    col      -- 0 to search the whole line of a text file, otherwise the
                1-based CSV column to extract and search.

    Returns (cur_list, out_path): the values to scan (header row stripped
    when one is detected) and the resolved output path.
    """
    if not out_path:
        # No explicit destination: fall back to the tool's tmp folder.
        tmp_dir = u.g.dirs['TMP'] + gl.TMP_FOLDER
        u.mkdirs(tmp_dir)
        out_path = tmp_dir + gl.TMP_OUT
    s = "Searching duplicates in "
    if col == 0:
        # Fix: `s` already ends with a space — the old f-string added a
        # second one ("in  file"), inconsistent with the column branch.
        u.log(f"{s}file {in_path}")
        cur_list = u.load_txt(in_path)
    else:
        u.log(f"{s}column no. {col} of file {in_path}")
        cur_list = u.load_csv(in_path)
        # col is 1-based for the caller.
        cur_list = [x[col - 1] for x in cur_list]
    if u.has_header(cur_list):
        cur_list = cur_list[1:]
    return (cur_list, out_path)
from partools.utils import g from partools.utils import init_log from partools.rl import reqlist from partools.quickstart import files_dir init_log('rl') db = 'XE' cnx_str = 'USERNAME/PWD@localhost:1521/XE' date = datetime.now().strftime("%Y%m%d") in_file = f"{g.dirs['IN']}rl_in.csv" out_file = f"{g.dirs['OUT']}export_RL_{db}_{date}.csv" # Creates input file from test file arr = u.load_csv(f'{files_dir}in.csv') arr = [elt[0:2] for elt in arr] u.save_csv(arr, in_file) # The input query has to be variabilized ie. contain @@IN@@: query_in = """ SELECT AFFAIRE, PRM FROM TEST WHERE 1=1 AND AFFAIRE IN @@IN@@ """ reqlist( CNX_INFO=cnx_str, QUERY_IN=query_in, IN_PATH=in_file,