def prompt_dup_key(n_dup_key):
    """Warn the user that key duplicates were found and ask how to proceed.

    Depending on the answer, the duplicates list may be saved to
    gl.OUT_DUP_KEY_FILE and/or the program may exit. In test mode
    (gl.TEST_PROMPT_DK) the answer 'c' is forced.
    """
    u.log_print('|')
    bn = u.big_number(n_dup_key)
    u.log(f"Warning: {bn} different lines with the same research key were identified")
    u.log_example(gl.dup_key_list)
    menu = ("\nFile comparison may not work correctly. Here are your options:"
            "\na -> save duplicates list and quit"
            "\nb -> quit without saving duplicates list"
            "\nc -> save duplicates list and continue"
            "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        # Non-interactive test path: behave as if the user typed 'c'
        u.log_print(menu)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(menu)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}")
    if command in ('a', 'b'):
        sys.exit()
def left_join_files(lpath='', rpath='', out='', debug=False):
    """Joints two files (lpath and rpath) on the first column of each file"""
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    start_time = time()
    if debug:
        gl.DEBUG_JOIN = True
    if lpath or rpath:
        # Explicit file paths given: (re)initialise globals and load both sides
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        ar_right = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        # No paths: the left array is already in gl.ar_in, the right side
        # comes from the SQL output file
        u.log("Loading right arrays...")
        ar_right = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")
    left_join_arrays(gl.ar_in, ar_right)
    out = out or gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    u.log(f"Output file saved in {out}")
    dstr = u.get_duration_string(start_time)
    u.log(f"[rl] left_join_files: end ({dstr})")
    u.log_print('|')
def left_join_arrays(ar_left_in, ar_right_in):
    """Left-join the two input arrays on their key column.

    Both arrays are run through prepare_array, dumped to temporary files
    (gl.OUT_LEFT / gl.OUT_RIGHT), then merged by a cursor-driven join loop
    whose state lives in the module-level `gl` object. The joined rows are
    accumulated in gl.out_array.
    """
    check_void_right_array(ar_right_in)
    u.log("Preparing left array...")
    (ar_left, first_line_l) = prepare_array(ar_left_in)
    u.save_csv(ar_left, gl.OUT_LEFT)
    log_prepare(gl.OUT_LEFT, u.big_number(len(ar_left)))
    u.log("Preparing right array...")
    (ar_right, first_line_r) = prepare_array(ar_right_in)
    u.save_csv(ar_right, gl.OUT_RIGHT)
    log_prepare(gl.OUT_RIGHT, u.big_number(len(ar_right)))
    u.log("Joining both arrays...")
    init_while_join(first_line_l, first_line_r)
    # Join loop: compare_inf / compare_sup / compare_equal advance the
    # left/right cursors and emit output rows as keys align, until both
    # sides are exhausted (gl.END_LEFT / gl.END_RIGHT). This looks like a
    # sorted merge-join — assumes prepare_array sorts by key; TODO confirm.
    while gl.END_LEFT is False or gl.END_RIGHT is False:
        (key_l, key_r) = update_key(ar_left, ar_right)
        key_l = compare_inf(key_l, key_r, ar_left)
        (key_l, key_r) = compare_sup(key_l, key_r, ar_left, ar_right)
        key_r = compare_equal(key_l, key_r, ar_left, ar_right)
        if incr_c_l(ar_left):
            # Left cursor reached the end: stop the join
            break
    bn = u.big_number(len(gl.out_array))
    s = f"Output array generated. It has {bn} lines (including header)."
    u.log(s)
def finish(out_path, prompt, nb, start_time):
    """Log the sort_file outcome and persist duplicate lists.

    Pure duplicates go to a numbered OUT_DUP_FILE; key duplicates either
    trigger an interactive prompt (prompt=True) or are saved silently.
    """
    n_dup_key = len(gl.dup_key_list)
    n_dup = len(gl.dup_list)
    bn1 = u.big_number(gl.c_tot_out)
    bn2 = u.big_number(n_dup)
    u.log(
        f"Output file {out_path} successfully generated"
        f" ({bn1} lines written, {bn2} pure duplicates removed)."
    )
    if n_dup > 0:
        # Suffix the duplicates file with the batch number when relevant
        suffix = str(nb) if nb != 0 else ''
        out_dup = gl.OUT_DUP_FILE + suffix + gl.FILE_TYPE
        u.save_csv(gl.dup_list, out_dup)
        u.log(f"Pure duplicates list written in {out_dup}")
        u.log_example(gl.dup_list, "pure duplicates")
    if n_dup_key > 0:
        if prompt:
            prompt_dup_key(n_dup_key)
        else:
            u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
            u.log(f"{n_dup_key} key duplicates found. List written in {gl.OUT_DUP_KEY_FILE}")
    dstr = u.get_duration_string(start_time)
    u.log(f"[dq] sort_file: end ({dstr})")
def group_by():
    """Aggregate the output file on its non-volumetry columns.

    If the output header contains a volumetry field, flag the run as a
    count query (gl.COUNT) and, when the merge succeeded on a range query,
    group the rows on every other column and sum the volumetry column.
    With no group-by columns at all, the counts are summed directly
    without pandas. Rewrites gl.OUT_PATH in place.
    """
    out_path = gl.OUT_PATH
    header = u.get_header(out_path, True)
    vol_fields = [field for field in header if is_vol_field(field)]
    # Idiomatic truthiness test instead of len(...) == 0, and no else
    # needed after the early return
    if not vol_fields:
        return
    # Side effect must happen even if we return below: marks the run as a
    # count query for the caller
    gl.COUNT = True
    vol_field = vol_fields[0]
    if not gl.MERGE_OK or not gl.range_query:
        return
    u.log('Group by on output file...')
    array_in = u.load_csv(out_path)
    gb_fields = [field for field in header if not is_vol_field(field)]
    if gb_fields:
        import pandas as pd
        df = pd.DataFrame(data=array_in[1:], columns=header)
        df[vol_field] = df[vol_field].astype(int)
        df = df.groupby(by=gb_fields).sum()
        df = df.sort_values(by=vol_field, ascending=False)
        df.to_csv(path_or_buf=gl.OUT_PATH, sep=';', encoding='UTF-8')
    else:
        # if this is a simple count result without group by statement
        # results of different files are directly summed (pandas not needed)
        cur_list = [int(elt[0]) for elt in array_in[1:]]
        out = [array_in[0], [str(sum(cur_list))]]
        u.save_csv(out, gl.OUT_PATH)
    u.log('Group by over')
def finish_find_dup(dup_list, out_path, open_out):
    """Report and persist the duplicates found by find_dup.

    Logs the count and an example, saves the list to out_path, and
    optionally opens the saved file (open_out=True). Does nothing beyond
    logging when the list is empty.
    """
    n = len(dup_list)
    if n == 0:
        u.log("No duplicates found")
        return
    # Reuse n instead of recomputing len(dup_list) (original recomputed it)
    bn = u.big_number(n)
    u.log(f"{bn} duplicates found")
    u.log_example(dup_list)
    u.save_csv(dup_list, out_path)
    u.log(f"List of duplicates saved in {out_path}")
    if open_out:
        u.startfile(out_path)
def test_rl():
    """Integration test for the rl (request list) module.

    Exercises the file join, error paths (no SQL output, missing variable,
    missing header), the standard request-list flow, and the
    interruption/recovery flow against the test database. Skips entirely
    if no test DB is defined.
    """
    u.init_log('test_rl', True)
    if not ts.is_test_db_defined():
        return
    # Fresh temporary and output directories for a clean run
    u.mkdirs(gl.TMP_DIR, True)
    u.mkdirs(ts.gl.TMP_DIR, True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()
    u.log_print('Test join', dashes=100)
    tr.left_join_files(gl.LEFT_1, gl.RIGHT_1, gl.OUT_JOIN_REF_1)
    tr.left_join_files(gl.LEFT_2, gl.RIGHT_2, gl.OUT_JOIN_REF_2)
    tr.left_join_files(gl.LEFT_3, gl.RIGHT_3, gl.OUT_JOIN_REF_3)
    u.log_print('Preparing DB', dashes=100)
    ts.upload(ts.gl.IN)
    # Keep only the first column of the test file as the rl input
    arr = u.load_csv(ts.gl.IN)
    arr = [elt[0] for elt in arr]
    u.save_csv(arr, gl.IN_1)
    # Error paths: each ttry call expects the given error constant
    u.log_print('Test rl - no sql output', dashes=100)
    t.ttry(tr.reqlist, u.g.E_VA, gl.IN_1, gl.OUT_1, gl.QUERY_NO)
    u.log_print('Test rl - no var in query', dashes=100)
    t.ttry(tr.reqlist, u.g.E_MV, gl.IN_1, gl.OUT_1, gl.QUERY_MV)
    u.log_print('Test rl - missing header', dashes=100)
    u.save_csv(arr[1:], gl.IN_MH)
    t.ttry(tr.reqlist, u.g.E_MH, gl.IN_MH, gl.OUT_1, gl.QUERY_1)
    u.log_print('Test rl - standard', dashes=100)
    tr.reqlist(gl.IN_1, gl.OUT_1, gl.QUERY_1, cnx=1)
    tr.reqlist(gl.OUT_1, gl.OUT_2, gl.QUERY_2)
    dq.file_match(ts.gl.IN, gl.OUT_2, del_dup=True)
    dq.file_match(t.gl.OUT_DUP_TMP, gl.OUT_DUP_REF)
    u.log_print('Test rl - interuption and recovery', dashes=100)
    u.mkdirs(gl.TMP_DIR, True)
    u.log_print()
    # Interrupt a run, then recover it and check the output matches
    args = [gl.OUT_1, gl.OUT_3, gl.QUERY_2]
    tr.reqlist_interrupted(*args, cnx=6)
    tr.reqlist(gl.OUT_1, gl.OUT_3, gl.QUERY_2, True, cnx=6)
    dq.file_match(gl.OUT_2, gl.OUT_3)
    ts.clean_db([ts.gl.T_TEST])
    u.check_log(tr.CL)
def test_tools():
    """Integration test for the tools modules.

    Covers XML parsing, file splitting, the duplicate-finding/removal
    helpers (including a shuffle round-trip), filtering, and the big-file
    read/search/sort helpers, comparing each output against its reference
    file.
    """
    u.init_log('test_tools', True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()
    u.log_print("Test tools.xml", dashes=100)
    tt.parse_xml()
    dq.file_match(gl.XML_OUT, gl.XML_OUT_REF)
    u.log_print("Test toolSplit", dashes=100)
    tt.split()
    u.log_print("Test toolDup - to.find_dup simple", dashes=100)
    to.find_dup(gl.DUP_IN, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)
    u.log_print("Test toolDup - to.find_dup col", dashes=100)
    to.find_dup(gl.DUP_COL_IN, col=1)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)
    u.log_print("Test toolDup - to.del_dup + shuffle", dashes=100)
    # Shuffle first so del_dup is exercised on unsorted input
    to.shuffle_file(gl.DUP_IN, gl.SHUF_OUT)
    u.log_print()
    to.del_dup(gl.SHUF_OUT, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DEL_DUP_OUT_REF)
    u.log_print("Test toolDup - to.find_dup_list", dashes=100)
    list_in = u.load_csv(gl.DUP_IN)
    dup_list = to.find_dup_list(list_in)
    u.save_csv(dup_list, gl.DUP_OUT)
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)
    u.log_print("Test toolFilter", dashes=100)
    tt.flt()
    u.log_print("Test BF", dashes=100)
    tt.read_big_file()
    tt.search_big_file()
    bf.sort_big_file(ts.gl.IN, gl.SORT_BF_OUT)
    dq.file_match(ts.gl.IN, gl.SORT_BF_OUT, del_dup=True)
    u.check_log(tt.CL)
def finish(out_path, start_time):
    """Log the filter statistics, save the output list and optionally open it."""
    u.log("Filtering over")
    lines_read = u.big_number(gl.n_r)
    lines_out = u.big_number(gl.n_o)
    u.log(
        f"{lines_read} lines read in the input file and"
        f" {lines_out} lines to be written in the output file"
    )
    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    u.log(f"Output file saved in {out_path}")
    dstr = u.get_duration_string(start_time)
    u.log(f"[toolFilter] filter: end ({dstr})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def insert(script):
    """Insert the buffered rows (gl.data) for the current chunk.

    Chunks whose index is below gl.ref_chunk are skipped — presumably they
    were already committed before an interruption and this supports the
    recovery flow; TODO confirm against the caller's recovery logic.
    Otherwise the rows are inserted with executemany, committed, and the
    chunk counter is checkpointed to gl.tmp_file_chunk. The data buffer is
    cleared in both cases.
    """
    if gl.c_chunk >= gl.ref_chunk:
        # executemany expects sequences of parameter tuples
        gl.data = [tuple(line) for line in gl.data]
        gl.c.executemany(script, gl.data)
        gl.c_chunk += 1
        snc = str(gl.c_chunk)
        # Write a "commit running" marker before committing and the bare
        # counter after, so an interruption mid-commit is detectable from
        # the checkpoint file on restart
        u.save_csv([f"{snc}_COMMIT_RUNNING"], gl.tmp_file_chunk)
        gl.cnx.commit()
        u.save_csv([snc], gl.tmp_file_chunk)
        sn = u.big_number(gl.c_main)
        u.log(f"{sn} lines inserted in total")
        # Recycle the cursor between chunks
        gl.c.close()
        gl.c = gl.cnx.cursor()
    else:
        gl.c_chunk += 1
    gl.data = []
def iutd_db(d_now, cnx):
    """Check the database date against d_now and record the result.

    Saves the DB date to the check file, compares it with d_now, and marks
    the up-to-date flag (gls.iutd) on success.
    """
    db_date = get_bdd_date(cnx)
    u.save_csv([db_date], gl.iutd_path)
    u.log(f"Check file saved in {gl.iutd_path}")
    compare_dates(db_date, d_now)
    gls.iutd = True
from partools.rl import reqlist
from partools.quickstart import files_dir

# Quickstart example: run a variabilized request list (reqlist) against an
# Oracle XE database, feeding it an input CSV built from the sample file.
init_log('rl')
db = 'XE'
# NOTE(review): placeholder credentials — replace before running
cnx_str = 'USERNAME/PWD@localhost:1521/XE'
date = datetime.now().strftime("%Y%m%d")
in_file = f"{g.dirs['IN']}rl_in.csv"
out_file = f"{g.dirs['OUT']}export_RL_{db}_{date}.csv"

# Creates input file from test file (first two columns only)
arr = u.load_csv(f'{files_dir}in.csv')
arr = [elt[0:2] for elt in arr]
u.save_csv(arr, in_file)

# The input query has to be variabilized ie. contain @@IN@@:
query_in = """
SELECT AFFAIRE, PRM
FROM TEST
WHERE 1=1
AND AFFAIRE IN @@IN@@
"""

reqlist(
    CNX_INFO=cnx_str,
    QUERY_IN=query_in,
    IN_PATH=in_file,
    OUT_PATH=out_file,
)
def gen_temp_file():
    """Write the currently buffered rows (gl.cur_list) to one numbered temporary file."""
    tmp_path = f"{gl.TMP_DIR}tmp_{gl.c_file}{gl.FILE_TYPE}"
    u.save_csv(gl.cur_list, tmp_path)