def prompt_dup_key(n_dup_key):
    """Warn about duplicate research keys and ask the user how to proceed.

    Depending on the chosen option, the duplicates list is saved to
    gl.OUT_DUP_KEY_FILE and/or the program exits.
    """
    u.log_print('|')
    bn = u.big_number(n_dup_key)
    u.log(f"Warning: {bn} different lines with the same research key were identified")
    u.log_example(gl.dup_key_list)
    options = ("\nFile comparison may not work correctly. Here are your options:"
               "\na -> save duplicates list and quit"
               "\nb -> quit without saving duplicates list"
               "\nc -> save duplicates list and continue"
               "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        # Test mode: no interactive prompt, option 'c' is forced
        u.log_print(options)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(options)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}")
    if command in ('a', 'b'):
        sys.exit()
def log_prepare(ar, bn_ar):
    """Log a summary of the prepared array and show dismissed duplicates."""
    bn_dup = u.big_number(len(gl.dup_list))
    u.log(f"Array prepared and saved in {ar} ({bn_ar} lines, {bn_dup} duplicates dismissed)")
    u.log_example(gl.dup_list)
def prepare_bdd():
    """Run the configured preparation statements before data injection."""
    from .execute import execute

    if not gl.EXECUTE_KWARGS:
        return
    u.log("Preparing DB before data injection...")
    u.log_print("|")
    execute(**gl.EXECUTE_KWARGS)
def group_by():
    """Aggregate the output file on its volume column (group by + sum).

    Does nothing when no volume column exists, when merging failed, or
    when the query is not a range query.
    """
    out_path = gl.OUT_PATH
    header = u.get_header(out_path, True)
    vol_fields = [field for field in header if is_vol_field(field)]
    if not vol_fields:
        return
    # A volume column exists: flag the run as a count query
    gl.COUNT = True
    vol_field = vol_fields[0]
    if not gl.MERGE_OK or not gl.range_query:
        return
    u.log('Group by on output file...')
    array_in = u.load_csv(out_path)
    gb_fields = [field for field in header if not is_vol_field(field)]
    if gb_fields:
        import pandas as pd
        df = pd.DataFrame(data=array_in[1:], columns=header)
        df[vol_field] = df[vol_field].astype(int)
        df = df.groupby(by=gb_fields).sum()
        df = df.sort_values(by=vol_field, ascending=False)
        df.to_csv(path_or_buf=gl.OUT_PATH, sep=';', encoding='UTF-8')
    else:
        # Simple count result without group by statement: results of
        # different files are directly summed (pandas not needed)
        counts = [int(row[0]) for row in array_in[1:]]
        u.save_csv([array_in[0], [str(sum(counts))]], gl.OUT_PATH)
    u.log('Group by over')
def gen_query_list():
    """Build gl.query_list as [group_string, group_number] pairs.

    Input elements are split into groups of at most
    gl.NB_MAX_ELT_IN_STATEMENT elements; each group is rendered with
    gen_group and paired with a zero-padded sequence number.
    """
    # typo fix in log message: was "sql.dowload"
    u.log("Building query list to be input in sql.download...")
    gl.query_var = sql.get_query(gl.QUERY_IN)
    check_var(gl.query_var)
    u.log_print(f"Base query:\n{gl.query_var}\n;")
    elt_list = prepare_elt_list(gl.ar_in)
    n_grp = math.ceil(len(elt_list) / gl.NB_MAX_ELT_IN_STATEMENT)
    # Width of the zero-padded group number (e.g. 3 for 100-999 groups)
    size_elt_list = math.floor(math.log10(n_grp)) + 1
    n = 0
    cur_elt_list, query_list = [], []

    def _flush_group():
        # Close the current group: render it and append [group, number]
        nonlocal n
        n += 1
        n_str = u.extend_str(n, '0', size_elt_list, True)
        query_list.append([gen_group(cur_elt_list), n_str])

    for elt in elt_list:
        cur_elt_list.append(elt)
        if len(cur_elt_list) % gl.NB_MAX_ELT_IN_STATEMENT == 0:
            _flush_group()
            cur_elt_list = []
    # Flush the last, possibly partial, group
    if cur_elt_list:
        _flush_group()
    gl.query_list = query_list
    log_gen_query_list(elt_list, query_list)
def get_cnx_info():
    """Resolve the Oracle connection info and log its source.

    Resolution order: explicit gl.CNX_INFO override, then the
    (gl.DB, gl.ENV) entry of cfg.CONF_ORACLE, then the gl.DB-only entry.

    Raises:
        Exception: when no configuration matches gl.DB / gl.ENV.
    """
    err = False
    cnx_info = None
    if gl.CNX_INFO:
        cnx_info = gl.CNX_INFO
        msg = gl.S_1.format(cnx_info)
    elif (gl.DB, gl.ENV) in cfg.CONF_ORACLE:
        cnx_info = cfg.CONF_ORACLE[(gl.DB, gl.ENV)]
        msg = gl.S_2.format(gl.DB, gl.ENV, cnx_info)
    elif gl.DB in cfg.CONF_ORACLE:
        cnx_info = cfg.CONF_ORACLE[gl.DB]
        msg = gl.S_3.format(gl.DB, cnx_info)
    elif not gl.DB:
        msg = gl.E_1
        err = True
    elif not gl.ENV and gl.DB not in cfg.CONF_ORACLE:
        msg = gl.E_2.format(gl.DB)
        err = True
    else:
        msg = gl.E_3.format(gl.DB, gl.ENV)
        err = True
    if err:
        raise Exception(msg)
    u.log(msg)
    return cnx_info
def read_big_file(in_path, **kwargs):
    """Reads a potentially big file, logging it line by line.

    See in partools/tools/gl for other parameters (kwargs)
    See partools/quickstart/tools_bf.py for examples of use

    - in_path: path of the input file (read with utf-8, errors ignored)
    """
    from .init import init_rbf

    u.log("[toolBF] read_big_file: start")
    init_rbf()
    u.init_kwargs(gl, kwargs)
    with open(in_path, 'r', encoding='utf-8', errors='ignore') as in_file:
        # The first line is read and printed before the loop; note that it
        # is NOT counted in gl.c_read — NOTE(review): confirm this is intended.
        line = f.read_file(in_file)
        u.log_print(line.strip("\n"))
        while line != "":
            line = f.read_file(in_file)
            u.log_print(line.strip("\n"))
            gl.c_read += 1
            # check_counter drives loop termination: a falsy result breaks
            # out early (presumably a configured read limit — TODO confirm)
            if f.check_counter(in_file):
                continue
            else:
                break
    u.log("[toolBF] read_big_file: end\n")
def no_auth(mail_name, subject, var_dict=None, attachments=None,
            HTMLbody='', recipients=None, decrypt_key=''):
    """Sends emails using a no authentication smtp server

    See README.md for guidance
    See partools/quickstart/mail.py for examples of use

    - attachments: list of absolute path for attached files
    - var_dict: dictionary of variables to be replaced in HTMLbody
    - HTMLbody: if not input, mails/mail_name/template.html is taken
    - recipients: if not input, mails/mail_name/recipients.txt is taken
    """
    from partools import cfg

    # Fix: mutable default arguments ([]) are shared across calls in
    # Python; use None sentinels and build fresh lists per call instead.
    var_dict = [] if var_dict is None else var_dict
    attachments = [] if attachments is None else attachments
    recipients = [] if recipients is None else recipients
    f.init(mail_name, recipients, True)
    f.init_cfi(decrypt_key)
    msg = get.msg(subject, HTMLbody, attachments, var_dict)
    u.log(f"Sending mail '{mail_name}' to {gl.recipients}...")
    # Context manager guarantees the SMTP connection is closed
    with smtplib.SMTP(cfg.HOST_NO_AUTH) as server:
        server.sendmail(gl.sender, gl.recipients, msg.as_string())
    u.log('Mail sent')
def is_test_db_defined():
    """Return True when cfg.CONF_ORACLE is set; warn and return False otherwise."""
    if pt.cfg.CONF_ORACLE:
        return True
    s = "cfg.CONF_ORACLE not defined. Test aborted."
    u.log(s)
    warnings.warn(s)
    return False
def finish_this(start_time):
    """Close the DB connection, remove the chunk temp file and log a summary."""
    gl.cnx.close()
    os.remove(gl.tmp_file_chunk)
    exported = u.big_number(gl.c_main)
    duration = u.get_duration_string(start_time)
    u.log(f"{exported} lines exported")
    u.log(f"[sql] upload: end ({duration})")
def init_cfi(decrypt_key=''):
    """Load the confidential info and set the mail sender fields.

    - decrypt_key: key used to decrypt the confidential info file

    Raises:
        Exception: gl.S_MISSING_CFI when no confidential info is available.
    """
    gl.cfi = u.get_confidential(decrypt_key, False)
    if not gl.cfi:
        raise Exception(gl.S_MISSING_CFI)
    # Security fix: never write the decrypted password itself to the log;
    # only confirm that decryption succeeded.
    u.log("Password decrypted")
    gl.sender = gl.cfi['MAIL_FROM']
    gl.From = gl.cfi['MAIL_FROM']
def check_var(query):
    """Raise u.g.E_MV when the mandatory variable marker is absent from query."""
    var = u.g.VAR_DEL + gl.VAR_IN + u.g.VAR_DEL
    if var in query:
        return
    u.log(f"Error: query must contain {var}")
    u.log_print("Query:")
    u.log_print(query)
    raise Exception(u.g.E_MV)
def finish_del_dup(out_list, out_path, open_out):
    """Save the deduplicated list, log its size and optionally open the file."""
    u.log(f"Saving list without duplicates in '{out_path}'...")
    u.save_list(out_list, out_path)
    line_count = u.big_number(len(out_list))
    u.log(f"List saved, it has {line_count} lines")
    if open_out:
        u.startfile(out_path)
def check_ec(file_list):
    """Abort temporary-file merging when an EC file is present.

    Returns True (and clears gl.MERGE_OK) as soon as one element of
    file_list contains the gl.EC marker, False otherwise.
    """
    for elt in file_list:
        if gl.EC in elt:
            # typo fix in log message: was "Meging"
            s = (f"EC file found ({elt})."
                 " Merging of temporary files aborted.")
            u.log(s)
            gl.MERGE_OK = False
            return True
    return False
def init(kwargs):
    """Apply kwargs onto gl, initialise globals and load the input CSV array."""
    u.init_kwargs(gl, kwargs)
    init_globals()
    u.check_header(gl.IN_PATH)
    u.log(f"Loading input array from '{gl.IN_PATH}'...")
    gl.ar_in = u.load_csv(gl.IN_PATH)
    u.log("Input array loaded")
    u.log_print('|')
def connect():
    """Open and return an Oracle connection, then run the up-to-date check."""
    init_instant_client()
    cnx = connect_with(get_cnx_info())
    u.log("Connected")
    is_up_to_date(cnx)
    return cnx
def log_gen_query_list(elt_list, group_list):
    """Log how many elements were distributed into how many query groups."""
    bn_elt = u.big_number(len(elt_list))
    bn_grp = u.big_number(len(group_list))
    u.log(
        f"Query list built: {bn_elt} elements to be processed distributed"
        f" in {bn_grp} groups ({gl.NB_MAX_ELT_IN_STATEMENT} max per group)."
        f" They will be processed in parallel by {gl.MAX_DB_CNX} connection pools."
    )
def finish_xml(out_path, start_time):
    """Log the parse_xml summary and optionally open the output file."""
    duration = u.get_duration_string(start_time)
    written = u.big_number(gl.N_WRITE)
    u.log(f"[toolParseXML] parse_xml: end ({written} lines written in {duration})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def inject():
    """Log the start of the DB injection, mentioning any recovery point."""
    msg = "Injecting data in DB"
    if gl.ref_chunk != 0:
        # Recovery mode: resume from the first line of the pending chunk
        start_line = u.big_number(gl.ref_chunk * gl.NB_MAX_ELT_INSERT)
        msg += f" (recovering from line {start_line})"
    u.log(msg + "...")
def init_dq(kwargs):
    """Initialise the run_dq job: kwargs, temporary dir and file paths."""
    u.log("[dq] run_dq: start")
    u.init_kwargs(gl, kwargs)
    init_tmp_dir()
    set_paths()
    u.log(
        f"run_dq job initialised. Input files {gl.paths['in1']} and {gl.paths['in2']}"
        " are going to be sorted and compared.")
    u.log_print('|')
def finish_dq(start_time):
    """Log the run_dq end, optionally show a message box and open the output."""
    dms, dstr = u.get_duration_string(start_time, True)
    end_msg = f"[dq] run_dq: end ({dstr})"
    u.log(end_msg)
    if gl.MSG_BOX_END:
        st.msg_box(end_msg, "dq", dms, gl.MIN_DUR_TRIGGER)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(gl.paths["out"])
def diff_list(list1, list2, out_path):
    """Save the symmetric difference of two lists, preserving input order.

    - out_path: output file; defaults to <OUT dir>/file_match_out.csv
    """
    if not out_path:
        out_path = u.g.dirs['OUT'] + 'file_match_out.csv'
    # Perf fix: use sets for O(1) membership tests instead of O(n) list
    # scans (the original was O(n*m)); fall back to the lists themselves
    # when the elements are not hashable.
    try:
        in1, in2 = set(list1), set(list2)
    except TypeError:
        in1, in2 = list1, list2
    out1 = [e for e in list1 if e not in in2]
    out2 = [e for e in list2 if e not in in1]
    out = to.del_dup_list(out1 + out2)
    u.save_list(out, out_path)
    u.log(f"Comparison result available here: {out_path}")
def finish(out_path):
    """Log the final read/write counters for the comparison run."""
    nb_out = u.big_number(gl.c_out)
    nb_1 = u.big_number(gl.c_1)
    nb_2 = u.big_number(gl.c_2)
    u.log(
        f"Output file successfully generated in {out_path}\n"
        f"\t\t{nb_1} lines read in file 1\n"
        f"\t\t{nb_2} lines read in file 2\n"
        f"\t\t{nb_out} lines written in output file"
    )
def init_array_list():
    """Initialise gl.array_list with one independent empty buffer per file.

    Allocates max(gl.c_file, 1) lists — the original counter loop always
    produced at least one. Note: [[]] * n would NOT work here, as it
    aliases the same inner list n times.
    """
    gl.array_list = [[] for _ in range(max(gl.c_file, 1))]
    nb = gl.c_row_max
    u.log(f"Buffer array initialised. It can hold a maximum of {nb} lines.")
def ttry(f, e_ref, *args, **kwargs):
    """Assert that f(*args, **kwargs) raises an exception matching e_ref."""
    caught = False
    try:
        f(*args, **kwargs)
    except Exception as err:
        # The raised message must match the expected pattern
        assert u.like(str(err), e_ref)
        u.log(f"[ttry] Exception caught match expected ('{e_ref}')")
        caught = True
    # Fails when no exception was raised at all
    assert caught
def rewrite_tmp_file(tmp_file_list, tmp_file_path, n_written_rows):
    """Rewrite the tmp file without the lines already written in the buffer."""
    if not tmp_file_list:
        # Nothing left: the tmp file is deleted entirely
        os.remove(tmp_file_path)
        u.log(f"Deleting temporary file no. {gl.c_col}")
        return
    with open(tmp_file_path, 'w', encoding='utf-8') as tmp_file:
        tmp_file.writelines(tmp_file_list[n_written_rows:])
def is_up_to_date(cnx):
    """Run the IUTD (Is Up To Date) check for the current DB when required."""
    # Outside test mode, skip DBs that are not tracked or already checked
    if not gl.TEST_IUTD and (gl.DB not in gl.IUTD_LIST or gls.iutd):
        return
    u.log(f"IUTD (Is Up To Date) check for DB {gl.DB}")
    d_now = datetime.now().strftime("%Y/%m/%d")
    # When the file-based check already confirms today's date, skip the DB check
    if iutd_file(d_now):
        return
    iutd_db(d_now, cnx)
def write_rows_finish(q_name, i, cnx_nb):
    """Log completion of row writing for one query (silent for 'MONO')."""
    bn = u.big_number(i)
    if q_name == 'MONO':
        return
    if gl.MAX_DB_CNX == 1 or cnx_nb == 0:
        u.log(f"All lines written for query '{q_name}' ({bn} lines written)")
    else:
        # Several connections: mention which one finished
        u.log(f"All lines written for query '{q_name}'"
              f" ({bn} lines written, connection no. {cnx_nb})")
def compare_headers(in1, in2):
    """Return True when both files share the same header; raise u.g.E_DH otherwise."""
    header1 = u.get_header(in1)
    header2 = u.get_header(in2)
    if header1 == header2:
        return True
    u.log(f"Error: files {in1} and {in2} don't have the same header."
          " Input files must have the same header.")
    raise Exception(u.g.E_DH)
def gen_cnx_dict(nb):
    """Create nb numbered DB connections and store them in gl.cnx_dict."""
    init_instant_client()
    cnx_info = get_cnx_info()
    gl.cnx_dict = dict()
    for num in range(1, nb + 1):
        u.log(f'Creating connection no. {num}...')
        gl.cnx_dict[num] = connect_with(cnx_info)
        is_up_to_date(gl.cnx_dict[num])
        u.log(f'Connection no. {num} created')