Example #1
def prompt_dup_key(n_dup_key):

    u.log_print('|')
    bn = u.big_number(n_dup_key)
    s = f"Warning: {bn} different lines with the same search key were identified"
    u.log(s)
    u.log_example(gl.dup_key_list)

    s = ("\nFile comparison may not work correctly. Here are your options:"
         "\na -> save duplicates list and quit"
         "\nb -> quit without saving duplicates list"
         "\nc -> save duplicates list and continue"
         "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        u.log_print(s)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(s)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        s = f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}"
        u.log(s)
    if command in ('a', 'b'):
        sys.exit()
Example #2
def left_join_files(lpath='', rpath='', out='', debug=False):
    """Joins two files (lpath and rpath) on the first column of each file"""
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    start_time = time()
    if debug:
        gl.DEBUG_JOIN = True
    if lpath or rpath:
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        ar_right = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        u.log("Loading right arrays...")
        ar_right = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")
    left_join_arrays(gl.ar_in, ar_right)
    if not out:
        out = gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    s = f"Output file saved in {out}"
    u.log(s)
    dstr = u.get_duration_string(start_time)
    u.log(f"[rl] left_join_files: end ({dstr})")
    u.log_print('|')
Example #3
def left_join_arrays(ar_left_in, ar_right_in):

    check_void_right_array(ar_right_in)
    u.log("Preparing left array...")
    (ar_left, first_line_l) = prepare_array(ar_left_in)
    u.save_csv(ar_left, gl.OUT_LEFT)
    log_prepare(gl.OUT_LEFT, u.big_number(len(ar_left)))

    u.log("Preparing right array...")
    (ar_right, first_line_r) = prepare_array(ar_right_in)
    u.save_csv(ar_right, gl.OUT_RIGHT)
    log_prepare(gl.OUT_RIGHT, u.big_number(len(ar_right)))

    u.log("Joining both arrays...")
    init_while_join(first_line_l, first_line_r)
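    # Merge join loop (assumption: prepare_array leaves both arrays sorted on
    # their key column, so compare_inf / compare_sup / compare_equal only ever
    # move the left and right cursors forward until both sides are exhausted).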
    while gl.END_LEFT is False or gl.END_RIGHT is False:
        (key_l, key_r) = update_key(ar_left, ar_right)
        key_l = compare_inf(key_l, key_r, ar_left)
        (key_l, key_r) = compare_sup(key_l, key_r, ar_left, ar_right)
        key_r = compare_equal(key_l, key_r, ar_left, ar_right)
        if incr_c_l(ar_left):
            break
    bn = u.big_number(len(gl.out_array))
    s = f"Output array generated. It has {bn} lines (including header)."
    u.log(s)
Example #4
def finish(out_path, prompt, nb, start_time):

    n_dup_key = len(gl.dup_key_list)
    n_dup = len(gl.dup_list)
    bn1 = u.big_number(gl.c_tot_out)
    bn2 = u.big_number(n_dup)
    s = (f"Output file {out_path} successfully generated"
         f" ({bn1} lines written, {bn2} pure duplicates removed).")
    u.log(s)
    if n_dup > 0:
        if nb != 0:
            out_dup = gl.OUT_DUP_FILE + str(nb) + gl.FILE_TYPE
        else:
            out_dup = gl.OUT_DUP_FILE + gl.FILE_TYPE
        u.save_csv(gl.dup_list, out_dup)
        u.log(f"Pure duplicates list written in {out_dup}")
        u.log_example(gl.dup_list, "pure duplicates")
    if n_dup_key > 0:
        if prompt:
            prompt_dup_key(n_dup_key)
        else:
            u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
            s = f"{n_dup_key} key duplicates found. List written in {gl.OUT_DUP_KEY_FILE}"
            u.log(s)

    dstr = u.get_duration_string(start_time)
    u.log(f"[dq] sort_file: end ({dstr})")
Example #5
def group_by():
    out_path = gl.OUT_PATH
    header = u.get_header(out_path, True)
    vol_fields = [elt for elt in header if is_vol_field(elt)]
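    # vol_fields: columns treated as volume/count columns (whatever is_vol_field
    # matches); only the first one is cast to int and aggregated below.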
    if len(vol_fields) == 0:
        return
    else:
        gl.COUNT = True
        vol_field = vol_fields[0]

    if not gl.MERGE_OK or not gl.range_query:
        return

    u.log('Group by on output file...')

    array_in = u.load_csv(out_path)
    gb_fields = [elt for elt in header if not is_vol_field(elt)]
    if gb_fields:
        import pandas as pd
        df = pd.DataFrame(data=array_in[1:], columns=header)
        df[vol_field] = df[vol_field].astype(int)
        df = df.groupby(by=gb_fields).sum()
        df = df.sort_values(by=vol_field, ascending=False)
        df.to_csv(path_or_buf=gl.OUT_PATH, sep=';', encoding='UTF-8')
    else:
        # Simple count result with no group-by fields: the counts from the
        # different files are summed directly (pandas is not needed)
        cur_list = [int(elt[0]) for elt in array_in[1:]]
        out = [array_in[0], [str(sum(cur_list))]]
        u.save_csv(out, gl.OUT_PATH)
    u.log('Group by over')
Example #6
def finish_find_dup(dup_list, out_path, open_out):

    n = len(dup_list)
    if n == 0:
        u.log("No duplicates found")
        return

    bn = u.big_number(n)
    u.log(f"{bn} duplicates found")
    u.log_example(dup_list)

    u.save_csv(dup_list, out_path)
    u.log(f"List of duplicates saved in {out_path}")
    if open_out:
        u.startfile(out_path)
Example #7
def test_rl():
    u.init_log('test_rl', True)
    if not ts.is_test_db_defined():
        return

    u.mkdirs(gl.TMP_DIR, True)
    u.mkdirs(ts.gl.TMP_DIR, True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print('Test join', dashes=100)
    tr.left_join_files(gl.LEFT_1, gl.RIGHT_1, gl.OUT_JOIN_REF_1)
    tr.left_join_files(gl.LEFT_2, gl.RIGHT_2, gl.OUT_JOIN_REF_2)
    tr.left_join_files(gl.LEFT_3, gl.RIGHT_3, gl.OUT_JOIN_REF_3)

    u.log_print('Preparing DB', dashes=100)
    ts.upload(ts.gl.IN)
    arr = u.load_csv(ts.gl.IN)
    arr = [elt[0] for elt in arr]
    u.save_csv(arr, gl.IN_1)

    u.log_print('Test rl - no sql output', dashes=100)
    t.ttry(tr.reqlist, u.g.E_VA, gl.IN_1, gl.OUT_1, gl.QUERY_NO)

    u.log_print('Test rl - no var in query', dashes=100)
    t.ttry(tr.reqlist, u.g.E_MV, gl.IN_1, gl.OUT_1, gl.QUERY_MV)

    u.log_print('Test rl - missing header', dashes=100)
    u.save_csv(arr[1:], gl.IN_MH)
    t.ttry(tr.reqlist, u.g.E_MH, gl.IN_MH, gl.OUT_1, gl.QUERY_1)

    u.log_print('Test rl - standard', dashes=100)
    tr.reqlist(gl.IN_1, gl.OUT_1, gl.QUERY_1, cnx=1)
    tr.reqlist(gl.OUT_1, gl.OUT_2, gl.QUERY_2)
    dq.file_match(ts.gl.IN, gl.OUT_2, del_dup=True)
    dq.file_match(t.gl.OUT_DUP_TMP, gl.OUT_DUP_REF)

    u.log_print('Test rl - interruption and recovery', dashes=100)
    u.mkdirs(gl.TMP_DIR, True)
    u.log_print()
    args = [gl.OUT_1, gl.OUT_3, gl.QUERY_2]
    tr.reqlist_interrupted(*args, cnx=6)
    tr.reqlist(gl.OUT_1, gl.OUT_3, gl.QUERY_2, True, cnx=6)
    dq.file_match(gl.OUT_2, gl.OUT_3)

    ts.clean_db([ts.gl.T_TEST])

    u.check_log(tr.CL)
Example #8
def test_tools():
    u.init_log('test_tools', True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print("Test tools.xml", dashes=100)
    tt.parse_xml()
    dq.file_match(gl.XML_OUT, gl.XML_OUT_REF)

    u.log_print("Test toolSplit", dashes=100)
    tt.split()

    u.log_print("Test toolDup - to.find_dup simple", dashes=100)
    to.find_dup(gl.DUP_IN, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup col", dashes=100)
    to.find_dup(gl.DUP_COL_IN, col=1)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.del_dup + shuffle", dashes=100)
    to.shuffle_file(gl.DUP_IN, gl.SHUF_OUT)
    u.log_print()
    to.del_dup(gl.SHUF_OUT, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DEL_DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup_list", dashes=100)
    list_in = u.load_csv(gl.DUP_IN)
    dup_list = to.find_dup_list(list_in)
    u.save_csv(dup_list, gl.DUP_OUT)
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolFilter", dashes=100)
    tt.flt()

    u.log_print("Test BF", dashes=100)
    tt.read_big_file()
    tt.search_big_file()
    bf.sort_big_file(ts.gl.IN, gl.SORT_BF_OUT)
    dq.file_match(ts.gl.IN, gl.SORT_BF_OUT, del_dup=True)

    u.check_log(tt.CL)
Example #9
def finish(out_path, start_time):

    u.log("Filtering over")
    bn1 = u.big_number(gl.n_r)
    bn2 = u.big_number(gl.n_o)
    s = (f"{bn1} lines read in the input file and"
         f" {bn2} lines to be written in the output file")
    u.log(s)

    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    s = f"Output file saved in {out_path}"
    u.log(s)
    dstr = u.get_duration_string(start_time)
    u.log(f"[toolFilter] filter: end ({dstr})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
Example #10
def insert(script):

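    # Chunked insert with a simple recovery marker (assumed behaviour): chunks
    # whose index is below gl.ref_chunk were already committed by a previous,
    # interrupted run and are therefore skipped; tmp_file_chunk is written as
    # "<n>_COMMIT_RUNNING" just before the commit and as "<n>" once it succeeds,
    # so a restarted run can tell whether the last chunk reached the database.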
    if gl.c_chunk >= gl.ref_chunk:
        gl.data = [tuple(line) for line in gl.data]
        gl.c.executemany(script, gl.data)
        gl.c_chunk += 1
        snc = str(gl.c_chunk)
        u.save_csv([f"{snc}_COMMIT_RUNNING"], gl.tmp_file_chunk)
        gl.cnx.commit()
        u.save_csv([snc], gl.tmp_file_chunk)
        sn = u.big_number(gl.c_main)
        u.log(f"{sn} lines inserted in total")
        gl.c.close()
        gl.c = gl.cnx.cursor()
    else:
        gl.c_chunk += 1

    gl.data = []
Example #11
def iutd_db(d_now, cnx):
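    # "iutd" presumably stands for "is up to date": the date read from the DB is
    # saved to a check file, then compared with d_now.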
    d_bdd = get_bdd_date(cnx)
    u.save_csv([d_bdd], gl.iutd_path)
    u.log(f"Check file saved in {gl.iutd_path}")
    compare_dates(d_bdd, d_now)
    gls.iutd = True
Example #12
from datetime import datetime

from partools.rl import reqlist
from partools.quickstart import files_dir

# u, g and init_log are assumed to come from partools' utility modules
init_log('rl')

db = 'XE'
cnx_str = 'USERNAME/PWD@localhost:1521/XE'

date = datetime.now().strftime("%Y%m%d")
in_file = f"{g.dirs['IN']}rl_in.csv"
out_file = f"{g.dirs['OUT']}export_RL_{db}_{date}.csv"

# Create the input file from the test file
arr = u.load_csv(f'{files_dir}in.csv')
arr = [elt[0:2] for elt in arr]
u.save_csv(arr, in_file)

# The input query has to be parameterized, i.e. it must contain @@IN@@:
query_in = """
SELECT AFFAIRE, PRM
FROM TEST
WHERE 1=1
AND AFFAIRE IN @@IN@@
"""

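# reqlist is expected to replace @@IN@@ with batches of values taken from the
# input file, e.g. "... AND AFFAIRE IN ('A1', 'A2', ...)" (values shown here
# are purely illustrative).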
reqlist(
    CNX_INFO=cnx_str,
    QUERY_IN=query_in,
    IN_PATH=in_file,
    OUT_PATH=out_file,
)
Example #13
def gen_temp_file():
    # Generate one temporary file from gl.cur_list

    file_nb = gl.c_file
    tmp_path = f"{gl.TMP_DIR}tmp_{file_nb}{gl.FILE_TYPE}"
    u.save_csv(gl.cur_list, tmp_path)