Ejemplo n.º 1
0
def left_join_arrays(ar_left_in, ar_right_in):
    """Prepare both input arrays, then join them and log the output size.

    - ar_left_in: left array, passed to prepare_array and saved in gl.OUT_LEFT
    - ar_right_in: right array, first checked by check_void_right_array, then
      passed to prepare_array and saved in gl.OUT_RIGHT

    Nothing is returned: the size of gl.out_array is logged at the end, so
    the result is presumably accumulated there by the compare_* helpers
    (TODO confirm).
    """

    check_void_right_array(ar_right_in)
    u.log("Preparing left array...")
    (ar_left, first_line_l) = prepare_array(ar_left_in)
    u.save_csv(ar_left, gl.OUT_LEFT)
    log_prepare(gl.OUT_LEFT, u.big_number(len(ar_left)))

    u.log("Preparing right array...")
    (ar_right, first_line_r) = prepare_array(ar_right_in)
    u.save_csv(ar_right, gl.OUT_RIGHT)
    log_prepare(gl.OUT_RIGHT, u.big_number(len(ar_right)))

    u.log("Joining both arrays...")
    init_while_join(first_line_l, first_line_r)
    # gl.END_LEFT / gl.END_RIGHT are not assigned in this function; they are
    # presumably set by init_while_join and the helpers below (TODO confirm).
    while gl.END_LEFT is False or gl.END_RIGHT is False:
        # The statement order matters: each helper receives the keys
        # returned by the previous one.
        (key_l, key_r) = update_key(ar_left, ar_right)
        key_l = compare_inf(key_l, key_r, ar_left)
        (key_l, key_r) = compare_sup(key_l, key_r, ar_left, ar_right)
        key_r = compare_equal(key_l, key_r, ar_left, ar_right)
        # A truthy return of incr_c_l ends the join whatever the END flags
        if incr_c_l(ar_left):
            break
    bn = u.big_number(len(gl.out_array))
    s = f"Output array generated. It has {bn} lines (including header)."
    u.log(s)
Ejemplo n.º 2
0
def finish(out_path, prompt, nb, start_time):
    """Log the outcome of the sort and persist the duplicate lists.

    - out_path: path of the generated output file (only used in the log)
    - prompt: if truthy, key duplicates are handed to prompt_dup_key,
      otherwise they are saved in gl.OUT_DUP_KEY_FILE
    - nb: when different from 0, it is inserted in the name of the pure
      duplicates file
    - start_time: passed to u.get_duration_string for the final log line
    """
    key_dup_count = len(gl.dup_key_list)
    pure_dup_count = len(gl.dup_list)
    written = u.big_number(gl.c_tot_out)
    removed = u.big_number(pure_dup_count)
    u.log(
        f"Output file {out_path} successfully generated"
        f" ({written} lines written, {removed} pure duplicates removed)."
    )

    if pure_dup_count > 0:
        # The file number is only part of the name when it is not 0
        suffix = str(nb) if nb != 0 else ""
        out_dup = gl.OUT_DUP_FILE + suffix + gl.FILE_TYPE
        u.save_csv(gl.dup_list, out_dup)
        u.log(f"Pure duplicates list written in {out_dup}")
        u.log_example(gl.dup_list, "pure duplicates")

    if key_dup_count > 0 and prompt:
        prompt_dup_key(key_dup_count)
    elif key_dup_count > 0:
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"{key_dup_count} key duplicates found. List written in {gl.OUT_DUP_KEY_FILE}")

    duration = u.get_duration_string(start_time)
    u.log(f"[dq] sort_file: end ({duration})")
Ejemplo n.º 3
0
def gen_last_file(out_path):
    """Flush gl.cur_list once the input file has been entirely read.

    gl.c_file is incremented first. If it then equals 1, the sorted list is
    written with gen_out_file(out_path). Otherwise a last temporary file is
    generated with gen_temp_file when gl.cur_list is not empty; when it is
    empty the increment of gl.c_file is undone.
    """
    gl.c_file += 1

    if gl.c_file == 1:
        read = u.big_number(gl.c_sf_read)
        u.log(f"Input file entirely read ({read} lines)."
              " Sorting current list...")
        gl.cur_list.sort()
        u.log("Current list sorted. Generating output file...")
        gen_out_file(out_path)
        u.log(f"Output file saved in {out_path}")
        return

    if len(gl.cur_list) == 0:
        # Nothing left to write: the counter must not include this file
        gl.c_file -= 1
    else:
        read = u.big_number(gl.c_sf_read)
        u.log(f"Input file entirely read ({read} lines)."
              " Sorting last current list...")
        gl.cur_list.sort()
        u.log("Last current list sorted. Generating last temporary file"
              f" (no. {gl.c_file})...")
        gen_temp_file()
        u.log("Temporary file successfully generated")
    u.log(f"{gl.c_file} temporary files created")
Ejemplo n.º 4
0
def log_gen_query_list(elt_list, group_list):
    """Log the number of elements and groups of the generated query list.

    - elt_list: elements to be processed (only its length is used)
    - group_list: groups of elements (only its length is used)
    """
    n_elt = u.big_number(len(elt_list))
    n_group = u.big_number(len(group_list))
    u.log(
        f"Query list built: {n_elt} elements to be processed distributed"
        f" in {n_group} groups ({gl.NB_MAX_ELT_IN_STATEMENT} max per group)."
        f" They will be processed in parallel by {gl.MAX_DB_CNX} connection pools."
    )
Ejemplo n.º 5
0
def finish(out_path):
    """Log where the output file was generated along with the line counters.

    - out_path: path of the generated output file (only used in the log)

    The counters are read from gl.c_1, gl.c_2 and gl.c_out.
    """
    written = u.big_number(gl.c_out)
    read_1 = u.big_number(gl.c_1)
    read_2 = u.big_number(gl.c_2)
    report = [
        f"Output file successfully generated in {out_path}\n",
        f"\t\t{read_1} lines read in file 1\n",
        f"\t\t{read_2} lines read in file 2\n",
        f"\t\t{written} lines written in output file",
    ]
    u.log("".join(report))
Ejemplo n.º 6
0
def found_msg(i, j):
    """Store the matching row in gl.c_row and log where the string was found.

    - i: row number, saved in gl.c_row and reported as the line of the list
    - j: position of the match, reported as column j + 1
    """
    gl.c_row = i
    row_nb = u.big_number(i)
    main_nb = u.big_number(gl.c_main)
    col = j + 1
    if not gl.LINE_PER_LINE:
        msg = (f"String found in buffer no. {main_nb}"
               f" (buffer list no. {gl.c_list}) in col {col}!")
    else:
        msg = (f"String found in line no. {row_nb} of list no. {gl.c_list}"
               f" (global line no. {main_nb}) in col {col}!")
    u.log(msg)
Ejemplo n.º 7
0
def prompt_dup_key(n_dup_key):
    """Warn about key duplicates and let the user choose how to go on.

    - n_dup_key: number of key duplicates, only used in the warning

    Answers 'a' and 'c' save gl.dup_key_list in gl.OUT_DUP_KEY_FILE, answers
    'a' and 'b' end the program with sys.exit(). When gl.TEST_PROMPT_DK is
    truthy no input is requested and 'c' is assumed.
    """
    u.log_print('|')
    count = u.big_number(n_dup_key)
    u.log(f"Warning: {count} different lines with the same research key were identified")
    u.log_example(gl.dup_key_list)

    options = ("\nFile comparison may not work correctly. Here are your options:"
               "\na -> save duplicates list and quit"
               "\nb -> quit without saving duplicates list"
               "\nc -> save duplicates list and continue"
               "\nd -> continue without saving duplicates list")
    if not gl.TEST_PROMPT_DK:
        command = u.log_input(options)
    else:
        # Test mode: show the prompt and answer it automatically
        u.log_print(options)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    u.log_print('|')

    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}")
    if command in ('a', 'b'):
        sys.exit()
Ejemplo n.º 8
0
def log_prepare(ar, bn_ar):
    """Log that an array was prepared and show examples of its duplicates.

    - ar: inserted in the log as the place where the array was saved
    - bn_ar: number of lines of the array, already formatted for display
    """
    dismissed = u.big_number(len(gl.dup_list))
    u.log(f"Array prepared and saved in {ar} ({bn_ar} lines, {dismissed} duplicates dismissed)")
    u.log_example(gl.dup_list)
Ejemplo n.º 9
0
def finish_del_dup(out_list, out_path, open_out):
    """Save the list without duplicates and optionally open the result.

    - out_list: list to be saved
    - out_path: path of the file written with u.save_list
    - open_out: if truthy, out_path is passed to u.startfile once saved
    """
    u.log(f"Saving list without duplicates in '{out_path}'...")
    u.save_list(out_list, out_path)
    n_lines = u.big_number(len(out_list))
    u.log(f"List saved, it has {n_lines} lines")
    if not open_out:
        return
    u.startfile(out_path)
Ejemplo n.º 10
0
def finish_this(start_time):
    """Close the connection, delete the chunk file and log the final figures.

    - start_time: passed to u.get_duration_string for the final log line
    """
    gl.cnx.close()
    os.remove(gl.tmp_file_chunk)
    exported = u.big_number(gl.c_main)
    duration = u.get_duration_string(start_time)
    for msg in (f"{exported} lines exported",
                f"[sql] upload: end ({duration})"):
        u.log(msg)
Ejemplo n.º 11
0
def finish(out_path, start_time):
    """Write the filtered list in out_path and log the end of the filtering.

    - out_path: path of the csv file in which gl.out_list is saved
    - start_time: passed to u.get_duration_string for the final log line

    The output file is opened with u.startfile if gl.OPEN_OUT_FILE is truthy.
    """
    u.log("Filtering over")
    n_read = u.big_number(gl.n_r)
    n_out = u.big_number(gl.n_o)
    u.log(f"{n_read} lines read in the input file and"
          f" {n_out} lines to be written in the output file")

    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    u.log(f"Output file saved in {out_path}")

    duration = u.get_duration_string(start_time)
    u.log(f"[toolFilter] filter: end ({duration})")
    u.log_print()
    if not gl.OPEN_OUT_FILE:
        return
    u.startfile(out_path)
Ejemplo n.º 12
0
def inject():
    """Log the start of the DB injection.

    When gl.ref_chunk is not 0, the message also gives the line from which
    the injection recovers (gl.ref_chunk * gl.NB_MAX_ELT_INSERT).
    """
    msg = "Injecting data in DB"
    if gl.ref_chunk != 0:
        from_line = u.big_number(gl.ref_chunk * gl.NB_MAX_ELT_INSERT)
        msg += f" (recovering from line {from_line})"
    u.log(msg + "...")
Ejemplo n.º 13
0
def finish_xml(out_path, start_time):
    """Log the end of the XML parsing and optionally open the output file.

    - out_path: file passed to u.startfile if gl.OPEN_OUT_FILE is truthy
    - start_time: passed to u.get_duration_string for the log line
    """
    duration = u.get_duration_string(start_time)
    written = u.big_number(gl.N_WRITE)
    u.log(f"[toolParseXML] parse_xml: end ({written} lines written in {duration})")
    u.log_print()
    if not gl.OPEN_OUT_FILE:
        return
    u.startfile(out_path)
Ejemplo n.º 14
0
def write_rows_finish(q_name, i, cnx_nb):
    """Log that all the lines of a query were written.

    - q_name: name of the query; nothing is logged when it is 'MONO'
    - i: number of lines written
    - cnx_nb: connection number, only logged when gl.MAX_DB_CNX is not 1
      and cnx_nb is not 0
    """
    written = u.big_number(i)
    if q_name == 'MONO':
        return

    if gl.MAX_DB_CNX == 1 or cnx_nb == 0:
        msg = f"All lines written for query '{q_name}' ({written} lines written)"
    else:
        msg = (f"All lines written for query '{q_name}'"
               f" ({written} lines written, connection no. {cnx_nb})")
    u.log(msg)
Ejemplo n.º 15
0
def finish_find_dup(dup_list, out_path, open_out):
    """Log the duplicates found and save them when there are any.

    - dup_list: list of duplicates; when empty, only a log is issued
    - out_path: path of the csv file in which dup_list is saved
    - open_out: if truthy, out_path is passed to u.startfile once saved
    """
    n_dup = len(dup_list)
    if n_dup == 0:
        u.log("No duplicates found")
        return

    u.log(f"{u.big_number(n_dup)} duplicates found")
    u.log_example(dup_list)

    u.save_csv(dup_list, out_path)
    u.log(f"List of duplicates saved in {out_path}")
    if not open_out:
        return
    u.startfile(out_path)
Ejemplo n.º 16
0
def del_dup(in_path, out_path, open_out=False):
    """Deletes the duplicates in in_path file

    - out_path: path of the file in which the result is saved
    - open_out: forwarded to finish_del_dup

    If u.has_header is truthy for the loaded list, its first line is kept
    apart and put back in first position of the output.
    """
    from .finish import finish_del_dup

    u.log("[toolDup] del_dup: start")
    u.log(f"Deleting duplicates in file '{in_path}'...")
    in_list = u.load_txt(in_path)
    n_lines = u.big_number(len(in_list))
    u.log(f"File loaded, {n_lines} lines to be analysed")
    if not u.has_header(in_list):
        out_list = del_dup_list(in_list)
    else:
        header, body = in_list[0], in_list[1:]
        out_list = [header] + del_dup_list(body)
    finish_del_dup(out_list, out_path, open_out)
    u.log("[toolDup] del_dup: end")
Ejemplo n.º 17
0
def find_dup(in_path, out_path='', open_out=False, col=0):
    """Finds the duplicates in in_path file

    - out_path: forwarded to init_find_dup, which returns the path to use
    - open_out: forwarded to finish_find_dup
    - col: if the file is a csv, the duplicates will be searched in this column index
    """
    from .init import init_find_dup
    from .finish import finish_find_dup

    u.log("[toolDup] find_dup: start")
    in_list, out_path = init_find_dup(in_path, out_path, col)
    n_lines = u.big_number(len(in_list))
    u.log(f"File loaded, {n_lines} lines to be analysed")
    finish_find_dup(find_dup_list(in_list), out_path, open_out)
    u.log("[toolDup] find_dup: end")
Ejemplo n.º 18
0
def search_cur_list():
    """Search gl.LOOK_FOR in every element of gl.cur_list.

    gl.c_main is incremented for each element read. On the first match,
    found_msg is called with the 1-based row number and the position of the
    match, gl.FOUND is set to True and True is returned. False is returned
    when no element contains the string.
    """
    u.log(f"Temp list no. {gl.c_list} search", 1)
    for row_nb, elt in enumerate(gl.cur_list, start=1):
        gl.c_main += 1
        pos = elt.find(gl.LOOK_FOR)
        if pos == -1:
            continue
        found_msg(row_nb, pos)
        gl.FOUND = True
        return True

    total = u.big_number(gl.c_main)
    u.log(f"Temp list no. {gl.c_list} search over, string not found"
          f" ({total} lines read in total)", 1)
    return False
Ejemplo n.º 19
0
def goto_eof(in_file):
    """Read in_file up to its end and log the last elements read.

    - in_file: object passed to read_file until it returns an empty string

    Only the last gl.N_READ + 1 elements are kept, the final one being the
    empty string that signals the end of the file. They are logged with
    u.log_array, followed by the number of lines (or buffers) read.
    """
    cur_list = []
    line = read_file(in_file)
    # The first element is stripped like all the following ones, otherwise
    # it is the only one logged with its trailing newline
    cur_list.append(line.strip("\n"))
    while line != "":
        line = read_file(in_file)
        cur_list.append(line.strip("\n"))
        # Sliding window: drop the oldest element once the limit is exceeded
        if len(cur_list) > gl.N_READ + 1:
            del cur_list[0]

    u.log_array(cur_list)
    # NOTE(review): the "- 1" presumably discounts the final empty read,
    # assuming read_file increments gl.c_main on every call - confirm
    bn = u.big_number(gl.c_main - 1)
    if gl.LINE_PER_LINE:
        s = f"EOF reached. {bn} lines read."
    else:
        s = f"EOF reached. {bn} buffers of {gl.BUFFER_SIZE} characters read."
    u.log(s)
Ejemplo n.º 20
0
def insert(script):
    """Insert the content of gl.data in DB with script, then empty gl.data.

    - script: statement passed to executemany along with gl.data

    Chunks whose number is below gl.ref_chunk are only counted, not
    inserted. For the others, a "<chunk>_COMMIT_RUNNING" marker is saved in
    gl.tmp_file_chunk before the commit and replaced by the chunk number
    once the commit is done.
    """
    if gl.c_chunk < gl.ref_chunk:
        # Chunk skipped (presumably already inserted by a previous run)
        gl.c_chunk += 1
        gl.data = []
        return

    gl.data = list(map(tuple, gl.data))
    gl.c.executemany(script, gl.data)
    gl.c_chunk += 1
    chunk_nb = str(gl.c_chunk)
    # The marker is written before the commit and overwritten after it
    u.save_csv([f"{chunk_nb}_COMMIT_RUNNING"], gl.tmp_file_chunk)
    gl.cnx.commit()
    u.save_csv([chunk_nb], gl.tmp_file_chunk)
    u.log(f"{u.big_number(gl.c_main)} lines inserted in total")
    # The cursor is renewed after each commit
    gl.c.close()
    gl.c = gl.cnx.cursor()
    gl.data = []
Ejemplo n.º 21
0
def compare_files(in_1, in_2, out_path):
    """Compare two files and tell whether they match.

    - in_1, in_2: files passed to compare_sorted_files
    - out_path: output file, first initialised with u.gen_header

    Returns True if gl.c_diff is 0 after the comparison, False otherwise.
    """
    from .csf import compare_sorted_files

    u.log("[dq] compare_files: start")
    start_time = time()
    u.gen_header(in_1, gl.COMPARE_FIELD, out_path)
    compare_sorted_files(in_1, in_2, out_path)

    files_match = gl.c_diff == 0
    if files_match:
        u.log("Files match")
    else:
        u.log(f"{u.big_number(gl.c_diff)} differences found")

    duration = u.get_duration_string(start_time)
    u.log(f"[dq] compare_files: end ({duration})")

    return files_match
Ejemplo n.º 22
0
def split_needed():
    """Tell whether the output has to be split and ask for confirmation.

    Returns False when the gl.c_out lines fit in at most one file of
    gl.MAX_LINE_SPLIT lines. Otherwise the user is asked to confirm the
    split: True is returned unless the answer is "n", in which case the
    program ends with sys.exit(). When gl.TEST_PROMPT_SPLIT is truthy the
    prompt is only logged and True is returned.
    """
    n_line = gl.c_out
    n_out_files = ceil(n_line / gl.MAX_LINE_SPLIT)
    # "<= 1" rather than "== 1": with no line at all ceil() gives 0 and a
    # split in 0 files must not be proposed
    if n_out_files <= 1:
        return False

    # The file count is computed again with one extra line per additional
    # file (presumably the header repeated in each of them - TODO confirm)
    n_line_2 = n_line + n_out_files - 1
    n_out_files = ceil(n_line_2 / gl.MAX_LINE_SPLIT)
    bn = u.big_number(gl.MAX_LINE_SPLIT)
    s = (f"Input file has more than {bn} lines."
         f" It will be split in {n_out_files} files "
         f"(max file nb set to {gl.MAX_FILE_NB_SPLIT}). Continue? (y/n)")
    if gl.TEST_PROMPT_SPLIT:
        # Test mode: show the prompt and answer it automatically
        u.log(s)
        u.log_print('y (TEST_PROMPT_SPLIT = True)')
        return True
    if u.log_input(s) == "n":
        sys.exit()

    return True
Ejemplo n.º 23
0
def init_equal_diff_bool():
    """Initialise gl.EQUAL and gl.DIFF from the output parameters.

    Matching lines are written (gl.EQUAL = True, gl.DIFF = gl.DIFF_OUT) when
    gl.EQUAL_OUT is truthy and either gl.c_sf_read does not exceed
    gl.MAX_ROW_EQUAL_OUT or the user answers "y" to the warning. In every
    other case gl.EQUAL is False and gl.DIFF is True.
    """
    write_equal = False
    if gl.EQUAL_OUT:
        if gl.c_sf_read <= gl.MAX_ROW_EQUAL_OUT:
            write_equal = True
        else:
            limit = u.big_number(gl.MAX_ROW_EQUAL_OUT)
            question = (f"Warning: file to be compared have more than {limit} lines"
                        " and EQUAL_OUT paramter is set to True.\n"
                        "Do you want to write matching lines in output file ? (y/n)")
            write_equal = u.log_input(question) == "y"

    if write_equal:
        gl.EQUAL = True
        gl.DIFF = gl.DIFF_OUT
    else:
        gl.EQUAL = False
        gl.DIFF = True
Ejemplo n.º 24
0
def finish_sbf(out_path, start_time):
    """Save the lines around the match, if any, and log the end of the search.

    - out_path: file in which the extract of gl.cur_list is saved; it is
      opened with u.startfile if gl.OPEN_OUT_FILE is truthy
    - start_time: passed to u.get_duration_string for the final log line

    When gl.FOUND is falsy nothing is written and only the figures of the
    search are logged.
    """
    if gl.FOUND:
        half = gl.PRINT_SIZE // 2
        # gl.c_row is presumably a 1-based row number, hence the "- 1"
        low_i = max(gl.c_row - 1 - half, 0)
        high_i = gl.c_row - 1 + half
        # NOTE(review): the slice holds at most 2 * (PRINT_SIZE // 2) lines,
        # so it is empty when PRINT_SIZE is 1 - confirm this is intended
        u.save_list(gl.cur_list[low_i:high_i], out_path)
        # The f-string is logged as is: calling str.format() on it would
        # fail whenever out_path contains a brace
        u.log(f"Current list written in {out_path}")
        if gl.OPEN_OUT_FILE:
            u.startfile(out_path)
    else:
        bn = u.big_number(gl.c_main)
        s = (f"EOF reached ({bn} lines, {gl.c_list} temporary lists)"
             f", string '{gl.LOOK_FOR}' not found")
        u.log(s)

    dstr = u.get_duration_string(start_time)
    u.log(f"[toolBF] search_big_file: end ({dstr})\n")
Ejemplo n.º 25
0
def check_max_row(counter):
    """Flush gl.cur_list in a temporary file every gl.MAX_ROW_LIST lines.

    - counter: number of lines read so far; nothing is done unless it is a
      multiple of gl.MAX_ROW_LIST

    Limiting the size of gl.cur_list to gl.MAX_ROW_LIST lines is meant to
    avoid a memory error. When the limit is reached, gl.c_file is
    incremented, the list is sorted, written with gen_temp_file and emptied.
    """
    if counter % gl.MAX_ROW_LIST != 0:
        return

    gl.c_file += 1
    limit = u.big_number(gl.MAX_ROW_LIST)
    u.log(f"Maximum number of lines reached ({limit} lines) for list"
          f" no. {gl.c_file}, sorting...")
    gl.cur_list.sort()
    u.log("Current list sorted. Generating temporary file"
          f" no. {gl.c_file}...")
    gen_temp_file()
    u.log("Temporary file successfully generated, input file reading goes on...")
    # A new list is started for the next lines read
    gl.cur_list = []