Example #1
def left_join_files(lpath='', rpath='', out='', debug=False):
    """Joints two files (lpath and rpath) on the first column of each file"""
    # u (log/CSV helpers) and gl (globals) presumably come from module-level
    # imports not shown in this excerpt
    from time import time
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    start_time = time()
    if debug:
        gl.DEBUG_JOIN = True
    if lpath or rpath:
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        ar_right = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        u.log("Loading right arrays...")
        ar_right = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")
    left_join_arrays(gl.ar_in, ar_right)
    if not out:
        out = gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    s = f"Output file saved in {out}"
    u.log(s)
    dstr = u.get_duration_string(start_time)
    u.log(f"[rl] left_join_files: end ({dstr})")
    u.log_print('|')
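
A minimal, self-contained sketch of the join semantics the docstring describes (a left join keyed on the first column of each array); the rows below are hypothetical and the partools helpers are not used:

# Hypothetical data: join `right` onto `left` using the first column as the key
left = [["id", "name"], ["1", "alice"], ["2", "bob"]]
right = [["id", "city"], ["1", "paris"]]

lookup = {row[0]: row[1:] for row in right[1:]}
joined = [left[0] + right[0][1:]]  # merged header
for row in left[1:]:
    joined.append(row + lookup.get(row[0], [""] * (len(right[0]) - 1)))
# joined == [['id', 'name', 'city'], ['1', 'alice', 'paris'], ['2', 'bob', '']]
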
Example #2
def group_by():
    out_path = gl.OUT_PATH
    header = u.get_header(out_path, True)
    vol_fields = [elt for elt in header if is_vol_field(elt)]
    if not vol_fields:
        return
    gl.COUNT = True
    vol_field = vol_fields[0]

    if not gl.MERGE_OK or not gl.range_query:
        return

    u.log('Group by on output file...')

    array_in = u.load_csv(out_path)
    gb_fields = [elt for elt in header if not is_vol_field(elt)]
    if gb_fields:
        import pandas as pd
        df = pd.DataFrame(data=array_in[1:], columns=header)
        df[vol_field] = df[vol_field].astype(int)
        df = df.groupby(by=gb_fields).sum()
        df = df.sort_values(by=vol_field, ascending=False)
        df.to_csv(path_or_buf=gl.OUT_PATH, sep=';', encoding='UTF-8')
    else:
        # If this is a simple count result without a GROUP BY statement, the
        # counts from the different files are summed directly (no pandas needed)
        cur_list = [int(elt[0]) for elt in array_in[1:]]
        out = [array_in[0], [str(sum(cur_list))]]
        u.save_csv(out, gl.OUT_PATH)
    u.log('Group by over')
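
The pandas branch above reduces to a group-by on the non-volume columns with a sum of the volume column; a minimal sketch of the same pattern, with hypothetical column names:

import pandas as pd

# One hypothetical group-by column ("dept") and one volume column ("vol")
rows = [["dept_a", "3"], ["dept_b", "5"], ["dept_a", "2"]]
df = pd.DataFrame(rows, columns=["dept", "vol"])
df["vol"] = df["vol"].astype(int)                    # volumes are loaded as strings
df = df.groupby(by=["dept"]).sum()                   # sum the volumes per group
df = df.sort_values(by="vol", ascending=False)       # largest groups first
df.to_csv("grouped.csv", sep=";", encoding="UTF-8")  # same CSV conventions as above
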
Example #3
def init(kwargs):

    u.init_kwargs(gl, kwargs)
    init_globals()
    u.check_header(gl.IN_PATH)
    u.log(f"Loading input array from '{gl.IN_PATH}'...")
    gl.ar_in = u.load_csv(gl.IN_PATH)
    u.log("Input array loaded")
    u.log_print('|')
Example #4
def test_rl():
    u.init_log('test_rl', True)
    if not ts.is_test_db_defined():
        return

    u.mkdirs(gl.TMP_DIR, True)
    u.mkdirs(ts.gl.TMP_DIR, True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print('Test join', dashes=100)
    tr.left_join_files(gl.LEFT_1, gl.RIGHT_1, gl.OUT_JOIN_REF_1)
    tr.left_join_files(gl.LEFT_2, gl.RIGHT_2, gl.OUT_JOIN_REF_2)
    tr.left_join_files(gl.LEFT_3, gl.RIGHT_3, gl.OUT_JOIN_REF_3)

    u.log_print('Preparing DB', dashes=100)
    ts.upload(ts.gl.IN)
    arr = u.load_csv(ts.gl.IN)
    arr = [elt[0] for elt in arr]
    u.save_csv(arr, gl.IN_1)

    u.log_print('Test rl - no sql output', dashes=100)
    t.ttry(tr.reqlist, u.g.E_VA, gl.IN_1, gl.OUT_1, gl.QUERY_NO)

    u.log_print('Test rl - no var in query', dashes=100)
    t.ttry(tr.reqlist, u.g.E_MV, gl.IN_1, gl.OUT_1, gl.QUERY_MV)

    u.log_print('Test rl - missing header', dashes=100)
    u.save_csv(arr[1:], gl.IN_MH)
    t.ttry(tr.reqlist, u.g.E_MH, gl.IN_MH, gl.OUT_1, gl.QUERY_1)

    u.log_print('Test rl - standard', dashes=100)
    tr.reqlist(gl.IN_1, gl.OUT_1, gl.QUERY_1, cnx=1)
    tr.reqlist(gl.OUT_1, gl.OUT_2, gl.QUERY_2)
    dq.file_match(ts.gl.IN, gl.OUT_2, del_dup=True)
    dq.file_match(t.gl.OUT_DUP_TMP, gl.OUT_DUP_REF)

    u.log_print('Test rl - interruption and recovery', dashes=100)
    u.mkdirs(gl.TMP_DIR, True)
    u.log_print()
    args = [gl.OUT_1, gl.OUT_3, gl.QUERY_2]
    tr.reqlist_interrupted(*args, cnx=6)
    tr.reqlist(gl.OUT_1, gl.OUT_3, gl.QUERY_2, True, cnx=6)
    dq.file_match(gl.OUT_2, gl.OUT_3)

    ts.clean_db([ts.gl.T_TEST])

    u.check_log(tr.CL)
Example #5
def test_tools():
    u.init_log('test_tools', True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    u.log_print("Test tools.xml", dashes=100)
    tt.parse_xml()
    dq.file_match(gl.XML_OUT, gl.XML_OUT_REF)

    u.log_print("Test toolSplit", dashes=100)
    tt.split()

    u.log_print("Test toolDup - to.find_dup simple", dashes=100)
    to.find_dup(gl.DUP_IN, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup col", dashes=100)
    to.find_dup(gl.DUP_COL_IN, col=1)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolDup - to.del_dup + shuffle", dashes=100)
    to.shuffle_file(gl.DUP_IN, gl.SHUF_OUT)
    u.log_print()
    to.del_dup(gl.SHUF_OUT, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DEL_DUP_OUT_REF)

    u.log_print("Test toolDup - to.find_dup_list", dashes=100)
    list_in = u.load_csv(gl.DUP_IN)
    dup_list = to.find_dup_list(list_in)
    u.save_csv(dup_list, gl.DUP_OUT)
    dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.log_print("Test toolFilter", dashes=100)
    tt.flt()

    u.log_print("Test BF", dashes=100)
    tt.read_big_file()
    tt.search_big_file()
    bf.sort_big_file(ts.gl.IN, gl.SORT_BF_OUT)
    dq.file_match(ts.gl.IN, gl.SORT_BF_OUT, del_dup=True)

    u.check_log(tt.CL)
Example #6
def init_find_dup(in_path, out_path, col):

    if not out_path:
        tmp_dir = u.g.dirs['TMP'] + gl.TMP_FOLDER
        u.mkdirs(tmp_dir)
        out_path = tmp_dir + gl.TMP_OUT
    s = "Searching duplicates in "
    if col == 0:
        u.log(f"{s} file {in_path}")
        cur_list = u.load_txt(in_path)
    else:
        u.log(f"{s}column no. {col} of file {in_path}")
        cur_list = u.load_csv(in_path)
        cur_list = [x[col - 1] for x in cur_list]
        if u.has_header(cur_list):
            cur_list = cur_list[1:]

    return (cur_list, out_path)
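
The list returned here is presumably what the duplicate search itself (to.find_dup / to.find_dup_list in Example #5) operates on; as an illustration only, a minimal duplicate search over hypothetical values:

from collections import Counter

cur_list = ["A1", "A2", "A1", "A3", "A2"]  # hypothetical values
duplicates = [v for v, n in Counter(cur_list).items() if n > 1]
# duplicates == ['A1', 'A2']
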
Example #7
from datetime import datetime

import partools.utils as u  # assumed alias: provides the load_csv / save_csv helpers used below
from partools.utils import g
from partools.utils import init_log
from partools.rl import reqlist
from partools.quickstart import files_dir

init_log('rl')

db = 'XE'
cnx_str = 'USERNAME/PWD@localhost:1521/XE'

date = datetime.now().strftime("%Y%m%d")
in_file = f"{g.dirs['IN']}rl_in.csv"
out_file = f"{g.dirs['OUT']}export_RL_{db}_{date}.csv"

# Creates input file from test file
arr = u.load_csv(f'{files_dir}in.csv')
arr = [elt[0:2] for elt in arr]
u.save_csv(arr, in_file)

# The input query must be parameterized, i.e. contain the @@IN@@ placeholder:
query_in = """
SELECT AFFAIRE, PRM
FROM TEST
WHERE 1=1
AND AFFAIRE IN @@IN@@
"""

reqlist(
    CNX_INFO=cnx_str,
    QUERY_IN=query_in,
    IN_PATH=in_file,