Code example #1
def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    # hdfs_dir_bl
    root_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))
    file_name = str(pathlib.PurePosixPath(root_path).joinpath(configData.get_file_name(f_date_str)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = configData.get_table_name()

    print("Start\n")

    if MyHdfsFile.isfile(a_client, file_name):
        if not configData.get_has_partition():
            sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(file_name, table_name)  # 'test.t1_trxrecprd_v2_zc'
            # alternative: "LOAD DATA INPATH '...' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2"
        else:
            sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(file_name, table_name, p_date_str)  # 'test.t1_trxrecprd_v2_zc'
        print("OK" + "  " + sql+"\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
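
For reference, with the sample values from the inline comments above, run_hive ends up executing statements like these (the table name is taken from the inline comment; purely illustrative):

    LOAD DATA INPATH '/data/posflow/allinpay_utf8_zc/20181101/20181101_loginfo_rsp_bl_new.csv' INTO TABLE test.t1_trxrecprd_v2_zc
    LOAD DATA INPATH '/data/posflow/allinpay_utf8_zc/20181101/20181101_loginfo_rsp_bl_new.csv' INTO TABLE test.t1_trxrecprd_v2_zc PARTITION ( p_date='2018-11-01' )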
Code example #2
def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    root_path = configData.get_hdfs_path()  # "/shouyinbao/bl_shouyinbao/UTF8/"
    file_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" the_date # "_V2.csv"
    table_name = configData.get_table_name()

    print("Start\n")

    idn = 0
    branches = MyHdfsFile.get_child(a_client, str(pathlib.PurePosixPath(root_path).joinpath(f_date_str)))
    for aBranch in branches:
        if MyHdfsFile.check_branch(a_client, aBranch):
            files = MyHdfsFile.get_child(a_client, aBranch)
            f_a_branch = MyHdfsFile.get_name(aBranch)
            for aFile in files:
                if MyHdfsFile.check_file(a_client, aFile, file_name):
                    # '/shouyinbao/bl_shouyinbao/UTF8/20181101/9999997900/t1_trxrecord_20181101_V2.csv'
                    to_file2 = str(pathlib.PurePosixPath(root_path).joinpath(f_date_str, f_a_branch, file_name))
                    if not configData.get_has_partition():
                        sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(to_file2, table_name)  # 'test.t1_trxrecprd_v2_zc'
                        # alternative: "LOAD DATA INPATH '...' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2"
                    else:
                        sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(to_file2, table_name, p_date_str)  # 'test.t1_trxrecprd_v2_zc'
                    idn += 1
                    print(str(idn) + "  " + sql + "\n")
                    cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
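
The MyHdfsFile helper is not shown in these examples; get_child is presumably a thin wrapper around the hdfs package's Client.list, which the docstring in example #3 also uses. A minimal sketch under that assumption:

    def get_child(client, hdfs_path):
        # list(..., status=False) returns bare child names; rejoin them onto the parent path
        return [str(pathlib.PurePosixPath(hdfs_path).joinpath(name))
                for name in client.list(hdfs_path, status=False)]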
Code example #3
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """

    # client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    # dat = client.list('/shouyinbao/', status=False)
    # print(dat)

    # root_path = "/home/bd/桌面/201811_flow/zc_shouyinbao/UNZIP/"
    # dest_dir1 = "/home/bd/桌面/201811_flow/zc_shouyinbao/UTF8/"
    # dest_dir2 = "/shouyinbao/zc_shouyinbao/UTF8/"

    # root_path = "/home/testFolder/logflow/bl_shouyinbao/UNZIP/"
    # dest_dir1 = "/home/testFolder/logflow/bl_shouyinbao/UTF8/"
    # dest_dir2 = "/shouyinbao/zc_shouyinbao/UTF8/"

    # i_file = '/home/testFolder/logflow/bl_shouyinbao/20181101/9999100000/t1_trxrecord_20181101_V2.csv'
    # o_file = '/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv'

    :param configData:
    :return:
    """
    f_date_str = configData.get_f_date()  # "20181101"

    a_client = InsecureClient(configData.hdfs_ip(), user="******")   # "http://10.2.201.197:50070"
    # webhdfs defaults to user dr.who and cannot impersonate other users; this can be changed in the Hadoop config via hadoop.http.staticuser.user=dr.who
    # https://www.cnblogs.com/peizhe123/p/5540845.html
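    # For reference (an assumption about the cluster config, not from this repo), the
    # core-site.xml override for that static user looks roughly like:
    #   <property>
    #     <name>hadoop.http.staticuser.user</name>
    #     <value>hdfs</value>
    #   </property>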
    root_path = os.path.join(configData.get_data_path(), f_date_str)
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))

    print("Start\n")

    f_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" the_date # "_V2.csv"

    branches = MyLocalFile.get_child_dir(root_path)
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            files = MyLocalFile.get_child_file(aBranch)
            f_a_branch = os.path.basename(aBranch)
            for aFile in files:
                if MyLocalFile.check_file(aFile, f_name):
                    to_file1 = str(pathlib.PurePath(dest_dir1).joinpath(f_a_branch, f_name))
                    to_file2 = str(pathlib.PurePosixPath(dest_dir2).joinpath(f_a_branch, f_name))
                    f_add_head = configData.get_hive_add_date(f_a_branch)
                    f_add_end = configData.get_hive_add_date("789")
                    f_need_head = not configData.get_hive_head()  # False
                    MyLocalFile.conv_file_local(aFile, to_file1, need_first_line=f_need_head, p_add_head=f_add_head, p_add_tail=f_add_end, quoting="")
                    MyHdfsFile.safe_make_dir(a_client, to_file2)
                    # client.newupload(to_file2, to_file1, encoding='utf-8')
                    the_file = a_client.status(to_file2, strict=False)
                    if the_file is None:
                        a_client.upload(to_file2, to_file1)  # , encoding='utf-8')
                        a_client.set_permission(to_file2, 777)
                        # a_client.set_owner(thePath, owner='hdfs', group='supergroup')
                    elif the_file['type'].lower() == 'file':  # as opposed to 'directory'
                        a_client.set_permission(to_file2, 777)
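
The status/upload sequence above uploads only when the target is missing; with the same hdfs client API this could also be collapsed into a single call (a sketch, not the author's code):

    # overwrite=True replaces an existing file instead of raising an error
    a_client.upload(to_file2, to_file1, overwrite=True)
    a_client.set_permission(to_file2, 777)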
Code example #4
def run_remove_files(configData: ConfigData):
    f_date_str = configData.get_f_date()  # StrTool.get_the_date_str(the_date, delta_day)  # "20181101"
    data_path = os.path.join(configData.get_data_path(), f_date_str)   # allinpay_data_bl
    utf8_path = os.path.join(configData.get_utf8_path(), f_date_str)   # allinpay_utf8_bl
    hdfs_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))    # hdfs_dir_bl

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    shutil.rmtree(data_path, ignore_errors=True)
    shutil.rmtree(utf8_path, ignore_errors=True)
    try:
        a_client.delete(hdfs_path, recursive=True)
    except Exception:  # ignore cleanup errors, e.g. the HDFS path is already gone
        pass
Code example #5
def run_conv_file_local(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"

    root_path = configData.get_data_path()
    dest_dir = configData.get_utf8_path()

    f_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" the_date # "_V2.csv"

    print("Start\n")

    branches = MyLocalFile.get_child(os.path.join(root_path, f_date_str))
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            files = MyLocalFile.get_child(aBranch)
            for aFile in files:
                if MyLocalFile.check_file(aFile, f_name):
                    MyLocalFile.conv_file_local(aFile, os.path.join(dest_dir, f_date_str, os.path.basename(aBranch), f_name), True)
Code example #6
def run_unzip_file(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"

    zip_path = os.path.join(configData.get_zip_path(), f_date_str)
    # root_path = configData.get_data("allinpay_data_zc")
    data_path = os.path.join(configData.get_data_path(), f_date_str)   # allinpay_data_zc

    #    ifile = '/home/testFolder/logflow/bl_shouyinbao/20181101/9999100000/t1_trxrecord_20181101_V2.csv'
    #    ofile = '/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv'

    print("Start\n")

    # os.path.join(root_path, the_date) # real SYB folder don't have date folder

    f_name = configData.get_zip_name(f_date_str)  # 3= the_date+".zip" # 5 = the_date+"_agt.zip"
    a_file = os.path.join(zip_path, f_name)
    p_name = configData.get_file_name(f_date_str)  # p_date+"*"
    if MyLocalFile.check_file(a_file):
        MyLocalFile.unzip_the_file(a_file, data_path, p_name=p_name)
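
MyLocalFile.unzip_the_file is not shown either; a plausible shape for it, assuming the standard zipfile module and treating p_name as a glob pattern (hypothetical implementation):

    import fnmatch, os, zipfile

    def unzip_the_file(zip_file, dest_dir, p_name=None):
        os.makedirs(dest_dir, exist_ok=True)
        with zipfile.ZipFile(zip_file) as zf:
            for member in zf.namelist():
                # extract only members whose base name matches the pattern, e.g. "t1_trxrecord_20181101*"
                if p_name is None or fnmatch.fnmatch(os.path.basename(member), p_name):
                    zf.extract(member, dest_dir)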
Code example #7
def run_remove_hive(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    del_table = configData.get_table_name()   # "hive_table" + str(configData.the_id) # "rds_posflow.loginfo_rsp_bl"
    print(configData.cdh_ip()+del_table+f_date_str+configData.get_file_name(f_date_str)+configData.hive_ip())
    if not configData.get_has_partition():
        del_file = configData.get_file_name(f_date_str).replace('.', '*.')  # "file_ext" + str(configData.the_id)
        MyHdfsFile.delete_hive_ssh(configData.cdh_ip(), table=del_table, p_name=del_file, username=configData.cdh_user(), password=configData.cdh_pass())

    else:
        conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
        cur = conn.cursor()

        # "ALTER TABLE rds_posflow.t1_trxrecprd_v2_tmp DROP IF EXISTS PARTITION(p_date='2019-02-08') "
        sql = "ALTER TABLE {} DROP IF EXISTS PARTITION( p_date='{}' )".format(del_table, p_date_str)
        print(sql)
        cur.execute(sql)

        cur.close()
        conn.close()
Code example #8
def run_unzip_file(configData: ConfigData, folder_type=2):
    f_date_str = configData.get_f_date()  # "20181101"

    if isinstance(f_date_str, str) and len(f_date_str) == 8:
        m_month = f_date_str[0:6]
        m_day = f_date_str[6:8]
    else:
        return

    zip_path = configData.get_zip_path()
    data_path = configData.get_data_path()

    f_name = configData.get_zip_name("")  # "t1_trxrecord_" the_date # "_V2.csv"

    print("Start\n")

    # os.path.join(root_path, the_date) # real SYB folder don't have date folder
    branches = MyLocalFile.get_child_dir(zip_path)
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            months = MyLocalFile.get_child_dir(aBranch)
            for aMonth in months:
                the_month = MyLocalFile.check_month(aMonth)
                if the_month > 0 and "{:0>6d}".format(the_month) == m_month:
                    day_list = MyLocalFile.get_child_dir(aMonth)
                    for aDay in day_list:
                        the_day = MyLocalFile.check_day(aDay)
                        if the_day > 0 and "{:0>2d}".format(the_day) == m_day:
                            files = MyLocalFile.get_child_file(aDay)
                            for aFile in files:
                                if MyLocalFile.check_file(aFile, p_name=f_name):
                                    short_name = os.path.basename(aBranch)
                                    if folder_type == 1:
                                        new_path = os.path.join(data_path, m_month, m_day, short_name)
                                        # "{:0>6d}".format(month)  "{:0>2d}".format(day)
                                    else:
                                        new_path = os.path.join(data_path, f_date_str, short_name)
                                        # "{:0>6d}{:0>2d}".format(month, day)
                                    p_name = configData.get_file_name(f_date_str)
                                    MyLocalFile.unzip_the_file(aFile, new_path, p_name)
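
The zero-padded comparisons above rely on standard str.format padding; a quick check:

    assert "{:0>6d}".format(201811) == "201811"  # six digits pass through unchanged
    assert "{:0>2d}".format(3) == "03"           # single digits gain a leading zero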
Code example #9
def run_sftp_file(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"

    # allinpay_ftp_folder_bl_1 or allinpay_ftp_folder_bl_2
    f_dir = configData.get_remote_path_ftp(f_date_str)
    # allinpay_data_bl
    t_dir = os.path.join(configData.get_local_path_ftp(), f_date_str)
    # "file_ext" + str(configData.the_id)
    file_name = configData.get_ftp_name(f_date_str)

    a = sftp_tool.Sftp_Tool(h=configData.get_ftp_ip(), p=int(configData.get_ftp_port()),
                            u=configData.get_ftp_user(), s=configData.get_ftp_pass(),
                            r=f_dir, d=t_dir)
    a.openSFTP()
    a.download_files(from_dir=f_dir, to_dir=t_dir, p_name=file_name)
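
sftp_tool.Sftp_Tool is another project-local helper; a minimal sketch of what openSFTP/download_files likely wrap, assuming paramiko (names and behavior are hypothetical):

    import fnmatch, os, paramiko

    def download_files(host, port, user, password, from_dir, to_dir, p_name):
        transport = paramiko.Transport((host, port))
        transport.connect(username=user, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        try:
            os.makedirs(to_dir, exist_ok=True)
            for name in sftp.listdir(from_dir):
                if fnmatch.fnmatch(name, p_name):  # p_name is a glob such as "*20181101*"
                    sftp.get(from_dir + "/" + name, os.path.join(to_dir, name))
        finally:
            sftp.close()
            transport.close()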
Code example #10
def run_conv_file_hdfs(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"

    client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    root_path = configData.get_data_path()  # 'D:/DATA/UNZIP/'
    dest_dir = configData.get_hdfs_path()

    f_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" the_date # "_V2.csv"

    print("Start\n")

    branches = MyLocalFile.get_child(os.path.join(root_path, f_date_str))
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            files = MyLocalFile.get_child(aBranch)
            for aFile in files:
                if MyLocalFile.check_file(aFile, f_name):
                    MyHdfsFile.conv_file_hdfs(aFile,
                                              os.path.join(dest_dir,
                                                           f_date_str,
                                                           os.path.basename(aBranch),
                                                           f_name),
                                              client)
Code example #11
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """

    :param configData:
    :return:
    """
    f_date_str = configData.get_f_date()  # "20181101"
    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    root_path = os.path.join(configData.get_data_path(), f_date_str)                        # allinpay_data_bl
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)                        # allinpay_utf8_bl
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str)) # hdfs_dir_bl
    # file_ext7 = configData.get_data("file_ext7")  # _loginfo_rsp_bl_new.csv   # 20181101_loginfo_rsp_bl_new.csv
    # file_ext8 = configData.get_data("file_ext8")  # _rsp_agt_bl_new.del       # 20181101_rsp_agt_bl_new.del
    # file_ext9 = configData.get_data("file_ext9")  # _rxinfo_rsp_bl.txt        # 20181101_rxinfo_rsp_bl.txt

    # f_list = [file_ext7, file_ext8, file_ext9]

    print("Start\n")

    # "file_ext" + str(configData.the_id)
    file_name = configData.get_file_name(f_date_str).lower()

    files = MyLocalFile.get_child_file(root_path)
    for aFile in files:
        short_name = os.path.basename(aFile).lower()
        f_name = pathlib.PurePath(aFile).name
        if short_name == file_name:
            to_file1 = str(pathlib.PurePath(dest_dir1).joinpath(f_name))
            to_file2 = str(pathlib.PurePosixPath(dest_dir2).joinpath(f_name))
            f_add_date = configData.get_hive_add_date(f_date_str)
            f_need_head = not configData.get_hive_head()
            MyLocalFile.conv_file_local(aFile, to_file1, need_first_line=f_need_head, p_add_head=f_add_date)
            MyHdfsFile.safe_make_dir(a_client, to_file2)
            # a_client.newupload(to_file2, to_file1, encoding='utf-8')
            the_file = a_client.status(to_file2, strict=False)
            if the_file is None:
                a_client.upload(to_file2, to_file1)
                a_client.set_permission(to_file2, 777)
            # a_client.set_owner(thePath,owner='hdfs',group='supergroup')
            elif the_file['type'].lower() == 'file':  # 'directory'
                a_client.set_permission(to_file2, 777)
Code example #12
def run_remove_hive(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"
    # "/user/hive/warehouse/rds_posflow.db/t1_trxrecprd_v2/t1_trxrecord_20181204_V2*.csv"

    del_table = configData.get_table_name()   # hive_table="rds_posflow.t1_trxrecprd_v2"

    if not configData.get_has_partition():
        del_file = configData.get_file_name(f_date_str).replace('.', '*.')
        MyHdfsFile.delete_hive_ssh(configData.cdh_ip(), table=del_table, p_name=del_file, username=configData.cdh_user(), password=configData.cdh_pass())

    else:
        conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
        cur = conn.cursor()

        # "ALTER TABLE rds_posflow.t1_trxrecprd_v2_tmp DROP IF EXISTS PARTITION(p_date=2019-02-08) "
        sql = "ALTER TABLE {} DROP IF EXISTS PARTITION( p_date='{}' )".format(del_table, p_date_str)
        print(sql)
        cur.execute(sql)

        cur.close()
        conn.close()
Code example #13
    if m_is_test:
        m_project_id = 3
        start_date_str = "20180901"
        m_days = 9

        # NOTE: the hard-coded test values above are immediately overridden here
        m_project_id = StrTool.get_param_int(1, 3)
        start_date_str = StrTool.get_the_date_str(StrTool.get_param_str(2, ""))
        m_days = StrTool.get_param_int(3, 1)
    else:
        m_project_id = StrTool.get_param_int(1, 3)
        start_date_str = StrTool.get_the_date_str(StrTool.get_param_str(2, ""))
        m_days = StrTool.get_param_int(3, 1)

    start_date = StrTool.get_the_date(start_date_str)
    the_conf = ConfigData(m_project_id, StrTool.get_the_date_str_by_date(start_date, 0, 10), p_is_test=m_is_test)

    for i in range(0, m_days):
        delta = m_days - i - 1  # no extra day: a run on 20190108 processes the 20190108 folder
        # delta = m_days - i - 1 + 1  # one extra day, because then a run on 20190108 processes the 20190107 folder

        # Shouyinbao files do not need the extra 1-day delta
        # 1. 20190110 191    2019-1-10    2018-7-4
        # 2. 1 20180703 191  2018-7-3     2017-12-25 (evening of 2019-1-24)

        # Baoli (factoring) flow files
        # 2. previously ran up to 20180702: 191 days first, then 20180702 was backfilled manually as one extra day
        # 3. main3.py 7 20180702 70, processed 20180702-20180423 (noon, 2019-1-25)
        # 4. main3.py 8 20180702 70, processed 20180702-20180423 (noon, 2019-1-25)
        # changed the path: remote_path_ftp_7="/ftpdata/thblposloan/posflow2/"
        # 5. main3.py 7 20180420 201      (noon, 2019-1-25)
Code example #14
if __name__ == "__main__":
    # http://10.91.1.21:50070/webhdfs/v1/Project?op=LISTSTATUS&user.name=hdfs
    m_is_test = False

    m_project_id = StrTool.get_param_int(1, 2)
    start_date_str = StrTool.get_the_date_str(StrTool.get_param_str(2, ""))
    m_days = StrTool.get_param_int(3, 1)


    if m_is_test:
        m_project_id = 2
        start_date_str = "20180901"
        m_days = 9  # 190

    start_date = StrTool.get_the_date(start_date_str)
    the_conf = ConfigData(m_project_id, StrTool.get_the_date_str_by_date(start_date, 0, 10), p_is_test=m_is_test)

    if m_project_id == 1:
        return_info = subprocess.run("/app/code/posflow_loader/ftpcmd.sh", shell=True)
        print(return_info.returncode)

    f_delta = the_conf.get_file_date_delta()
    # start_date_str = StrTool.get_the_date_str(start_date_str, - int(f_delta))

    del_range = 30  # how many days of old data to delete
    keep_range = 7  # how many recent days of old data to keep

    for i in range(0, del_range):
        delta = m_days + keep_range + del_range - 1 - i
        date2 = start_date - datetime.timedelta(days=delta)
        m_day_str3 = date2.strftime("%Y-%m-%d")
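
A quick worked check of the cleanup window above (hypothetical values): with m_days=1, keep_range=7 and del_range=30, delta runs from 37 (i=0) down to 8 (i=29), so folders dated 8 to 37 days before start_date are swept while the most recent 7 days are left alone.

    import datetime
    start_date = datetime.date(2018, 11, 1)
    delta = 1 + 7 + 30 - 1 - 0          # i = 0  ->  delta = 37
    date2 = start_date - datetime.timedelta(days=delta)
    print(date2.strftime("%Y-%m-%d"))   # 2018-09-25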