Example #1
import os
import shutil
import time
from datetime import datetime

# Prod_HDFSTools / Test_HDFSTools are the project's HDFS wrapper classes
# (see Example #10 for the Test_HDFSTools import).
def main1():
    prod_hdfs = Prod_HDFSTools(conn_type='prod')
    # recursively download all files under an HDFS directory, e.g.:
    # /user/hive/warehouse/03_basal_layer_hp9clnt200.db/ZTRPT_DWZD
    # /user/hive/warehouse/03_basal_layer_zfybxers00.db/zfybxers00_z_rma_travel_journey_m
    # /user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012
    hdfsDirUrl = 'hdfs:///user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012'
    localDirUrl = '/my_filed_algos/prod_kudu_data/'

    print('* part1 ')
    hdfsFileUrl_ls = prod_hdfs.downLoadDir_recursion(hdfsDirUrl=hdfsDirUrl,
                                                     localDirUrl=localDirUrl)
    print('* part2 ')
    print('*** number of HDFS files to process ==> ', len(hdfsFileUrl_ls))

    if os.path.exists(localDirUrl + 'user'):
        shutil.rmtree(localDirUrl + 'user')

    test_hdfs = Test_HDFSTools(conn_type='test')

    print()
    print('* part3 ')
    x_all = datetime.now()
    for index, hdfs_file_url in enumerate(hdfsFileUrl_ls):

        hdfs_file_url = str(hdfs_file_url)
        print(
            f'HDFS files to process: {len(hdfsFileUrl_ls)}, hdfsFileUrl_ls index => {index}'
        )
        print('prod hdfs_file_url => ', hdfs_file_url)
        local_file_name = hdfs_file_url.replace('hdfs://nameservice1/',
                                                localDirUrl)
        print('local_file_name => ', local_file_name)
        hdfs_file_url = hdfs_file_url.replace('hdfs://nameservice1/user',
                                              'hdfs:///user')
        print('test hdfs_file_url => ', hdfs_file_url)

        time.sleep(0.01)
        x = datetime.now()
        prod_hdfs.downLoadFile2(hdfs_file_url, local_file_name)
        print('*** downloading one HDFS file took ' + str(datetime.now() - x))

        # convert the upload path to lowercase
        hdfs_file_url = hdfs_file_url.lower()

        time.sleep(0.01)
        x = datetime.now()
        test_hdfs.uploadFile2(hdfsDirPath=hdfs_file_url,
                              localPath=local_file_name)

        if os.path.exists(local_file_name):
            os.remove(local_file_name)
            print(f'delete file {local_file_name}')

        print('*** uploading one HDFS file took ' + str(datetime.now() - x))
        print('')

    print('total time: ' + str(datetime.now() - x_all))
    print('--- ok , completed work ---')
    prod_hdfs.shutdownJVM()
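
As an aside, here is a minimal sketch of what the three path rewrites in the loop above do to a single file URL; the sample file name (part-00000) is hypothetical:

# hypothetical URL as returned by downLoadDir_recursion
hdfs_file_url = 'hdfs://nameservice1/user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012/part-00000'

# local download target: swap the prod scheme/authority for the local base dir
local_file_name = hdfs_file_url.replace('hdfs://nameservice1/', '/my_filed_algos/prod_kudu_data/')
# -> /my_filed_algos/prod_kudu_data/user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/BIC/AOCCW01012/part-00000

# upload target on the test cluster: drop the authority and lowercase the path
test_url = hdfs_file_url.replace('hdfs://nameservice1/user', 'hdfs:///user').lower()
# -> hdfs:///user/hive/warehouse/02_logical_layer_001_o_lf_cw.db/bic/aoccw01012/part-00000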
Example #2
def main():
    # 66,880 records in total; took 1443 sec
    check_meeting_data()

    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)

    refresh_linshi_table()

    init_file()
    print('--- ok ---')
Example #3
def upload_hdfs_file():
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)

    for year in [
            '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'
    ]:
        dest_file = get_dest_file2(year)
        upload_hdfs_path = get_upload_hdfs_path(year)

        test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path,
                              localPath=dest_file)
Example #4
def main():
    """
    Processed 22,304 records; the operation took 12 sec in total.

    """
    check_car_linshi_data()

    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)

    refresh_linshi_table()

    init_file()
    print('--- ok ---')
Example #5
def main():
    """


    """
    check_linshi_office_data()

    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)

    refresh_linshi_table()

    #init_file()
    print('--- office-expense temp table data has finished loading, ok ---')
Example #6
def check_prod_hdfs():
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    travel_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_travel_linshi_analysis'
    office_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_offical_linshi_analysis'
    car_url = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_car_linshi_analysis/car_data.txt'
    test_hdfs.ls(travel_url)
Example #7
def upload_file():
    test_hdfs = Test_HDFSTools(conn_type='test')
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
Example #8
def check_linshi_meeting_data(query_date=query_date):
    init_file()

    columns_ls = [
        'finance_meeting_id', 'meet_addr', 'sales_name', 'sales_addressphone',
        'sales_bank', 'sales_taxno', 'invo_code'
    ]
    columns_str = ",".join(columns_ls)

    sql = """
        select {columns_str}
    from 01_datamart_layer_007_h_cw_df.finance_meeting_bill 
    where  !(sales_name is null and sales_addressphone is null and sales_bank is null and sales_taxno is null and meet_addr is null) AND pstng_date >= '{query_date}'
        """.format(columns_str=columns_str, query_date=query_date)

    # log.info(sql)
    count_sql = 'select count(a.finance_meeting_id) from ({sql}) a'.format(
        sql=sql)
    log.info(count_sql)
    records = prod_execute_sql(conn_type=CONN_TYPE,
                               sqltype='select',
                               sql=count_sql)
    count_records = int(records[0][0])
    log.info(f'* count_records ==> {count_records}')

    max_size = 2 * 10000
    limit_size = 10000
    select_sql_ls = []

    if count_records >= max_size:
        offset_size = 0
        while offset_size <= count_records:
            if offset_size + limit_size > count_records:
                limit_size = count_records - offset_size

                tmp_sql = """
            select {columns_str}
            from 01_datamart_layer_007_h_cw_df.finance_meeting_bill 
            where !(sales_name is null and sales_addressphone is null and sales_bank is null and sales_taxno is null and meet_addr is null) AND pstng_date >= '{query_date}'
                order by finance_meeting_id limit {limit_size} offset {offset_size}
                    """.format(columns_str=columns_str,
                               limit_size=limit_size,
                               offset_size=offset_size,
                               query_date=query_date)

                select_sql_ls.append(tmp_sql)
                break
            else:
                tmp_sql = """
            select {columns_str}
            from 01_datamart_layer_007_h_cw_df.finance_meeting_bill 
            where !(sales_name is null and sales_addressphone is null and sales_bank is null and sales_taxno is null and meet_addr is null ) AND pstng_date >= '{query_date}'
                order by finance_meeting_id limit {limit_size} offset {offset_size}
                    """.format(columns_str=columns_str,
                               limit_size=limit_size,
                               offset_size=offset_size,
                               query_date=query_date)

                select_sql_ls.append(tmp_sql)

            offset_size = offset_size + limit_size
    else:
        tmp_sql = """
            select {columns_str}
            from 01_datamart_layer_007_h_cw_df.finance_meeting_bill 
            where !(sales_name is null and sales_addressphone is null and sales_bank is null and sales_taxno is null and meet_addr is null) AND pstng_date >= '{query_date}'
            """.format(columns_str=columns_str, query_date=query_date)

        select_sql_ls.append(tmp_sql)
        # print('*** tmp_sql => ', tmp_sql)

    log.info(f'*** starting paged queries, {len(select_sql_ls)} pages in total')
    start_time = time.perf_counter()

    # threadPool = ThreadPoolExecutor(max_workers=10, thread_name_prefix="thr")
    # all_task = [threadPool.submit(exec_task, (sel_sql)) for sel_sql in select_sql_ls]
    # wait(all_task, return_when=ALL_COMPLETED)
    # threadPool.shutdown(wait=True)

    if count_records > 0:
        pool = Pool(10)
        results = []
        for sel_sql in select_sql_ls:
            rst = pool.spawn(exec_task, sel_sql)
            results.append(rst)

        gevent.joinall(results)

        consumed_time = round(time.perf_counter() - start_time)
        log.info(f'* processed {count_records} records; total time {consumed_time} sec')

        test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
        test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path,
                              localPath=dest_file)

        refresh_linshi_table()

        init_file()
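
The three SQL branches in check_linshi_meeting_data differ only in whether a LIMIT/OFFSET clause is appended and in the size of the last page. A minimal hedged sketch of the same pagination idea, factored into a helper (the helper name and signature are illustrative, not part of the original code):

def build_paged_sqls(base_sql, order_col, count_records, limit_size=10000, max_size=20000):
    # below max_size a single unpaged statement is enough
    if count_records < max_size:
        return [base_sql]
    sqls = []
    offset_size = 0
    while offset_size < count_records:
        # shrink the last page so it never reads past the total row count
        page_limit = min(limit_size, count_records - offset_size)
        sqls.append(f'{base_sql} order by {order_col} limit {page_limit} offset {offset_size}')
        offset_size += limit_size
    return sqls

Called with the meeting-bill SELECT as base_sql and finance_meeting_id as order_col, this produces essentially the same list of statements that the while loop above assembles by hand.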
Example #9
def upload_hdfs_file(year):
    dest_file = get_dest_file(year)
    upload_hdfs_path = get_upload_hdfs_path(year)
    test_hdfs = Test_HDFSTools(conn_type=CONN_TYPE)
    test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)
Example #10
# -*- coding: utf-8 -*-

from report.commons.logging import get_logger
from report.commons.test_hdfs_tools import HDFSTools as Test_HDFSTools

log = get_logger(__name__)

dest_file = "/you_filed_algos/app/doc/finance_province_city.txt"
upload_hdfs_path = 'hdfs:///user/hive/warehouse/02_logical_layer_007_h_lf_cw.db/finance_province_city/finance_province_city.txt'

conn_type = 'prod'  # 'test' or 'prod'
test_hdfs = Test_HDFSTools(conn_type=conn_type)
test_hdfs.uploadFile2(hdfsDirPath=upload_hdfs_path, localPath=dest_file)