def merge_file():
    sub_dir_path = BASE_FILE_PATH.get_sub_dir_path()
    input_graph_file_path = BASE_FILE_PATH.get_input_graph_path()
    input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
    grnt_rel_path = BASE_FILE_PATH.get_output_rel_path()
    # Create the local working directory if it does not exist yet.
    os.system('''
        if [ ! -d "{0}" ]; then
            mkdir {0}
        fi
    '''.format(sub_dir_path))
    # Merge the HDFS graph/nodes outputs into single local files.
    if not os.path.exists(input_graph_file_path):
        os.system('touch %s' % input_graph_file_path)
        os.system('hdfs dfs -getmerge %s %s' %
                  (BASE_SPARK.get_hdfs_graph_file_path(), input_graph_file_path))
    if not os.path.exists(input_nodes_file_path):
        os.system('hdfs dfs -getmerge %s %s' %
                  (BASE_SPARK.get_hdfs_nodes_file_path(), input_nodes_file_path))
    if not os.path.exists(grnt_rel_path):
        os.system('touch %s' % grnt_rel_path)
    # Verify that the merged local files match the HDFS byte counts.
    local_graph_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_graph_file_path)).readlines()[0])
    hdfs_graph_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_graph_file_path())).readlines()[0])
    local_nodes_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_nodes_file_path)).readlines()[0])
    hdfs_nodes_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_nodes_file_path())).readlines()[0])
    if local_graph_size != hdfs_graph_size:
        return False
    if local_nodes_size != hdfs_nodes_size:
        return False
    return True
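# A minimal sketch (not part of the pipeline above) of the same size check
# without shelling out to `ls`: os.path.getsize returns the local byte count
# directly, and only the `hdfs dfs -du -s` call still needs a shell. The
# helper name check_merged_size is hypothetical.
def check_merged_size(local_path, hdfs_path):
    local_size = os.path.getsize(local_path)
    # First whitespace-separated field of `hdfs dfs -du -s` is the size.
    hdfs_size = int(
        os.popen('hdfs dfs -du -s %s' % hdfs_path).read().split()[0])
    return local_size == hdfs_size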
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys

from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from datetime import datetime, timedelta

from config import BASE_SPARK

sys.path.append(BASE_SPARK.get_graphframes_path())
from graphframe import GraphFrame

reload(sys)
sys.setdefaultencoding('utf-8')


def get_now_date():
    return datetime.now().strftime('%Y%m%d')


def get_date_before_today():
    today = datetime.now()
    delta = timedelta(days=-1)
    yesterday = today + delta
    return yesterday.strftime('%Y-%m-%d')


def get_batch():
    today = datetime.now()
    delta = timedelta(days=-1)
    yesterday = today + delta
    return yesterday.strftime('%Y%m%d')
stat = merge_file()
if not stat:
    logging.error('Get file from HDFS error!')
    sys.exit(1)
from utils import NODE
# Recursion depth for the relation expansion; defaults to 10 when no
# command-line argument is given.
level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
output_rel_path = BASE_FILE_PATH.get_output_rel_path()
output_eid_mapping_path = BASE_FILE_PATH.get_output_eid_mapping_path()
main(output_rel_path, output_eid_mapping_path, level)
# Replace the previous relation JSON on HDFS with the fresh output.
tmp = os.popen('hdfs dfs -stat %s' %
               BASE_SPARK.get_hdfs_rel_json_path()).readlines()
if len(tmp):
    os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_rel_json_path())
os.system('hdfs dfs -put %s %s' %
          (output_rel_path, BASE_SPARK.get_hdfs_rel_json_path()))
# Replace the previous eid mapping JSON on HDFS with the fresh output.
tmp = os.popen('hdfs dfs -stat %s' %
               BASE_SPARK.get_hdfs_eid_mapping_json_path()).readlines()
if len(tmp):
    os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_eid_mapping_json_path())
os.system('hdfs dfs -put %s %s' %
          (output_eid_mapping_path,
           BASE_SPARK.get_hdfs_eid_mapping_json_path()))
logging.info('=====Processing done at %s!!!=====' % get_date())
            writer.terminate()
            writer.join()
            break
    fp.close()


if __name__ == '__main__':
    logging.info('=====Processing start at %s!!!=====' % get_date())
    stat = merge_file()
    if not stat:
        logging.error('Get file from HDFS error!')
        sys.exit(1)
    from utils import NODE
    # Recursion depth; defaults to 10 when no command-line argument is given.
    level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    output_list_path = BASE_FILE_PATH.get_output_list_path()
    main(output_list_path, level)
    # Replace the previous list JSON on HDFS with the fresh output.
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_list_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_list_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_list_path, BASE_SPARK.get_hdfs_list_json_path()))
    logging.info('=====Processing done at %s!!!=====' % get_date())
def connected_components(base_table_guarantee, base_table_custid_eid,
                         table_exchange_rate):
    grnt_latest_batch = str(
        spark.sql("select max(batch) batch from %s" %
                  base_table_guarantee).take(1)[0]['batch'])
    custid_latest_batch = str(
        spark.sql("select max(batch) batch from %s" %
                  base_table_custid_eid).take(1)[0]['batch'])
    exchange_max_batch = str(
        spark.sql("select cast(max(dw_dat_dt) as string) batch from %s" %
                  table_exchange_rate).take(1)[0]['batch'])
    # Prefer yesterday's exchange rate batch when it is available.
    if exchange_max_batch == yesterday_date:
        exchange_batch = yesterday_date
    else:
        exchange_batch = exchange_max_batch

    # Filter out records that have neither a customer id nor a name on
    # either side of the guarantee relation.
    spark.sql('''
        select *
        from {base_table_guarantee}
        where batch = '{batch}'
          and !((grnt_custid = '' or grnt_custid is null)
                and (grnt_nm = '' or grnt_nm is null))
          and !((bor_custid = '' or bor_custid is null)
                and (bor_nm = '' or bor_nm is null))
    '''.format(base_table_guarantee=base_table_guarantee,
               batch=grnt_latest_batch)).createOrReplaceTempView(
        'base_guarantee_relation')

    # Collect guarantor and borrower nodes and attach the eid via cust_id.
    spark.sql('''
        select t1.grnt_cod as node_id,
               t2.eid,
               case when t1.grnt_custid is not null
                    then substr(t1.grnt_custid, 0, 2)
                    else 'UNKNOWN' end as node_type,
               t1.grnt_nm as node_name
        from (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join (select * from {t_custid_eid} where batch = '{custid_batch}') t2
            on t1.grnt_custid = t2.cust_id
        union
        select t3.bor_cod as node_id,
               t4.eid,
               case when t3.bor_custid is not null
                    then substr(t3.bor_custid, 0, 2)
                    else 'UNKNOWN' end as node_type,
               t3.bor_nm as node_name
        from (select * from base_guarantee_relation where batch = '{grnt_batch}') t3
        left outer join (select * from {t_custid_eid} where batch = '{custid_batch}') t4
            on t3.bor_custid = t4.cust_id
    '''.format(grnt_batch=grnt_latest_batch,
               t_custid_eid=base_table_custid_eid,
               custid_batch=custid_latest_batch)).createOrReplaceTempView(
        'guarantee_tag')

    # Fall back to a name-based eid lookup when the cust_id join missed.
    spark.sql('''
        select a.node_id,
               case when a.eid is not null then a.eid
                    when a.eid is null and a.node_type = 'CM' then b.eid
                    when a.eid is null and a.node_type = 'UNKNOWN' then b.eid
               end as eid,
               a.node_type,
               a.node_name
        from guarantee_tag a
        left outer join (select eid, name from enterprises.hive_enterprises) b
            on a.node_name = b.name
    ''').distinct().createOrReplaceTempView('guarantee_tag_table')

    # Attach the CNY exchange rate; default to 1 when no rate is found.
    spark.sql('''
        select t1.*,
               case when t2.cnv_cny_exr is null then 1
                    else t2.cnv_cny_exr end as cnv_cny_exr
        from (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join (select currency, cnv_cny_exr
                         from {t_ex_rate}
                         where dw_dat_dt = '{exchange_batch}') t2
            on t1.currency = t2.currency
    '''.format(grnt_batch=grnt_latest_batch,
               t_ex_rate=table_exchange_rate,
               exchange_batch=exchange_batch)).createOrReplaceTempView(
        'guarantee_rel_with_currency')

    # Convert guarantee amounts to CNY.
    spark.sql('''
        select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid,
               bor_nm, grnt_ctr_id, guaranteetype, currency, guaranteeamount,
               sgn_dt, data_src,
               cast(cnv_cny_guaranteeamount as decimal(18, 2)) as cnv_cny_guaranteeamount,
               cast(cnv_cny_exr as decimal(18, 2)) as cnv_cny_exr
        from (select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod,
                     bor_custid, bor_nm, grnt_ctr_id, guaranteetype, currency,
                     guaranteeamount, sgn_dt, data_src, batch, cnv_cny_exr,
                     guaranteeamount * cnv_cny_exr as cnv_cny_guaranteeamount
              from guarantee_rel_with_currency)
    ''').createOrReplaceTempView('guarantee_rel_table')

    spark.sql('cache table guarantee_tag_table')
    spark.sql('cache table guarantee_rel_table')

    # Vertices: every distinct node id.
    v = spark.sql('select node_id as id from guarantee_tag_table').distinct()
    v.persist(StorageLevel.MEMORY_AND_DISK_SER)
    # Edges: guarantor -> borrower.
    e = spark.sql('''
        select grnt_cod as src,
               bor_cod as dst,
               'guarantee' as relationship
        from guarantee_rel_table
        where grnt_cod != '' and bor_cod != '' and grnt_cod != bor_cod
    ''').distinct()
    e.persist(StorageLevel.MEMORY_AND_DISK_SER)

    graph = GraphFrame(v, e)
    graph.connectedComponents().createOrReplaceTempView('cc_df')
    cc_count = spark.table('cc_df').select('component').distinct().count()
    logging.info('Connected components finished! Count: %d' % cc_count)

    e.createOrReplaceTempView('e')
    # Tag every edge with the component id of its source node.
    spark.sql('''
        select t2.src, t2.dst, cast(t1.component as string)
        from cc_df t1
        left outer join e t2 on t1.id = t2.src
        where t2.src is not null and t2.dst is not null
    ''').distinct().createOrReplaceTempView('v_1')

    # Attach the full relation attributes to every edge.
    spark.sql('''
        select t1.src, t1.dst,
               named_struct(
                   "dw_stat_dt", cast(t2.dw_stat_dt as string),
                   "grnt_cod", t2.grnt_cod,
                   "grnt_custid", t2.grnt_custid,
                   "grnt_nm", t2.grnt_nm,
                   "bor_cod", t2.bor_cod,
                   "bor_custid", t2.bor_custid,
                   "bor_nm", t2.bor_nm,
                   "grnt_ctr_id", t2.grnt_ctr_id,
                   "guaranteetype", t2.guaranteetype,
                   "currency", t2.currency,
                   "guaranteeamount", cast(t2.guaranteeamount as string),
                   "sgn_dt", cast(t2.sgn_dt as string),
                   "data_src", t2.data_src,
                   "cnv_cny_guaranteeamount", cast(t2.cnv_cny_guaranteeamount as string)
               ) as attr,
               t1.component
        from (select * from v_1) t1
        left outer join (select * from guarantee_rel_table) t2
            on t1.src = t2.grnt_cod and t1.dst = t2.bor_cod
        where t2.grnt_cod is not null and t2.bor_cod is not null
    ''').createOrReplaceTempView('v_component_with_attr')

    # Group parallel edges into a single link with a list of attributes.
    spark.sql('''
        select named_struct("link", array(src, dst), "attrs", attrs) as link,
               component
        from (select src, dst, collect_list(attr) as attrs, component
              from v_component_with_attr
              group by src, dst, component)
    ''').createOrReplaceTempView('v_component')

    # One row per connected component, holding all of its links.
    spark.sql('''
        select component, collect_list(link) as links
        from v_component
        group by component
    ''').createOrReplaceTempView('v_links')
    spark.sql('select component, links from v_links').createOrReplaceTempView(
        'grnt_graph')
    spark.table('grnt_graph').write.mode('overwrite').json(
        BASE_SPARK.get_hdfs_graph_file_path())
    logging.info('Write graph file finished')

    # Collect every node id that appears in any link of a component.
    spark.sql('''
        select explode(t2.link) as node_id
        from grnt_graph t1
        left outer join (select link.link as link, component from v_component) t2
            on t1.component = t2.component
        where t2.component is not null
    ''').distinct().createOrReplaceTempView('graph_nodes')

    spark.sql('''
        select t2.node_id,
               named_struct("node_id", t2.node_id,
                            "eid", t2.eid,
                            "node_type", t2.node_type,
                            "node_name", t2.node_name) as node
        from graph_nodes t1
        left outer join guarantee_tag_table t2 on t1.node_id = t2.node_id
        where t2.node_id is not null and t2.node_id != ''
    ''').createOrReplaceTempView('graph_nodes_with_attr')
    spark.table('graph_nodes_with_attr').write.mode('overwrite').json(
        BASE_SPARK.get_hdfs_nodes_file_path())
    logging.info('Write nodes file finished')
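# Standalone sketch of the GraphFrame contract used above: the vertex
# DataFrame needs an `id` column and the edge DataFrame needs `src`/`dst`.
# GraphFrames' default connectedComponents algorithm also expects a Spark
# checkpoint directory to be set; the path below is a placeholder, not the
# pipeline's real configuration, and the function name is hypothetical.
def run_connected_components(spark, v, e):
    spark.sparkContext.setCheckpointDir('/tmp/graphframes_checkpoints')
    g = GraphFrame(v, e)
    # Returns the vertex DataFrame with an extra `component` column.
    return g.connectedComponents()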
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import logging

from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from datetime import datetime, timedelta

from config import BASE_SPARK

sys.path.append(BASE_SPARK.get_graphframes_path())
from graphframe import GraphFrame

reload(sys)
sys.setdefaultencoding('utf-8')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def get_date_before_today():
    today = datetime.now()
    delta = timedelta(days=-1)
    yesterday = today + delta
    return yesterday.strftime('%Y-%m-%d')


def connected_components(base_table_guarantee, base_table_custid_eid,
                         table_exchange_rate):
    grnt_latest_batch = str(
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from pyspark.sql import SparkSession

from config import BASE_SPARK

if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName('guarantee_relation') \
        .config('spark.log.level', 'WARN') \
        .enableHiveSupport() \
        .getOrCreate()

    hdfs_list_json = BASE_SPARK.get_hdfs_list_json_path()
    # Only load the JSON output if it actually exists on HDFS.
    temp = os.popen('hdfs dfs -stat %s' % hdfs_list_json).readlines()
    if len(temp):
        json_data = spark.read.json('hdfs://%s' % hdfs_list_json)
        json_data.createOrReplaceTempView('output_table')
        spark.table('output_table').write.mode('overwrite').saveAsTable(
            BASE_SPARK.get_output_list_table())
        # Push the result into the HBase-backed Hive table via the Hive CLI.
        os.system(
            'hive -e "INSERT OVERWRITE TABLE {0} SELECT id, value FROM {1};"'
            .format(BASE_SPARK.get_hbase_guarantee_list_table(),
                    BASE_SPARK.get_output_list_table()))
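# A small sketch of an equivalent existence check that avoids parsing the
# output of `hdfs dfs -stat`: `hdfs dfs -test -e` exits with status 0 when
# the path exists. The helper name hdfs_path_exists is hypothetical; the same
# pattern would apply to the -stat checks in the other entry scripts above.
def hdfs_path_exists(path):
    return os.system('hdfs dfs -test -e %s' % path) == 0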
    def __init__(self):
        self.hdfs_node_file_path = BASE_SPARK.get_hdfs_nodes_file_path()
        self.input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
        self.nodes = self.prepare_nodes()
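    # Hypothetical sketch (not the class's actual implementation) of what
    # prepare_nodes might do, assuming the local nodes file is the JSON-lines
    # output of graph_nodes_with_attr above: one object per line with
    # `node_id` and `node` fields, keyed here into a dict.
    def prepare_nodes(self):
        # `json` would be a module-level import in the real file.
        import json
        nodes = {}
        with open(self.input_nodes_file_path) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                record = json.loads(line)
                nodes[record['node_id']] = record['node']
        return nodes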
# -*- coding: utf-8 -*-
import os
import traceback

from pyspark.sql import SparkSession

from config import BASE_SPARK

if __name__ == '__main__':
    try:
        spark = SparkSession \
            .builder \
            .appName('guarantee_relationship') \
            .config('spark.log.level', 'WARN') \
            .enableHiveSupport() \
            .getOrCreate()

        hdfs_rel_json = BASE_SPARK.get_hdfs_rel_json_path()
        temp = os.popen('hdfs dfs -stat %s' % hdfs_rel_json).readlines()
        if len(temp):
            json_data = spark.read.json('hdfs://%s' % hdfs_rel_json)
            json_data.createOrReplaceTempView('output_table')
            spark.table('output_table').write.mode('overwrite').saveAsTable(
                BASE_SPARK.get_output_rel_table())
            os.system(
                'hive -e "INSERT OVERWRITE TABLE test.grnt1 SELECT id, value FROM {0};"'
                .format(BASE_SPARK.get_output_rel_table()))

        hdfs_eid_mapping_json = BASE_SPARK.get_hdfs_eid_mapping_json_path()
        temp = os.popen('hdfs dfs -stat %s' %