Example #1
 def load(self,file,plot=True):
     """Load data from the file given in the path string."""
     self.time = -1
     base = 0.1
     count = 0
     uplim = 31
     
     y = np.fromfile(file,np.int32,-1)
     x = np.arange(0,50001,1,dtype=np.float32)  # the raw trace is assumed to hold 50001 samples (see the 50001 divisor below)
     x = self.convertTime(x)
 #
     #f = ip.interp1d(x,y)
     #p0 = [10., 0.0001, 10000., 2.]
     #p1, cov = op.curve_fit(self.func, x, y, p0)
     #print(p1)
     #p1f = self.func(x, *p1) #Fit of neutron monitor
     #xs = range(200)
     #xs = np.arange(0,35,0.1)
     #ys = np.zeros(350,dtype=np.uint32)
 
     xs = np.arange(0,uplim,base)
     ys = np.zeros(int(round(uplim/base)),dtype=np.uint32)  # np.zeros needs an integer length

     # Rebin the raw counts into 0.1-wide time bins.
     for (i,j) in zip(x,y):
         if i > base:
             ys[int(round(10*base))-1] = count  # round() avoids float drift in the bin index
             base += 0.1
             count = 0
         count += j
     base = 0.1
     count = 0
     uplim = 31
 #
     # Estimate the baseline from rebinned bins 250-299 and subtract the per-sample equivalent from the raw trace.
     ymean = np.mean(ys[250:300])
     ynew = np.array([float(yo)-ymean*(uplim/base)/50001 for yo in y])
 #
     ysnew = np.zeros(int(round(uplim/base)),dtype=np.float32)
     # Rebin the baseline-subtracted counts the same way.
     for (i,j) in zip(x,ynew):
         if i > base:
             ysnew[int(round(10*base))-1] = count
             base += 0.1
             count = 0
         count += j
 #
     if plot:
         graph = GraphFrame(None,"Monitor")
         graph2 = GraphFrame(None,"Monitor Adjusted")
     #        graph.plot(np.arange(0.0,20.0,0.1),f(xs))
     #    graph.plot(xs,ys)
         graph2.plot(xs,ysnew)          
 #
     self.spec = np.expand_dims(np.expand_dims(y,axis=0),axis=0)
Example #2
 def load(self,file,wxon=True):
     """Load data from the file given in the path string."""
     with open(file,"r") as infile:
         infile.readline() # drop first line
         line = infile.readline()
         #Find the time information
         m = re.search("lasted (\d+) milliseconds",line)
         if m:
             self.time = float(m.group(1))
         else:
             raise InvalidFileError
     data = np.loadtxt(file,dtype=np.float32,skiprows=3)
     x = data[:,0]
     y = data[:,1]
     y += 1
     x = self.convertTime(x)
     f = ip.interp1d(x,y) #Interpolation of neutron monitor
     xs = range(200)  # evaluation points; assumes the converted time axis covers 0-199, otherwise interp1d raises
     if wxon:
         graph = GraphFrame(None,"Monitor")
         graph.plot(np.arange(0.0,20.0,0.1),f(xs))
     self.spec = np.expand_dims(np.expand_dims(f(xs),axis=0),axis=0)
Example #3
    CONF = (SparkConf().setMaster("local").setAppName("Prime").set(
        "spark.executor.memory", "4g").set("spark.executor.instances", "4"))

    sc = SparkContext(conf=CONF)
    sqlContext = SQLContext(sc)

    localVertices = [(0, 0, False, 0), (1, 1000, False, -1),
                     (2, 1000, False, -1), (3, 1000, False, -1),
                     (4, 1000, False, -1)]
    localEdges = [(0, 1, 10), (2, 0, 20), (1, 2, 22), (1, 0, 10), (0, 2, 20),
                  (2, 1, 22), (1, 4, 14), (4, 1, 14), (2, 3, 32), (3, 2, 32),
                  (3, 1, 13), (1, 3, 13)]
    v = sqlContext.createDataFrame(localVertices,
                                   ["id", "cost", "flag", "parent"])
    e = sqlContext.createDataFrame(localEdges, ["src", "dst", "dist"])
    g = GraphFrame(v, e)

    def getMinCost(x, y):
        # return whichever [id, cost, flag, parent] record has the lower cost
        if x[1] <= y[1]:
            return x
        else:
            return y

    def update(x):
        # x is a mutable [id, cost, flag, parent] record
        if x[0] == currentVertexId:
            x[2] = True
        if not x[2] and x[0] in currentEdgesDst:
            tmpDist = currentEdgesDist[currentEdgesDst.index(x[0])]
            if x[1] > tmpDist:
                x[1] = tmpDist
                x[3] = currentVertexId
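
    # --- Hypothetical driver loop (not part of the original example) ---
    # The helpers above reference currentVertexId, currentEdgesDst and
    # currentEdgesDist, which are never defined in the snippet. A minimal
    # sketch of a Prim's-style loop that could drive them, assuming the
    # vertices are kept as mutable [id, cost, flag, parent] lists on the
    # driver:
    from functools import reduce  # Python 3: reduce lives in functools

    vertices = [list(row) for row in localVertices]
    for _ in range(len(vertices)):
        unvisited = [row for row in vertices if not row[2]]
        if not unvisited:
            break
        # pick the cheapest unvisited vertex and make it the current one
        currentVertexId = reduce(getMinCost, unvisited)[0]
        # destinations and distances of its outgoing edges, in matching order
        currentEdgesDst = [dst for src, dst, d in localEdges if src == currentVertexId]
        currentEdgesDist = [d for src, dst, d in localEdges if src == currentVertexId]
        for row in vertices:
            update(row)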
Example #4
    def connected_components(self):
        base_table_guarantee = self.base_table_guarantee
        base_table_custid_eid = self.base_table_custid_eid
        table_exchange_rate = self.table_exchange_rate
        today_date = self.today_date
        yesterday_date = self.yesterday_date

        spark.sql('''
            select t1.grnt_custid as node_id, t2.eid, 
                substr(t1.grnt_custid, 0, 2) as node_type, t1.grnt_nm as node_name
            from 
            (select * from {0} where batch = '{2}') t1
            left outer join
            (select * from {1} where batch = '{2}') t2
            on t1.grnt_custid = t2.cust_id
            union
            select t3.bor_custid as node_id, t4.eid, 
                substr(t3.bor_custid, 0, 2) as node_type, t3.bor_nm as node_name
            from
            (select * from {0} where batch = '{2}') t3
            left outer join
            (select * from {1} where batch = '{2}') t4
            on t3.bor_custid = t4.cust_id
            '''.format(base_table_guarantee, base_table_custid_eid,
                       today_date)).createOrReplaceTempView('guarantee_tag')

        spark.sql('''
            select a.node_id, case when a.eid is not null then a.eid when a.node_type = 'CM'
                then b.eid end as eid, a.node_type, a.node_name
            from guarantee_tag a
            left outer join (select eid, name from enterprises.hive_enterprises) b
            on a.node_name = b.name
            ''').distinct().createOrReplaceTempView("guarantee_tag_table")

        spark.sql('''
            select t1.*, case when t2.cnv_cny_exr is null then 1 else t2.cnv_cny_exr end as cnv_cny_exr
            from
            (select * from {0} where batch = '{2}') t1
            left outer join
            (select currency, cnv_cny_exr from {1} where dw_dat_dt = '{3}') t2
            on
                t1.currency = t2.currency
            '''.format(base_table_guarantee, table_exchange_rate, today_date,
                       yesterday_date)).createOrReplaceTempView(
                           'guarantee_rel_with_currency')

        spark.sql('''
            select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src, cast(cnv_cny_guaranteeamount as decimal(18,2)) as cnv_cny_guaranteeamount, cnv_cny_exr
            from
            (select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src, batch, cnv_cny_exr, 
            guaranteeamount * cnv_cny_exr as cnv_cny_guaranteeamount
            from guarantee_rel_with_currency)
            where grnt_custid != bor_custid
            ''').createOrReplaceTempView('guarantee_rel_table')

        spark.sql('cache table guarantee_tag_table')
        spark.sql('cache table guarantee_rel_table')

        v = spark.sql(
            'select node_id as id from guarantee_tag_table').distinct()

        v.persist(StorageLevel.MEMORY_AND_DISK_SER)

        e = spark.sql('''
            select grnt_custid as src , bor_custid as dst, 'guarantee' as relationship
            from guarantee_rel_table
            where grnt_custid != '' and bor_custid != '' and grnt_custid != bor_custid
            ''').distinct()

        e.persist(StorageLevel.MEMORY_AND_DISK_SER)

        graph = GraphFrame(v, e)
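        # Assumption about the surrounding job setup: GraphFrames' default
        # connectedComponents() algorithm requires a Spark checkpoint directory,
        # e.g. spark.sparkContext.setCheckpointDir('/tmp/cc'), set elsewhere.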
        graph.connectedComponents().createOrReplaceTempView('cc_df')
        e.createOrReplaceTempView('e')
        spark.sql('''
            select t2.src, t2.dst, cast(t1.component as string)
            from
            cc_df t1
            left outer join
            e t2
            on t1.id = t2.src
            where t2.src is not null and t2.dst is not null
            ''').distinct().createOrReplaceTempView('v_1')

        spark.sql('''
            select
                t1.src,
                t1.dst,
                named_struct(
                    "dw_stat_dt", cast(t2.dw_stat_dt as string),
                    "grnt_cod", t2.grnt_cod,
                    "grnt_custid", t2.grnt_custid,
                    "grnt_nm", t2.grnt_nm,
                    "bor_cod", t2.bor_cod,
                    "bor_custid", t2.bor_custid,
                    "bor_nm", t2.bor_nm,
                    "grnt_ctr_id", t2.grnt_ctr_id,
                    "guaranteetype", t2.guaranteetype,
                    "currency", t2.currency,
                    "guaranteeamount", cast(t2.guaranteeamount as string),
                    "sgn_dt", cast(t2.sgn_dt as string),
                    "data_src", t2.data_src,
                    "cnv_cny_guaranteeamount", cast(t2.cnv_cny_guaranteeamount as string)
                ) as attr,
                t1.component
            from
                (select * from v_1) t1
                left outer join
                (select * from guarantee_rel_table) t2
                on t1.src = t2.grnt_custid and t1.dst = t2.bor_custid
                where t2.grnt_custid is not null and t2.bor_custid is not null
            ''').createOrReplaceTempView('v_component_with_attr')

        spark.sql('''
            select named_struct(
            "link", array(src, dst),
            "attrs", attrs) as link,
            component
            from
            (select src, dst, collect_list(attr) as attrs, component
            from v_component_with_attr
            group by src, dst, component)
            ''').createOrReplaceTempView('v_component')

        print(
            '===============connected components finished===================')
def prepare_neo4j_data():
    base_table_guarantee = cfg.get('hivetable', 'guarantee_relation_base_table')
    base_table_custid_eid = cfg.get('hivetable', 'custid_eid_base_table')
    table_exchange_rate = cfg.get('hivetable', 'exchange_rate_table')
    table_enterprises = cfg.get('hivetable', 'enterprises_table')
    table_corp_map = cfg.get('hivetable', 'corp_map_table')

    yesterday_date = Utils.get_yesterday_date()
    grnt_latest_batch = str(spark.sql("select max(batch) batch from %s" % base_table_guarantee).take(1)[0]['batch'])
    custid_latest_batch = str(spark.sql("select max(batch) batch from %s" % base_table_custid_eid).take(1)[0]['batch'])
    exchange_max_batch = str(
        spark.sql("select cast(max(dw_dat_dt) as string) batch from %s" % table_exchange_rate).take(1)[0]['batch'])

    if exchange_max_batch == yesterday_date:
        exchange_batch = yesterday_date
    else:
        exchange_batch = exchange_max_batch

    scheme = StructType([
        StructField('corp_name', StringType(), True),
        StructField('corp_id', StringType(), True)
    ])
    spark.udf.register('parse_corp_info', Utils.parse_corp_info, scheme)
    spark.udf.register('gen_uuid', Utils.gen_uuid, StringType())
    spark.udf.register('json_to_string', Utils.json_to_string, StringType())
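    # These UDFs come from the project's Utils module: parse_corp_info splits the
    # corp mapping column into (corp_name, corp_id), gen_uuid tags mutual-guarantee
    # circles below, and json_to_string serialises edge attributes for the CSV export.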

    # filter out data without customer id and name
    spark.sql(
        '''
        select * from {base_table_guarantee} where batch = '{batch}' and 
        !((grnt_custid = '' or grnt_custid is null) and (grnt_nm = '' or grnt_nm is null)) and 
        !((bor_custid = '' or bor_custid is null) and (bor_nm = '' or bor_nm is null))
        '''.format(base_table_guarantee=base_table_guarantee, batch=grnt_latest_batch)
    ).createOrReplaceTempView('base_guarantee_relation')

    # === Nodes start ===

    spark.sql(
        '''
        select t1.grnt_cod as node_id, t2.eid, 
            case when t1.grnt_custid is not null then substr(t1.grnt_custid, 0, 2) else 'UNKNOWN' end as node_type,
            t1.grnt_nm as node_name
        from 
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join
        (select * from {t_custid_eid} where batch = '{custid_batch}') t2
        on t1.grnt_custid = t2.cust_id
        union
        select t3.bor_cod as node_id, t4.eid, 
            case when t3.bor_custid is not null then substr(t3.bor_custid, 0, 2) else 'UNKNOWN' end as node_type,
            t3.bor_nm as node_name
        from
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t3
        left outer join
        (select * from {t_custid_eid} where batch = '{custid_batch}') t4
        on t3.bor_custid = t4.cust_id
        '''.format(grnt_batch=grnt_latest_batch, t_custid_eid=base_table_custid_eid, custid_batch=custid_latest_batch)
    ).createOrReplaceTempView('t_nodes')

    # Supplement eid for nodes by fully matching company name
    spark.sql(
        '''
        select
            a.node_id,
            case
                when a.eid is not null then a.eid
                when a.eid is null and a.node_type = 'CM' then b.eid
                when a.eid is null and a.node_type = 'UNKNOWN' then b.eid
            end as eid,
            a.node_type,
            a.node_name
        from
        t_nodes a
        left outer join
        (select eid, name from {t_enterprises}) b
        on a.node_name = b.name
        '''.format(t_enterprises=table_enterprises)).distinct().createOrReplaceTempView('t_nodes_eid')

    # Add corporation tag for nodes
    spark.sql(
        '''
        select t.eid, t.corp_info.corp_name corp_name, t.corp_info.corp_id corp_id
         from
         (select `qixinbao_id|string` eid, parse_corp_info(`corp_list|string`) corp_info from %s) t
        ''' % table_corp_map).createOrReplaceTempView('t_corp')
    spark.sql(
        '''
        select a.*, b.corp_name, b.corp_id
        from
        t_nodes_eid a
        left outer join
        t_corp b
        on a.eid = b.eid
        ''').createOrReplaceTempView('guarantee_corp')

    spark.sql(
        '''
        select corp_id, count(*) corp_cnt, concat_ws(';', (collect_set(node_id))) corp_list
        from guarantee_corp
        where corp_id is not null
        group by corp_id
        ''').createOrReplaceTempView('corp_info')

    spark.sql(
        '''
        select a.*, b.corp_cnt
        from
        guarantee_corp a
        left outer join
        corp_info b
        on a.corp_id = b.corp_id
        ''').createOrReplaceTempView('guarantee_nodes')

    spark.sql(
        '''
        select b.*, a.corp_name, b.corp_id node_id
        from
        (select corp_id, corp_name from guarantee_corp where corp_id is not null) a
        left outer join
        corp_info b
        on a.corp_id = b.corp_id
        where b.corp_id is not null and b.corp_cnt > 1
        ''').distinct().createOrReplaceTempView('corp_nodes')

    # === Nodes end ===

    # === Edges start ===

    spark.sql(
        '''
        select t1.*, case when t2.cnv_cny_exr is null then 1 else t2.cnv_cny_exr end as cnv_cny_exr
        from
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join
        (select currency, cnv_cny_exr from {t_ex_rate} where dw_dat_dt = '{exchange_batch}') t2
        on t1.currency = t2.currency
        '''.format(grnt_batch=grnt_latest_batch, t_ex_rate=table_exchange_rate, exchange_batch=exchange_batch)
    ).createOrReplaceTempView('t_rel')

    spark.sql(
        '''
        select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src,
            cast(cnv_cny_guaranteeamount as decimal(18,2)) as cnv_cny_guaranteeamount,
            cast(cnv_cny_exr as decimal(18, 2)) as cnv_cny_exr
        from
        (select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src, batch, cnv_cny_exr, 
            guaranteeamount * cnv_cny_exr as cnv_cny_guaranteeamount
        from t_rel)
        where grnt_cod != bor_cod
        ''').createOrReplaceTempView('t_rel_cny')

    spark.sql(
        '''
        select grnt_cod, bor_cod, grnt_nm, bor_nm, map(
            "dw_stat_dt", cast(dw_stat_dt as string), 
            "grnt_custid", grnt_custid,
            "bor_custid", bor_custid,
            "grnt_ctr_id", grnt_ctr_id,
            "guaranteetype", guaranteetype, 
            "currency", currency, 
            "guaranteeamount", guaranteeamount, 
            "sgn_dt", cast(sgn_dt as string), 
            "data_src", data_src,
            "cnv_cny_guaranteeamount", cnv_cny_guaranteeamount,
            "cnv_cny_exr",cnv_cny_exr
        ) as attr, cnv_cny_guaranteeamount
        from t_rel_cny
        ''').createOrReplaceTempView('t_rel_concat_attr')

    spark.sql(
        '''
        select grnt_cod, bor_cod, grnt_nm as src_name, bor_nm as dst_name, 
            concat_ws(";", collect_list(json_to_string(attr))) as attrs, count(*) as times,
            sum(cnv_cny_guaranteeamount) as amount
        from t_rel_concat_attr
        group by grnt_cod, bor_cod, grnt_nm, bor_nm
        ''').createOrReplaceTempView('t_grnt_rel')

    # build graph
    v = spark.sql('select node_id as id from guarantee_nodes').distinct()

    v.persist(StorageLevel.MEMORY_AND_DISK_SER)

    e = spark.sql(
        '''
        select grnt_cod as src , bor_cod as dst, 'guarantee' as relationship
        from t_rel_cny
        where grnt_cod != '' and bor_cod != '' and grnt_cod != bor_cod
        ''').distinct()

    e.persist(StorageLevel.MEMORY_AND_DISK_SER)

    sys.path.append(cfg.get('graphframes', 'graphframes_path'))
    from graphframe import GraphFrame
    graph = GraphFrame(v, e)
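    # Motif search: find vertex pairs that guarantee each other ((a)->(b) and (b)->(a));
    # each pair shares a circle_id and is tagged rel_type 2 (one-way guarantees
    # default to rel_type 1 below).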
    graph.find('(a)-[e1]->(b); (b)-[e2]->(a)') \
        .selectExpr('a.id src', 'b.id dst', 'gen_uuid(a.id, b.id) circle_id', '2 rel_type') \
        .createOrReplaceTempView('each_other')

    # Add attribute for edges
    spark.sql(
        '''
        select a.*, b.circle_id, case when b.rel_type is not null then b.rel_type else 1 end as rel_type
        from
        t_grnt_rel a
        left outer join
        each_other b
        on (a.grnt_cod = b.src and a.bor_cod = b.dst)
        ''').createOrReplaceTempView('guarantee_edges')

    spark.sql(
        '''
        select node_id, corp_id, node_name, corp_name
        from guarantee_nodes
        where corp_id is not null and corp_cnt > 1
        ''').distinct().createOrReplaceTempView('belong_rel')

    spark.sql('select grnt_cod src, bor_cod dst from guarantee_edges').createOrReplaceTempView('grnt_rel')

    spark.sql(
        '''
        select a.node_id src, a.corp_id src_corp_id, c.node_id dst, c.corp_id dst_corp_id, a.corp_name as src_name,
            c.corp_name as dst_name
        from
        belong_rel a
        left outer join
        grnt_rel b
        on a.node_id = b.src
        left outer join
        belong_rel c
        on b.dst = c.node_id
        where c.node_id is not null
        ''').createOrReplaceTempView('corp_rel_temp')

    spark.sql(
        '''
        select a.*, b.times, b.amount
        from
        corp_rel_temp a
        left outer join
        guarantee_edges b
        on a.src = b.grnt_cod and a.dst = b.bor_cod
        where b.grnt_cod is not null and b.bor_cod is not null
        ''').createOrReplaceTempView('corp_rel_detail')

    spark.sql(
        '''
        select src_corp_id, dst_corp_id, src_name, dst_name, sum(amount) amount, sum(times) times
        from corp_rel_detail
        group by src_corp_id, dst_corp_id, src_name, dst_name
        ''').createOrReplaceTempView('corp_rel')

    # edges for guarantee entity and corp entity
    v_2 = spark.sql(
        '''
        select node_id as id, 'guarantee' as type from guarantee_nodes
        union 
        select corp_id as id, 'corp' as type from corp_nodes where corp_cnt > 1
        ''').distinct()

    v_2.persist(StorageLevel.MEMORY_AND_DISK_SER)

    e_2 = spark.sql(
        '''
        select grnt_cod as src , bor_cod as dst, 'guarantee_rel' as relationship, grnt_nm as name_a, 
            bor_nm as name_b
        from t_rel_cny
        where grnt_cod != '' and bor_cod != '' and grnt_cod != bor_cod
        union
        select node_id as src, corp_id as dst, 'belong_rel' as relationship, node_name as name_a, corp_name as name_b
        from belong_rel
        ''').distinct()

    e_2.persist(StorageLevel.MEMORY_AND_DISK_SER)

    graph_2 = GraphFrame(v_2, e_2)
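    # Motif searches: part1 keeps chains where a guarantees b and b belongs to corp c
    # (used below for entity -> corp edges); part2 keeps chains where b guarantees a
    # and b belongs to corp c (corp -> entity edges).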
    graph_2.find('(a)-[e1]->(b); (b)-[e2]->(c)')\
        .filter('e1.relationship = "guarantee_rel" and e2.relationship = "belong_rel"')\
        .selectExpr('a.id as entity_id', 'b.id as entity_fd_id', 'c.id as corp_id', 'e1.name_a as src_name', 'e2.name_b as dst_name')\
        .createOrReplaceTempView('entity_to_corp_part1')

    graph_2.find('(b)-[e1]->(a); (b)-[e2]->(c)') \
        .filter('e1.relationship = "guarantee_rel" and e2.relationship = "belong_rel"') \
        .selectExpr('a.id as entity_id', 'b.id as entity_fd_id', 'c.id as corp_id', 'e2.name_b as src_name', 'e1.name_b as dst_name') \
        .createOrReplaceTempView('entity_to_corp_part2')

    spark.sql(
        '''
        select a.entity_fd_id, b.times, b.amount, a.entity_id src, a.corp_id dst, a.src_name, a.dst_name
        from
        entity_to_corp_part1 a
        left outer join
        guarantee_edges b
        on a.entity_id = b.grnt_cod and a.entity_fd_id = b.bor_cod
        where b.grnt_cod is not null and b.bor_cod is not null
        union
        select a.entity_fd_id, b.times, b.amount, a.corp_id src, a.entity_id dst, a.src_name, a.dst_name
        from
        entity_to_corp_part2 a
        left outer join
        guarantee_edges b
        on a.entity_fd_id = b.grnt_cod and a.entity_id = b.bor_cod
        where b.grnt_cod is not null and b.bor_cod is not null
        ''').createOrReplaceTempView('entity_to_corp_info')

    spark.sql(
        '''
        select src, dst, src_name, dst_name, collect_set(entity_fd_id) as first_degree_ids, sum(times) as times, 
            sum(amount) as amount
        from entity_to_corp_info
        group by src, dst, src_name, dst_name
        ''').createOrReplaceTempView('entity_to_corp_agg')

    spark.sql(
        '''
        select a.entity_id src, a.entity_fd_id dst
        from
        entity_to_corp_part1 a
        left outer join
        guarantee_edges b
        on a.entity_id = b.grnt_cod and a.entity_fd_id = b.bor_cod
        where b.grnt_cod is not null and b.bor_cod is not null
        union
        select a.entity_fd_id src, a.entity_id dst
        from
        entity_to_corp_part2 a
        left outer join
        guarantee_edges b
        on a.entity_fd_id = b.grnt_cod and a.entity_id = b.bor_cod
        where b.grnt_cod is not null and b.bor_cod is not null
        ''').createOrReplaceTempView('entity_related_to_corp')

    spark.sql(
        '''
        select a.*
        from
        guarantee_edges a
        left outer join
        entity_related_to_corp b
        on a.grnt_cod = b.src and a.bor_cod = b.dst
        where b.src is null and b.dst is null
        ''').createOrReplaceTempView('entity_to_entity')

    # === Edges end ===

    # Nodes CSV
    nodes_guarantee_csv = cfg.get('csv', 'nodes_guarantee')
    nodes_corp_csv = cfg.get('csv', 'nodes_corp')
    # Edges CSV
    edges_guarantee_csv = cfg.get('csv', 'edges_guarantee')
    edges_belong_csv = cfg.get('csv', 'edges_belong')
    edges_corp_csv = cfg.get('csv', 'edges_corp')
    edges_corp_agg_csv = cfg.get('csv', 'edges_corp_agg')

    # Filter null value

    spark.sql(
        '''
        select node_id `node_id:ID`, eid, node_type, node_name, corp_name, corp_id, corp_cnt
        from guarantee_nodes
        ''').write.mode('overwrite').option('sep', '\001').csv(nodes_guarantee_csv)
    logging.info('Write nodes guarantee csv finished!')

    spark.sql(
        '''
        select corp_id `corp_id:ID`, corp_cnt, corp_list `corp_list:string[]`, corp_name, node_id, 'CO' node_type
        from corp_nodes
        ''').write.mode('overwrite').option('sep', '\001').csv(nodes_corp_csv)
    logging.info('Write nodes corp csv finished!')

    spark.sql(
        '''
        select grnt_cod `grnt_cod:START_ID`, bor_cod `bor_cod:END_ID`, grnt_cod src, bor_cod dst,
            attrs `attrs:string[]`, case when circle_id is null then '' else circle_id end circle_id, rel_type, times,
            amount, src_name, dst_name
        from guarantee_edges
        ''').write.mode('overwrite').option('sep', '\001').csv(edges_guarantee_csv)
    logging.info('Write edges guarantee csv finished!')

    spark.sql(
        '''
        select node_id `node_id:START_ID`, corp_id `corp_id:END_ID`, node_name src_name, corp_name dst_name
        from belong_rel
        ''').write.mode('overwrite').option('sep', '\001').csv(edges_belong_csv)
    logging.info('Write edges belong csv finished!')

    spark.sql(
        '''
        select src_corp_id `src_corp_id:START_ID`, dst_corp_id `dst_corp_id:END_ID`, src_corp_id src, dst_corp_id dst, 
            '' as circle_id, 4 as rel_type, amount, times, src_name, dst_name
        from corp_rel
        ''').write.mode('overwrite').option('sep', '\001').csv(edges_corp_csv)
    logging.info('Write edges corp csv finished!')

    spark.sql(
        '''
        select src `src:START_ID`, dst `dst:END_ID`, src, dst, concat_ws(";", first_degree_ids) `node_ids:string[]`,
            times, amount, 3 rel_type, src_name, dst_name
        from entity_to_corp_agg
        ''').write.mode('overwrite').option('sep', '\001').csv(edges_corp_agg_csv)
    logging.info('Write edges corp agg csv finished!')
Example #6
def connected_components(base_table_guarantee, base_table_custid_eid,
                         table_exchange_rate):
    grnt_latest_batch = str(
        spark.sql("select max(batch) batch from %s" %
                  base_table_guarantee).take(1)[0]['batch'])
    custid_latest_batch = str(
        spark.sql("select max(batch) batch from %s" %
                  base_table_custid_eid).take(1)[0]['batch'])
    exchange_max_batch = str(
        spark.sql("select cast(max(dw_dat_dt) as string) batch from %s" %
                  table_exchange_rate).take(1)[0]['batch'])

    if exchange_max_batch == yesterday_date:
        exchange_batch = yesterday_date
    else:
        exchange_batch = exchange_max_batch

    # filter out data without customer id and name
    spark.sql('''
        select * from {base_table_guarantee} where batch = '{batch}' and 
        !((grnt_custid = '' or grnt_custid is null) and (grnt_nm = '' or grnt_nm is null)) and 
        !((bor_custid = '' or bor_custid is null) and (bor_nm = '' or bor_nm is null))
        '''.format(base_table_guarantee=base_table_guarantee,
                   batch=grnt_latest_batch)).createOrReplaceTempView(
                       'base_guarantee_relation')

    spark.sql('''
        select t1.grnt_cod as node_id, t2.eid, 
            case when t1.grnt_custid is not null then substr(t1.grnt_custid, 0, 2) else 'UNKNOWN' end as node_type, 
            t1.grnt_nm as node_name
        from 
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join
        (select * from {t_custid_eid} where batch = '{custid_batch}') t2
        on t1.grnt_custid = t2.cust_id
        union
        select t3.bor_cod as node_id, t4.eid, 
            case when t3.bor_custid is not null then substr(t3.bor_custid, 0, 2) else 'UNKNOWN' end as node_type, 
            t3.bor_nm as node_name
        from
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t3
        left outer join
        (select * from {t_custid_eid} where batch = '{custid_batch}') t4
        on t3.bor_custid = t4.cust_id
        '''.format(grnt_batch=grnt_latest_batch,
                   t_custid_eid=base_table_custid_eid,
                   custid_batch=custid_latest_batch)).createOrReplaceTempView(
                       'guarantee_tag')

    spark.sql('''
        select
            a.node_id,
            case
                when a.eid is not null then a.eid
                when a.eid is null and a.node_type = 'CM' then b.eid
                when a.eid is null and a.node_type = 'UNKNOWN' then b.eid
            end as eid,
            a.node_type,
            a.node_name
        from guarantee_tag a
        left outer join
        (select eid, name from enterprises.hive_enterprises) b
        on a.node_name = b.name
        ''').distinct().createOrReplaceTempView("guarantee_tag_table")

    spark.sql('''
        select t1.*, case when t2.cnv_cny_exr is null then 1 else t2.cnv_cny_exr end as cnv_cny_exr
        from
        (select * from base_guarantee_relation where batch = '{grnt_batch}') t1
        left outer join
        (select currency, cnv_cny_exr from {t_ex_rate} where dw_dat_dt = '{exchange_batch}') t2
        on
            t1.currency = t2.currency
        '''.format(grnt_batch=grnt_latest_batch,
                   t_ex_rate=table_exchange_rate,
                   exchange_batch=exchange_batch)).createOrReplaceTempView(
                       'guarantee_rel_with_currency')

    spark.sql('''
        select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src,
            cast(cnv_cny_guaranteeamount as decimal(18,2)) as cnv_cny_guaranteeamount,
            cast(cnv_cny_exr as decimal(18, 2)) as cnv_cny_exr
        from
        (select dw_stat_dt, grnt_cod, grnt_custid, grnt_nm, bor_cod, bor_custid, bor_nm, grnt_ctr_id, guaranteetype,
            currency, guaranteeamount, sgn_dt, data_src, batch, cnv_cny_exr, 
            guaranteeamount * cnv_cny_exr as cnv_cny_guaranteeamount
        from guarantee_rel_with_currency)
        ''').createOrReplaceTempView('guarantee_rel_table')

    spark.sql('cache table guarantee_tag_table')
    spark.sql('cache table guarantee_rel_table')

    v = spark.sql('select node_id as id from guarantee_tag_table').distinct()

    v.persist(StorageLevel.MEMORY_AND_DISK_SER)

    e = spark.sql('''
        select grnt_cod as src , bor_cod as dst, 'guarantee' as relationship
        from guarantee_rel_table
        where grnt_cod != '' and bor_cod != '' and grnt_cod != bor_cod
        ''').distinct()

    e.persist(StorageLevel.MEMORY_AND_DISK_SER)

    graph = GraphFrame(v, e)
    graph.connectedComponents().createOrReplaceTempView('cc_df')
    cc_count = spark.table('cc_df').select('component').distinct().count()
    logging.info('Connected components finished! Count: %d' % cc_count)
    e.createOrReplaceTempView('e')
    spark.sql('''
        select t2.src, t2.dst, cast(t1.component as string)
        from
        cc_df t1
        left outer join
        e t2
        on t1.id = t2.src
        where t2.src is not null and t2.dst is not null
        ''').distinct().createOrReplaceTempView('v_1')

    spark.sql('''
        select
            t1.src,
            t1.dst,
            named_struct(
                "dw_stat_dt", cast(t2.dw_stat_dt as string),
                "grnt_cod", t2.grnt_cod,
                "grnt_custid", t2.grnt_custid,
                "grnt_nm", t2.grnt_nm,
                "bor_cod", t2.bor_cod,
                "bor_custid", t2.bor_custid,
                "bor_nm", t2.bor_nm,
                "grnt_ctr_id", t2.grnt_ctr_id,
                "guaranteetype", t2.guaranteetype,
                "currency", t2.currency,
                "guaranteeamount", cast(t2.guaranteeamount as string),
                "sgn_dt", cast(t2.sgn_dt as string),
                "data_src", t2.data_src,
                "cnv_cny_guaranteeamount", cast(t2.cnv_cny_guaranteeamount as string)
            ) as attr,
            t1.component
        from
            (select * from v_1) t1
            left outer join
            (select * from guarantee_rel_table) t2
            on t1.src = t2.grnt_cod and t1.dst = t2.bor_cod
            where t2.grnt_cod is not null and t2.bor_cod is not null
        ''').createOrReplaceTempView('v_component_with_attr')

    spark.sql('''
        select named_struct(
        "link", array(src, dst),
        "attrs", attrs) as link,
        component
        from
        (select src, dst, collect_list(attr) as attrs, component
        from v_component_with_attr
        group by src, dst, component)
        ''').createOrReplaceTempView('v_component')

    spark.sql('''
        select component, collect_list(link) as links
        from v_component
        group by component
        ''').createOrReplaceTempView('v_links')

    spark.sql('''select component, links from v_links'''
              ).createOrReplaceTempView('grnt_graph')
    spark.table('grnt_graph').write.mode('overwrite').json(
        BASE_SPARK.get_hdfs_graph_file_path())
    logging.info('Write graph file finished')

    spark.sql('''
        select explode(t2.link) as node_id
        from
        grnt_graph t1
        left outer join
        (select link.link as link, component from v_component) t2
        on t1.component = t2.component
        where t2.component is not null
        ''').distinct().createOrReplaceTempView('graph_nodes')

    spark.sql('''
        select t2.node_id, named_struct("node_id", t2.node_id, "eid", t2.eid, "node_type", t2.node_type,
            "node_name", t2.node_name) as node
        from
        graph_nodes t1
        left outer join
        guarantee_tag_table t2
        on t1.node_id = t2.node_id
        where t2.node_id is not null and t2.node_id != ''
        ''').createOrReplaceTempView('graph_nodes_with_attr')
    spark.table('graph_nodes_with_attr').write.mode('overwrite').json(
        BASE_SPARK.get_hdfs_nodes_file_path())
    logging.info('Write nodes file finished')
Example #7
 def onView(self,event):
     """Display a graph of the spectrum."""
     x,y,e = self.calcSpec()
     graph = GraphFrame(self,"Spectrum")
     graph.plot(x,y,range=(self.vmin,self.vmax),yerr=e)
     self.Show(False)