Ejemplo n.º 1
0
def Process1(df):
    """Assemble a per-enterprise feature table from the raw records in ``df``.

    The result starts with one row per distinct value of the "企业名称"
    column and is widened by merging in the frames returned by the ``fea``
    helpers. Every merge is keyed explicitly on the enterprise name and uses
    pandas' default inner join, so an enterprise that is absent from any
    helper's output is dropped from the result.

    Args:
        df: Raw records. Must contain "企业名称" and the columns handed to
            the helpers below ("注册资金(元)", "出资比例", "法定代表人标志",
            "首席代表标志", "职务", "姓名").

    Returns:
        The merged feature DataFrame.
    """
    pri_id = "企业名称"
    capital_col = '注册资金(元)'
    ratio_col = "出资比例"

    res = pd.DataFrame()
    res[pri_id] = df[pri_id].unique()

    # Currency conversion (per the original note); the converted frame is the
    # one every helper below works on.
    df = prep.Convert_money(df)

    # Helpers applied to both numeric columns; per the original notes they
    # yield mean, max/min and variance.
    numeric_stats = (fea.GetValAvg, fea.GetValMaxMin, fea.GetValVar)

    # Registered-capital statistics.
    for stat in numeric_stats:
        res = pd.merge(res, stat(df, pri_id, capital_col), on=pri_id)

    # Categorical features: every column that is neither the key nor numeric.
    num_fea = [capital_col, ratio_col]
    cat_fea = [col for col in df.columns if col != pri_id and col not in num_fea]
    for col in cat_fea:
        res = pd.merge(res, fea.GetCategroicalCount(df, pri_id, col), on=pri_id)

    # Missing-value counts for the legal-representative and
    # chief-representative flags (per the original note).
    for flag_col in ("法定代表人标志", "首席代表标志"):
        res = pd.merge(res, fea.GetValNaCount(df, pri_id, flag_col, "姓名"), on=pri_id)

    # Per-position counts (per the original note). The key is given
    # explicitly: without ``on`` pandas joins on every column name the two
    # frames share, so an accidental name collision would alter the join.
    res = pd.merge(res, fea.CatRowsToCols(df, pri_id, "职务", "姓名"), on=pri_id)

    # Contribution-ratio statistics.
    for stat in numeric_stats:
        res = pd.merge(res, stat(df, pri_id, ratio_col), on=pri_id)

    return res
Ejemplo n.º 2
0
def ProcessOperation(df):
    """Build the per-user feature table for the operation log ``df``.

    The table starts from the ``pri_id`` values of ``_train`` followed by
    those of ``_test`` and is widened by left-merging each ``_F`` helper
    result on ``pri_id``. Missing values in the final table are replaced
    with 0.

    Side effect: a ``day_period`` column is added to ``df`` in place.

    Disabled experiments (kept here for reference, not executed):
      * ``_F.CountDangerous(df, label, pri_id, 'Tag', <col>, 'time', <n>)``
        with (col, n) = (device_code1, 500), (device_code2, 500),
        (device_code3, 200), (geo_code, 500), (mode, 10000), (mac1, 600),
        (mac2, 1000), (ip1, 1000), (ip1_sub, 1000), (ip2_sub, 50).
      * ``_F.PositionWS(df)`` - most frequent longitude/latitude per user
        (per the original note).
      * Geographic-position clustering (never implemented).
    """

    def attach(table, extra):
        # Left join keeps every user even when a helper has no row for them.
        return pd.merge(table, extra, on=pri_id, how='left')

    features = pd.DataFrame()
    features[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))

    # Day-of-month counts (per the original note: early / mid / late month).
    features = attach(features, _F.MonthCount(df, pri_id))

    # One CatRowsToCols feature set per categorical column.
    for cat_col in ('mode', 'success', 'version'):
        features = attach(features, _F.CatRowsToCols(df, pri_id, cat_col, 'os'))

    # Map each time value through _F.TimeInterval, then expand the result the
    # same way as the categorical columns above.
    df['day_period'] = df['time'].apply(_F.TimeInterval)
    features = attach(features, _F.CatRowsToCols(df, pri_id, 'day_period', 'os'))

    # GetCount per column (per the original note: how many different values
    # a user has, e.g. device models).
    count_cols = (
        'device2', 'ip1', 'ip2', 'mac1', 'mac2', 'device_code1',
        'device_code2', 'device_code3',
    )
    for count_col in count_cols:
        features = attach(features, _F.GetCount(df, pri_id, count_col, 'os'))

    # Geographic-location occurrence counts (per the original note).
    features = attach(features, _F.CountWS(df))

    return features.fillna(0)
Ejemplo n.º 3
0
def ProcessTrans(df):
    """Build the per-user feature table for the transaction log ``df``.

    The table starts from the ``pri_id`` values of ``_train`` followed by
    those of ``_test`` and is widened by left-merging each ``_F`` helper
    result on ``pri_id``. Missing values in the final table are replaced
    with 0.

    Side effect: a ``day_period`` column is added to ``df`` in place.

    Disabled experiments (kept here for reference, not executed):
      * ``_F.GetValMaxMin`` / ``_F.GetValAvg`` / ``_F.GetValSum`` on
        'trans_amt' (transaction amount) and on 'bal' (desensitised
        balance).
      * ``_F.CountDangerous(df, label, pri_id, 'Tag', <col>, 'time', <n>)``
        with (col, n) = (device_code1, 100), (device_code2, 100),
        (device_code3, 80), (geo_code, 200), (merchant, 1000), (mac1, 600),
        (ip1, 1000), (ip1_sub, 1000), (acc_id1, 200), (acc_id2, 100),
        (acc_id3, 100).
      * ``_F.PositionWS(df)`` - most frequent longitude/latitude per user
        (per the original note).
    """

    def attach(table, extra):
        # Left join keeps every user even when a helper has no row for them.
        return pd.merge(table, extra, on=pri_id, how='left')

    features = pd.DataFrame()
    features[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))

    # Channel occurrence counts (per the original note).
    features = attach(features, _F.CatRowsToCols(df, pri_id, 'channel', 'day'))

    # Day-of-month counts.
    features = attach(features, _F.MonthCount(df, pri_id))

    # Map each time value through _F.TimeInterval, then expand the result
    # with CatRowsToCols.
    df['day_period'] = df['time'].apply(_F.TimeInterval)
    features = attach(features, _F.CatRowsToCols(df, pri_id, 'day_period', 'day'))

    # GetCount per column (per the original note: number of different values).
    count_cols = (
        'device2', 'ip1', 'mac1', 'device_code1', 'device_code2',
        'device_code3', 'amt_src1', 'amt_src2', 'merchant', 'trans_type1',
        'trans_type2', 'acc_id1', 'market_type', 'market_code',
    )
    for count_col in count_cols:
        features = attach(features, _F.GetCount(df, pri_id, count_col, 'day'))

    # Geographic-location occurrence counts (per the original note).
    features = attach(features, _F.CountWS(df))

    return features.fillna(0)