Esempio n. 1
0
def _F_Clsuter_Geo():
    """Build per-user counts of geo positions per KMeans cluster.

    If the cached feather file already exists it is loaded and returned
    unchanged. Otherwise the ``geo_code`` values of the operation and
    transaction tables are decoded with ``_F.Decode``, the decoded positions
    are clustered into 20 groups, and the number of positions each user has
    in each cluster is pivoted into one column per cluster. The result is
    written to the cache file before it is returned.

    Returns:
        A DataFrame with one row per ``pri_id`` value and one count column
        per cluster id (0 where a user has no position in that cluster).
        The cluster columns are renamed by ``RenameColumns`` with the tag
        'cluster'; the exact naming scheme is defined by that helper.
    """
    cache_file = data_path + "data/_F_geo.feather"
    if os.path.exists(cache_file):
        return feather.read_dataframe(cache_file)

    # Merge the user id and geo_code columns of operation and transaction.
    geo_info = pd.concat(
        (op_info[[pri_id, 'geo_code']], trans_info[[pri_id, 'geo_code']]))
    geo_info['pos'] = geo_info['geo_code'].apply(_F.Decode)

    # Rows whose decoded position equals -1 are dropped (presumably the
    # marker for an undecodable geo_code -- confirm against _F.Decode).
    # .copy() gives an independent frame, so adding 'cluster_id' below is
    # not a chained assignment on a slice of geo_info.
    temp = geo_info[geo_info['pos'] != -1].copy()

    X = np.asarray(list(temp['pos'].values))
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=20, random_state=2018).fit(X)
    temp['cluster_id'] = kmeans.labels_

    # Group on pri_id (the only id column selected above) instead of a
    # hard-coded 'UID', so the block stays consistent with itself.
    t = temp.groupby([
        pri_id, 'cluster_id'
    ])['pos'].count().reset_index().rename(columns={'pos': 'cluster_count'})
    # Each (user, cluster) pair occurs once in t, so the default aggregation
    # of pivot_table leaves the counts unchanged.
    c = pd.pivot_table(t,
                       index=pri_id,
                       columns='cluster_id',
                       values='cluster_count').fillna(0).reset_index()
    # Rename the columns.
    _Prep = _P.Process()
    c = _Prep.RenameColumns(c, [pri_id], 'cluster')
    # Persist the result.
    feather.write_dataframe(c, cache_file)
    return c
Esempio n. 2
0
def _F_GeoCode(encode_type="LabelEncode", n=3):
    """Return the encoded top-N geo_code table, using a feather cache.

    When the cache file exists it is read and returned directly. The cache
    path does not depend on ``encode_type`` or ``n``, so an existing file is
    returned whatever arguments are passed.

    Args:
        encode_type: Forwarded to ``CatColConvert`` as the encoding mode.
        n: Forwarded to ``_F.TopNGeo_code`` as the number of geo codes to
            keep.

    Returns:
        The DataFrame produced by ``CatColConvert`` (or the cached copy).
    """
    cache_file = data_path + "data/_F_geo_code.feather"
    if os.path.exists(cache_file):
        return feather.read_dataframe(cache_file)

    # Take each user's top-N most frequently active geo_code values from
    # the combined operation and transaction records.
    wanted = [pri_id, 'geo_code', 'day']
    geo_info = pd.concat((op_info[wanted], trans_info[wanted]))
    top_geo = _F.TopNGeo_code(geo_info, pri_id, 'day', n)

    # Encode.
    encoded = _P.Process().CatColConvert(top_geo, pri_id, encode_type)

    # Persist the result.
    feather.write_dataframe(encoded, cache_file)
    return encoded
Esempio n. 3
0
def CatRowsToCols(df,pri_id,col,other_col):
    """Pivot the values of ``col`` into columns of per-id counts.

    For every (``pri_id``, ``col``) pair the non-null entries of
    ``other_col`` are counted; the counts are then spread into one column
    per distinct ``col`` value. Combinations that never occur stay NaN.

    Args:
        df: Input DataFrame containing ``pri_id``, ``col`` and ``other_col``.
        pri_id: Name of the id column that becomes the row key.
        col: Name of the categorical column whose values become columns.
        other_col: Name of the column whose non-null entries are counted.

    Returns:
        The pivoted DataFrame after ``RenameColumns`` has been applied with
        ``[pri_id]`` and ``col``; the naming scheme is defined by that helper.
    """
    counts = df.groupby([pri_id, col])[other_col].count()
    long_form = counts.reset_index().rename(columns={other_col: "count"})
    wide = long_form.pivot_table(index=pri_id, columns=col, values="count")
    wide = pd.DataFrame(wide.reset_index())
    renamer = _Prep.Process()
    return renamer.RenameColumns(wide, [pri_id], col)
Esempio n. 4
0
def Base_Process(encode_type='LabelEncode'):
    """Assemble the base feature table for the train and test ids.

    When the cache file exists it is read and returned directly; the cache
    path does not depend on ``encode_type``. Otherwise the processed
    operation and transaction tables are left-merged onto the ids from
    ``_train`` and ``_test``, the summed "_count" and "_d_label" columns and
    the label are attached, categorical columns are converted, missing
    values are filled with 0, and the table is written to the cache.

    Args:
        encode_type: Forwarded to ``CatColConvert`` as the encoding mode.

    Returns:
        The assembled DataFrame; the label column 'Tag' is renamed to 'y'.
    """
    cache_file = data_path + "data/data.feather"
    if os.path.exists(cache_file):
        return feather.read_dataframe(cache_file)

    data = pd.DataFrame()
    data[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))

    # The values of these columns are added together for op and trans.
    count_cols = [
        name + "_count"
        for name in ('device2', 'ip1', 'mac1', 'device_code1',
                     'device_code2', 'device_code3')
    ]
    op = ProcessOperation(op_info)
    trans = ProcessTrans(trans_info)

    # NOTE(review): DataFrame addition aligns on the row index, not on
    # pri_id; this presumably relies on ProcessOperation and ProcessTrans
    # returning frames with matching row order -- confirm.
    count_sum = pd.DataFrame()
    count_sum[count_cols] = op[count_cols] + trans[count_cols]

    label_cols = [
        name + "_d_label"
        for name in ('device_code1', 'device_code2', 'device_code3',
                     'geo_code', 'mac1', 'ip1', 'ip1_sub')
    ]
    label_sum = pd.DataFrame()
    label_sum[label_cols] = op[label_cols] + trans[label_cols]

    # Re-tag the column names of op and trans to avoid duplicate names.
    # The list passed to RenameColumns is the count columns plus pri_id
    # (presumably the columns to leave unrenamed -- confirm).
    renamer = _P.Process()
    passed_cols = count_cols + [pri_id]
    op = renamer.RenameColumns(op, passed_cols, 'op')
    trans = renamer.RenameColumns(trans, passed_cols, 'trans')

    # Columns that are attached in summed form are excluded from the merges.
    summed = set(count_cols) | set(label_cols)
    op_cols = [name for name in op.columns if name not in summed]
    tran_cols = [name for name in trans.columns if name not in summed]

    data = data.merge(op[op_cols], on=pri_id, how='left')
    data = data.merge(trans[tran_cols], on=pri_id, how='left')

    # Attach the summed frames column-wise (alignment is by row index).
    for extra in (count_sum, label_sum):
        data = pd.concat((data, extra), axis=1)

    # Add the label.
    data = pd.merge(data, label, on=pri_id, how='left')
    data = data.rename(columns={'Tag': 'y'})

    # Encode.
    data = _P.Process().CatColConvert(data, pri_id, encode_type)

    data = data.fillna(0)

    # Persist the result.
    feather.write_dataframe(data, cache_file)
    return data