def _F_Clsuter_Geo():
    """Build per-user counts of geo positions per KMeans cluster.

    If ``data_path + "data/_F_geo.feather"`` already exists it is loaded and
    returned as-is. Otherwise the ``geo_code`` values of ``op_info`` and
    ``trans_info`` are decoded with ``_F.Decode``, rows whose decoded value
    equals ``-1`` are dropped, the remaining positions are clustered into 20
    KMeans clusters (``random_state=2018``), and the number of positions per
    (``pri_id``, cluster) pair is pivoted into one row per ``pri_id``.
    Missing (id, cluster) combinations are filled with 0.

    Returns:
        The pivoted frame after ``RenameColumns(c, [pri_id], 'cluster')``;
        it is also written to the feather cache file.
    """
    cache_path = data_path + "data/_F_geo.feather"
    if os.path.exists(cache_path):
        return feather.read_dataframe(cache_path)

    # Merge the id and geo_code columns of the operation and transaction data.
    geo_info = pd.concat(
        (op_info[[pri_id, 'geo_code']], trans_info[[pri_id, 'geo_code']]))
    geo_info['pos'] = geo_info['geo_code'].apply(_F.Decode)

    # Drop rows whose decoded value is -1 (presumably Decode's failure
    # marker -- confirm against _F.Decode). Take an explicit copy so that
    # adding 'cluster_id' below does not assign into a slice of geo_info.
    temp = geo_info[geo_info['pos'] != -1].copy()
    X = np.asarray(list(temp['pos'].values))

    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=20, random_state=2018).fit(X)
    temp['cluster_id'] = kmeans.labels_

    # Group on pri_id (rather than a hard-coded column name) so the key
    # matches the column selected above and the one passed to RenameColumns.
    t = temp.groupby([
        pri_id, 'cluster_id'
    ])['pos'].count().reset_index().rename(columns={'pos': 'cluster_count'})
    c = pd.pivot_table(t,
                       index=pri_id,
                       columns='cluster_id',
                       values='cluster_count').fillna(0).reset_index()

    # Rename the columns (pri_id is passed as the column list argument).
    _Prep = _P.Process()
    c = _Prep.RenameColumns(c, [pri_id], 'cluster')

    # Persist the result so later calls can load it from disk.
    feather.write_dataframe(c, cache_path)
    return c
def _F_GeoCode(encode_type="LabelEncode", n=3):
    """Return the encoded top-``n`` geo_code features, cached on disk.

    When ``data_path + "data/_F_geo_code.feather"`` exists, its contents are
    returned directly. Note that the cache file name does not depend on
    ``encode_type`` or ``n``, so both arguments are ignored on a cache hit.

    Otherwise the ``pri_id``, ``geo_code`` and ``day`` columns of ``op_info``
    and ``trans_info`` are stacked, passed to ``_F.TopNGeo_code`` together
    with ``n``, encoded with ``CatColConvert`` using ``encode_type``, written
    to the cache file and returned.
    """
    cache_file = data_path + "data/_F_geo_code.feather"
    if os.path.exists(cache_file):
        return feather.read_dataframe(cache_file)

    # Select the top-N geo_code values per user from both data sources.
    wanted = [pri_id, 'geo_code', 'day']
    combined = pd.concat((op_info[wanted], trans_info[wanted]))
    top_geo = _F.TopNGeo_code(combined, pri_id, 'day', n)

    # Encode the categorical columns.
    encoder = _P.Process()
    encoded = encoder.CatColConvert(top_geo, pri_id, encode_type)

    # Persist for subsequent calls.
    feather.write_dataframe(encoded, cache_file)
    return encoded
def CatRowsToCols(df, pri_id, col, other_col):
    """Pivot the values of ``col`` into columns of per-``pri_id`` counts.

    Args:
        df: Input frame containing ``pri_id``, ``col`` and ``other_col``.
        pri_id: Name of the key column; one output row per distinct value.
        col: Column whose distinct values become the output columns.
        other_col: Column whose non-null entries are counted per
            (``pri_id``, ``col``) group.

    Returns:
        The pivoted frame after ``RenameColumns(frame, [pri_id], col)``.
    """
    # Count other_col per (pri_id, col) pair.
    counts = df.groupby([pri_id, col])[other_col].count()
    counts = counts.reset_index().rename(columns={other_col: "count"})

    # Spread the col values out into columns, keeping pri_id as a column.
    wide = counts.pivot_table(index=pri_id, columns=col, values="count")
    wide = pd.DataFrame(wide.reset_index())

    renamer = _Prep.Process()
    return renamer.RenameColumns(wide, [pri_id], col)
def Base_Process(encode_type='LabelEncode'):
    """Assemble the base feature table from operation and transaction data.

    If ``data_path + "data/data.feather"`` exists it is loaded and returned
    without recomputation (``encode_type`` is then unused). Otherwise the
    table is built from ``_train``, ``_test``, ``op_info``, ``trans_info``
    and ``label``, written to that feather file, and returned.

    Args:
        encode_type: Passed through to ``CatColConvert``.

    Returns:
        The feature DataFrame, with the ``'Tag'`` column renamed to ``'y'``
        and remaining NaN values replaced by 0.
    """
    # Reuse the cached result when it is already on disk.
    if os.path.exists(data_path + "data/data.feather"):
        df = feather.read_dataframe(data_path + "data/data.feather")
        return df
    # Start from the ids of the train and test sets stacked together.
    data = pd.DataFrame()
    data[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))
    # The values of these columns are to be summed per pri_id.
    cols = [
        'device2', 'ip1', 'mac1', 'device_code1', 'device_code2',
        'device_code3'
    ]
    cols = [col + "_count" for col in cols]
    op = ProcessOperation(op_info)
    trans = ProcessTrans(trans_info)
    # NOTE(review): `+` aligns the two frames on their row index, not on the
    # pri_id column -- confirm op and trans are indexed consistently.
    temp = pd.DataFrame()
    temp[cols] = op[cols] + trans[cols]
    # Same element-wise sum for the "_d_label" columns.
    c_cols = [
        'device_code1', 'device_code2', 'device_code3', 'geo_code', 'mac1',
        'ip1', 'ip1_sub'
    ]
    c_cols = [col + "_d_label" for col in c_cols]
    temp_c = pd.DataFrame()
    temp_c[c_cols] = op[c_cols] + trans[c_cols]
    # Rename the op and trans columns to prevent duplicate names.
    # pri_id is added to the list only for the two RenameColumns calls and
    # removed again afterwards (presumably the list names columns to leave
    # untouched -- confirm against RenameColumns).
    _Prep = _P.Process()
    cols.append(pri_id)
    op = _Prep.RenameColumns(op, cols, 'op')
    trans = _Prep.RenameColumns(trans, cols, 'trans')
    cols.remove(pri_id)
    # Keep only columns that are not among the summed columns above; those
    # are attached separately through temp / temp_c.
    op_cols = [
        col for col in op.columns if col not in cols and col not in c_cols
    ]
    tran_cols = [
        col for col in trans.columns
        if col not in cols and col not in c_cols
    ]
    # Earlier variant that excluded only `cols`:
    # op_cols = [col for col in op.columns if col not in cols]
    # tran_cols = [col for col in trans.columns if col not in cols]
    data = pd.merge(data, op[op_cols], on=pri_id, how='left')
    data = pd.merge(data, trans[tran_cols], on=pri_id, how='left')
    # Attach the summed columns.
    # NOTE(review): axis=1 concat aligns on row index, not on pri_id --
    # confirm temp / temp_c rows line up with the merged `data` rows.
    data = pd.concat((data, temp), axis=1)
    data = pd.concat((data, temp_c), axis=1)
    # Add the label; ids without a match in `label` get NaN (left join).
    data = pd.merge(data, label, on=pri_id, how='left')
    data.rename(columns={'Tag': 'y'}, inplace=True)
    # Encode.
    _Prep = _P.Process()
    data = _Prep.CatColConvert(data, pri_id, encode_type)
    data = data.fillna(0)
    # Persist the result for later calls.
    feather.write_dataframe(data, data_path + "data/data.feather")
    return data