# -*- coding: utf-8 -*-
import pandas as pd

from utils import Dataset

# Build statistical "combination" features: pairwise differences between
# selected numeric columns (cont<a> - cont<b>).
PAIRS = [
    (1, 6), (1, 9), (1, 10),
    (6, 9), (6, 10), (6, 11), (6, 12), (6, 13),
    (7, 11), (7, 12),
    (11, 12),
]

for name in ['train', 'test']:
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))

    df = pd.DataFrame(index=num.index)
    for a, b in PAIRS:
        df['diff_%d_%d' % (a, b)] = num['cont%d' % a] - num['cont%d' % b]

    # Column names are recorded once, from the train split.
    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

from utils import Dataset

print("Loading data...")

# A lot of exploratory analysis went into choosing these per-column
# transforms so the resulting distributions look roughly normal.
train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print("Scaling...")

# Train and test are stacked so each column is scaled on the combined range.
numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)

# Square-root transform after min-max scaling to [0, 1].
for col in ['cont1', 'cont4', 'cont5', 'cont8',
            'cont10', 'cont11', 'cont12']:
    df[col] = np.sqrt(minmax_scale(numeric[col]))

# Log transform; the 0.1 offset keeps log() finite at zero.
for col in ['cont6', 'cont7', 'cont9', 'cont13']:
    df[col] = np.log(minmax_scale(numeric[col]) + 0.1)

# cont14: shift, clip at zero, rescale, then a fourth-root transform.
# NOTE(review): the constants 0.179722 / 0.665122 look empirically fitted —
# provenance not visible here.
df['cont14'] = (np.maximum(numeric['cont14'] - 0.179722, 0) / 0.665122) ** 0.25
# NOTE(review): this fragment starts mid-way through gen_profile_feas() — its
# `def` line (and the construction of svd_feas / profile_data) is outside this
# view, so the code is left untouched.  The line was collapsed by a paste that
# stripped newlines: each inline '#' comment now swallows the code after it,
# so this must be re-split against the original file before any edit.
svd_feas['pid'] = profile_data['pid'].values data['pid'] = data['pid'].fillna(-1) # nan的pid 搞成了-1 data = data.merge(svd_feas, on='pid', how='left') limit_profile_data = profile_data[[ 'pid', 'cat_p13', 'cat_p29', 'cat_p33', 'cat_p9', 'cat_p6', 'cat_p5', 'cat_p0' ]] # 这些feature对0类别应该会有好的效果 data = data.merge(limit_profile_data, on='pid', how='left') # ---> adding origin pid features return data print("Loading data...") data = Dataset.load_part('data', 'manual') feature = Dataset.get_part_features('manual_data') data_df = pd.DataFrame(data, columns=feature) result = gen_profile_feas(data_df) result.rename(columns={'pid': 'cat_pid'}, inplace=True) cat_columns = [c for c in result.columns if c.startswith('cat')] svd_columns = [c for c in result.columns if c.startswith('svd')] print('cat_columns', cat_columns) print('svd_columns', svd_columns) Dataset.save_part_features('categorical_profile', cat_columns) Dataset.save_part_features('svd_profile', svd_columns) Dataset(categorical=result[cat_columns].values).save('profile')
# NOTE(review): truncated fragment — it ends at the header of the inner
# `for val in value_counts:` loop, so the rare-value marking logic is cut off
# and the code cannot be safely reconstructed from this view.  Left untouched;
# re-split the stripped newlines against the original file before editing.
import scipy.sparse as sp from tqdm import tqdm from utils import Dataset print("Loading data...") min_freq = 10 train_cat = Dataset.load_part('train', 'categorical') test_cat = Dataset.load_part('test', 'categorical') train_cat_enc = [] test_cat_enc = [] cats = Dataset.get_part_features('categorical') features = [] # 将每一个category 特征搞成dummy输出,用一个函数就行了 , 太稀疏了就压缩了 with tqdm(total=len(cats), desc=' Encoding', unit='cols') as pbar: for col, cat in enumerate(cats): value_counts = dict( list(zip(*np.unique(train_cat[:, col], return_counts=True)))) print(value_counts) train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8) test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8) for val in value_counts:
# NOTE(review): truncated fragment — the line ends mid-expression
# (`...astype(np.uint8).reshape(`), so the transform loop is incomplete and
# cannot be safely reconstructed from this view.  Left untouched; re-split
# the stripped newlines against the original file before editing.
# -*- coding: utf-8 -*- import numpy as np import scipy.sparse as sp from tqdm import tqdm from utils import Dataset print("Loading data...") num_features = Dataset.get_part_features('numeric') train_num = Dataset.load_part('train', 'numeric') test_num = Dataset.load_part('test', 'numeric') train_n = train_num.shape[0] features = [] train_res = [] test_res = [] # 把0,1,特殊标记出来而已 with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar: for col, col_name in enumerate(num_features): values = np.hstack((train_num[:, col], test_num[:, col])) if (values == 0.0).sum() > 20: features.append(col_name + '_zero') train_res.append( (values[:train_n] == 0.0).astype(np.uint8).reshape(
train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

# Ordinal-encode every categorical column (2-D arrays: rows x columns).
# BUG FIX: the encoding loop below had been commented out, so the script was
# saving all-zero matrices as the 'categorical_encoded' part.
train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8)
test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8)

for col in range(train_cat.shape[1]):
    # Codes are assigned over train+test so both splits share one mapping.
    values = np.unique(np.hstack((train_cat[:, col], test_cat[:, col])))

    # Order by (length, lexicographic) so 'A' < 'B' < ... < 'Z' < 'AA' < 'AB'.
    values = sorted(values, key=lambda x: (len(x), x))

    # NOTE(review): uint8 silently wraps past 255 distinct values per column —
    # confirm no column exceeds that.
    encoding = {val: code for code, val in enumerate(values)}

    train_cat_enc[:, col] = [encoding[v] for v in train_cat[:, col]]
    test_cat_enc[:, col] = [encoding[v] for v in test_cat[:, col]]

print("Saving...", Dataset.get_part_features('categorical'))

# The encoded part reuses the original categorical feature names.
Dataset.save_part_features('categorical_encoded',
                           Dataset.get_part_features('categorical'))

Dataset(categorical_encoded=train_cat_enc).save('train')
Dataset(categorical_encoded=test_cat_enc).save('test')

print("Done.")
import pandas as pd

from utils import Dataset

# Build hand-crafted ("manual") features for each split.
for name in ['train', 'test']:
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts.
    # BUG FIX: the raw 'numeric' part was being labelled with the
    # 'numeric_lin' feature names (copy-paste error); it must use the
    # 'numeric' feature names.
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names (train split defines them once).
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")
# NOTE(review): truncated fragment — split_train_val() is cut off mid-body, so
# the code is left untouched.  Two issues to fix once the full file is in
# view: (1) the whole loading section is dead code under `if False:`;
# (2) `print feature_names` / `print data_df.head()` / `print X.shape` are
# Python-2 print statements, inconsistent with the print() calls used in the
# sibling scripts.  The line was collapsed by a newline-stripping paste and
# must be re-split before editing.
if False: print("Loading data...") split_list=[('data','manual'),('profile','categorical'),('profile','svd'), ('time','categorical'),('time','numeric'),('od','categorical'), ('od','numeric'),('plan','categorical'),('plan','numeric'), ('plan','svd')] feature_parts = [Dataset.load_part(ds, part) for ds,part in split_list] feature_names = [part+'_'+ds for ds,part in split_list] column_names=[] for name in feature_names: column_names += Dataset.get_part_features(name) print feature_names data_df = pd.DataFrame(hstack(feature_parts),columns=column_names) print data_df.head() def split_train_val(data): modified_array = np.delete(data.columns.values, np.where(data.columns.values == 'click_mode')) X = data[list(modified_array)].values y = data[['click_mode']].values from sklearn.model_selection import train_test_split print X.shape print y.shape