svd_feas.columns = ['svd_profile_fea_{}'.format(i) for i in range(10)] svd_feas['pid'] = profile_data['pid'].values data['pid'] = data['pid'].fillna(-1) # nan的pid 搞成了-1 data = data.merge(svd_feas, on='pid', how='left') limit_profile_data = profile_data[[ 'pid', 'cat_p13', 'cat_p29', 'cat_p33', 'cat_p9', 'cat_p6', 'cat_p5', 'cat_p0' ]] # 这些feature对0类别应该会有好的效果 data = data.merge(limit_profile_data, on='pid', how='left') # ---> adding origin pid features return data print("Loading data...") data = Dataset.load_part('data', 'manual') feature = Dataset.get_part_features('manual_data') data_df = pd.DataFrame(data, columns=feature) result = gen_profile_feas(data_df) result.rename(columns={'pid': 'cat_pid'}, inplace=True) cat_columns = [c for c in result.columns if c.startswith('cat')] svd_columns = [c for c in result.columns if c.startswith('svd')] print('cat_columns', cat_columns) print('svd_columns', svd_columns) Dataset.save_part_features('categorical_profile', cat_columns) Dataset.save_part_features('svd_profile', svd_columns)
# -*- coding: utf-8 -*-
"""Build pairwise-difference combination features from the numeric part.

For each (a, b) pair below, emits a feature ``diff_a_b = cont<a> - cont<b>``
and saves the result as the 'numeric_combinations' part for train and test.
"""
import pandas as pd

from utils import Dataset

# Column-number pairs whose differences are exported as features.
# Order matters: it fixes the output column order saved below.
DIFF_PAIRS = [
    (1, 6), (1, 9), (1, 10),
    (6, 9), (6, 10), (6, 11), (6, 12), (6, 13),
    (7, 11), (7, 12),
    (11, 12),
]

# Compute the statistical combination features (translated comment).
for name in ['train', 'test']:
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))

    df = pd.DataFrame(index=num.index)
    for a, b in DIFF_PAIRS:
        df['diff_%d_%d' % (a, b)] = num['cont%d' % a] - num['cont%d' % b]

    # Feature names are identical for both splits; persist them once.
    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)
# -*- coding: utf-8 -*- import numpy as np from utils import Dataset, vstack, hstack from sklearn.preprocessing import scale from sklearn.cluster import MiniBatchKMeans np.random.seed(1234) gamma = 1.0 print("Loading data...") train_num = Dataset.load_part('train', 'numeric') train_cat = Dataset.load_part('train', 'categorical_dummy') test_num = Dataset.load_part('test', 'numeric') test_cat = Dataset.load_part('test', 'categorical_dummy') print("Combining data...") #vstack 按行拼接 #hstack 按列拼接 #拼接之后kmeans聚类 all_data = hstack((scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32), vstack((train_cat, test_cat)))) for n_clusters in [25, 50, 75, 100, 200]: part_name = 'cluster_rbf_%d' % n_clusters print("Finding %d clusters..." % n_clusters)
# -*- coding: utf-8 -*-
"""Rescale numeric features toward more symmetric distributions.

Skewed columns are compressed with sqrt or log after min-max scaling;
cont14 gets a bespoke shift/scale plus a fourth root.
NOTE(review): this chunk ends before any save step — the remainder of the
script is beyond this view.
"""
import numpy as np
import pandas as pd

from utils import Dataset

from sklearn.preprocessing import minmax_scale

print("Loading data...")

# These transforms were presumably chosen from prior data analysis
# (translated comment).
train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print("Scaling...")

# Train and test are stacked so minmax_scale uses one consistent range
# for both splits.
numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)

# Mildly skewed columns: compress with sqrt after scaling to [0, 1].
for col in ["cont1", "cont4", "cont5", "cont8", "cont10", "cont11", "cont12"]:
    df[col] = np.sqrt(minmax_scale(numeric[col]))

# Heavier-tailed columns: log transform; +0.1 keeps log() finite at 0.
for col in ["cont6", "cont7", "cont9", "cont13"]:
    df[col] = np.log(minmax_scale(numeric[col]) + 0.1)

# cont14: shift/scale by empirically derived constants, clamp at 0, then
# take the fourth root.
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122) ** 0.25
# -*- coding: utf-8 -*- import numpy as np import scipy.sparse as sp from tqdm import tqdm from utils import Dataset print("Loading data...") min_freq = 10 train_cat = Dataset.load_part('train', 'categorical') test_cat = Dataset.load_part('test', 'categorical') train_cat_enc = [] test_cat_enc = [] cats = Dataset.get_part_features('categorical') features = [] # 将每一个category 特征搞成dummy输出,用一个函数就行了 , 太稀疏了就压缩了 with tqdm(total=len(cats), desc=' Encoding', unit='cols') as pbar: for col, cat in enumerate(cats): value_counts = dict( list(zip(*np.unique(train_cat[:, col], return_counts=True)))) print(value_counts) train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8) test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)
import pandas as pd

from utils import Dataset

# Build hand-crafted ("manual") features from the linearized numeric part
# and save them for the train and test splits.
for name in ['train', 'test']:
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load the linearized numeric features, indexed by row id.
    # (The original also loaded the raw 'numeric' part here, but labeled it
    # with the 'numeric_lin' feature names and never used it — that dead,
    # mislabeled load has been removed.)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names once; they are identical for both splits.
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")
import pandas as pd
import numpy as np
import sys

from sklearn.metrics import mean_absolute_error
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# modern code imports KFold from sklearn.model_selection (different API).
from sklearn.cross_validation import KFold
from statsmodels.regression.quantile_regression import QuantReg

from utils import Dataset

# Per-fold median (q=0.5) quantile regression of the true loss on a saved
# model's train-set predictions, presumably to calibrate those predictions.
# NOTE(review): this chunk ends inside the fold loop — the evaluation that
# fills orig_maes / corr_maes lies beyond this view.

pred_name = sys.argv[1]  # name of the prediction set under preds/

n_folds = 8

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(
        KFold(len(train_y), n_folds, shuffle=True, random_state=2016)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]

    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    # Median regression of true loss on predicted loss (no intercept term
    # is added here, so the fit is through the origin).
    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)
import pandas as pd
import numpy as np

# Show full frames when printed during debugging.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

from utils import Dataset, hstack

# Assemble one wide DataFrame from the saved feature parts.
# NOTE(review): the block below is disabled with `if False:` — dead code as
# written. The Python-2 style `print feature_names` / `print data_df.head()`
# statements are SyntaxErrors under Python 3, so this file only parses with
# Python 2. The extent of the `if False:` suite is reconstructed here from
# context — confirm against the original file.
if False:
    print("Loading data...")

    # (dataset, part) pairs naming each saved feature block.
    split_list = [('data', 'manual'), ('profile', 'categorical'), ('profile', 'svd'),
                  ('time', 'categorical'), ('time', 'numeric'), ('od', 'categorical'),
                  ('od', 'numeric'), ('plan', 'categorical'), ('plan', 'numeric'),
                  ('plan', 'svd')]
    feature_parts = [Dataset.load_part(ds, part) for ds, part in split_list]
    feature_names = [part + '_' + ds for ds, part in split_list]

    column_names = []
    for name in feature_names:
        column_names += Dataset.get_part_features(name)

    print feature_names

    data_df = pd.DataFrame(hstack(feature_parts), columns=column_names)
    print data_df.head()


# Drop the 'click_mode' target from the column list.
# NOTE(review): this chunk ends inside split_train_val — the rest of the
# function is beyond this view.
def split_train_val(data):
    modified_array = np.delete(data.columns.values,
                               np.where(data.columns.values == 'click_mode'))
import pandas as pd
import sys

from sklearn.metrics import mean_absolute_error

from utils import Dataset, load_prediction

# Report the mean absolute error of a saved train-set prediction, broken
# down by loss-quantile bucket (five buckets split at the 20/40/60/80
# percentiles of the true loss).
frame = pd.DataFrame({'loss': Dataset.load_part('train', 'loss')},
                     index=Dataset.load_part('train', 'id'))

# Bucket boundaries at the 20/40/60/80th percentiles of the target.
edges = frame['loss'].quantile([0.2, 0.4, 0.6, 0.8]).values

# Assign each row the index of the first edge that is >= its loss; rows
# above every edge keep the default bucket len(edges). Walking the edges
# from highest to lowest lets later (smaller) edges overwrite earlier ones.
frame['bucket'] = len(edges)
for edge_idx in reversed(range(len(edges))):
    frame.loc[frame['loss'] <= edges[edge_idx], 'bucket'] = edge_idx

# Load the prediction named on the command line and print per-bucket MAE.
pred = load_prediction('train', sys.argv[1])

abs_errors = (pd.Series(pred, index=frame.index) - frame['loss']).abs()
print(abs_errors.groupby(frame['bucket']).mean())