Example #1
    svd_feas.columns = ['svd_profile_fea_{}'.format(i) for i in range(10)]
    svd_feas['pid'] = profile_data['pid'].values
    data['pid'] = data['pid'].fillna(-1)  # map missing (NaN) pid to -1
    data = data.merge(svd_feas, on='pid', how='left')
    limit_profile_data = profile_data[[
        'pid', 'cat_p13', 'cat_p29', 'cat_p33', 'cat_p9', 'cat_p6', 'cat_p5',
        'cat_p0'
    ]]  # these raw profile features should help most with class 0
    data = data.merge(limit_profile_data, on='pid',
                      how='left')  # add the original pid profile features
    return data


print("Loading data...")

data = Dataset.load_part('data', 'manual')
feature = Dataset.get_part_features('manual_data')

data_df = pd.DataFrame(data, columns=feature)

result = gen_profile_feas(data_df)
result.rename(columns={'pid': 'cat_pid'}, inplace=True)

cat_columns = [c for c in result.columns if c.startswith('cat')]
svd_columns = [c for c in result.columns if c.startswith('svd')]
print('cat_columns', cat_columns)
print('svd_columns', svd_columns)

Dataset.save_part_features('categorical_profile', cat_columns)
Dataset.save_part_features('svd_profile', svd_columns)
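
The svd_profile_fea_* columns merged above have to be built upstream, and the snippet is truncated before that step. A minimal sketch of the usual construction, assuming profile_data is a table of binary profile indicators keyed by pid (the component count matches the column names above; everything else is an assumption):

import pandas as pd
from sklearn.decomposition import TruncatedSVD

def build_svd_profile_feas(profile_data, n_components=10):
    # All columns except 'pid' are assumed to be 0/1 profile indicators.
    x = profile_data.drop(columns=['pid']).values
    svd = TruncatedSVD(n_components=n_components, random_state=2019)
    svd_feas = pd.DataFrame(svd.fit_transform(x))
    svd_feas.columns = ['svd_profile_fea_{}'.format(i) for i in range(n_components)]
    svd_feas['pid'] = profile_data['pid'].values
    return svd_feas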
Example #2

# -*- coding: utf-8 -*-

import pandas as pd

from utils import Dataset

# build combination features: pairwise differences of numeric columns
for name in ['train', 'test']:
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)
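
The eleven pairs above look hand-picked; a common way to shortlist such candidates is by absolute correlation. A minimal sketch (the 0.7 threshold is an assumption, not from the original; num holds whichever split the loop processed last):

# List strongly correlated numeric column pairs as difference-feature candidates.
corr = num.corr().abs()
cols = list(corr.columns)
pairs = [(a, b) for i, a in enumerate(cols)
         for b in cols[i + 1:] if corr.loc[a, b] > 0.7]
print(pairs)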
Example #3

# -*- coding: utf-8 -*-
import numpy as np

from utils import Dataset, vstack, hstack

from sklearn.preprocessing import scale
from sklearn.cluster import MiniBatchKMeans

np.random.seed(1234)

gamma = 1.0

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

print("Combining data...")
# vstack concatenates by rows, hstack by columns:
# stack train and test together, then run k-means on the combined matrix

all_data = hstack((
    scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32),
    vstack((train_cat, test_cat)),
))

for n_clusters in [25, 50, 75, 100, 200]:
    part_name = 'cluster_rbf_%d' % n_clusters

    print("Finding %d clusters..." % n_clusters)
Example #4

# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd

from utils import Dataset
from sklearn.preprocessing import minmax_scale

print("Loading data...")
# these per-feature transforms came out of extensive exploratory analysis of the raw distributions
train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print("Scaling...")

numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)
df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"]))
df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"]))
df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"]))
df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"]))
df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"]))
df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"]))
df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"]))
df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0000.1)
df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0000.1)
df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0000.1)
df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0000.1)
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122) ** 0.25
Example #5
# -*- coding: utf-8 -*-
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from utils import Dataset

print("Loading data...")

min_freq = 10

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

# One-hot encode every categorical feature; the output is very sparse, so build it
# as a compressed sparse matrix and collapse rare values.

with tqdm(total=len(cats), desc='  Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(zip(*np.unique(train_cat[:, col], return_counts=True)))

        print(value_counts)

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)
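
        # Truncated in the source; a hedged sketch of the usual continuation:
        # one-hot encode each frequent value, and fold values with fewer than
        # min_freq training occurrences into a shared 'rare' indicator column.
        for val, cnt in value_counts.items():
            if cnt < min_freq:
                train_rares[train_cat[:, col] == val] = 1
                test_rares[test_cat[:, col] == val] = 1
            else:
                features.append('%s_%s' % (cat, val))
                train_cat_enc.append(sp.csr_matrix(
                    (train_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))
                test_cat_enc.append(sp.csr_matrix(
                    (test_cat[:, col] == val).astype(np.uint8).reshape(-1, 1)))

        features.append('%s_rare' % cat)
        train_cat_enc.append(sp.csr_matrix(train_rares.reshape(-1, 1)))
        test_cat_enc.append(sp.csr_matrix(test_rares.reshape(-1, 1)))

        pbar.update(1)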
Example #6
import pandas as pd

from utils import Dataset

for name in ['train', 'test']:
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")
Example #7

import pandas as pd
import numpy as np

import sys

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from statsmodels.regression.quantile_regression import QuantReg

from utils import Dataset

pred_name = sys.argv[1]

n_folds = 8

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(
        KFold(n_splits=n_folds, shuffle=True, random_state=2016).split(train_x)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]

    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)
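    # Truncated in the source; a hedged sketch of the likely continuation:
    # apply the fitted median (q=0.5) regression to rescale this fold's
    # predictions, then record MAE before and after the correction.
    fold_corr_x = model.predict(fold_eval_x)

    orig_maes.append(mean_absolute_error(fold_eval_y, fold_eval_x))
    corr_maes.append(mean_absolute_error(fold_eval_y, fold_corr_x))

print("Original MAE: %.5f, corrected MAE: %.5f" %
      (np.mean(orig_maes), np.mean(corr_maes)))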
Example #8

import pandas as pd
import numpy as np

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

from utils import Dataset, hstack

if False:
    print("Loading data...")

    split_list = [('data', 'manual'), ('profile', 'categorical'), ('profile', 'svd'),
                  ('time', 'categorical'), ('time', 'numeric'), ('od', 'categorical'),
                  ('od', 'numeric'), ('plan', 'categorical'), ('plan', 'numeric'),
                  ('plan', 'svd')]

    feature_parts = [Dataset.load_part(ds, part) for ds, part in split_list]

    feature_names = [part + '_' + ds for ds, part in split_list]
    column_names = []
    for name in feature_names:
        column_names += Dataset.get_part_features(name)

    print(feature_names)

    data_df = pd.DataFrame(hstack(feature_parts), columns=column_names)

    print(data_df.head())

    def split_train_val(data):
        modified_array = np.delete(data.columns.values, np.where(data.columns.values == 'click_mode'))
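        # Truncated in the source; a hedged guess at the intent: use the
        # remaining columns as features and 'click_mode' as the target (how
        # the train/validation rows were split is not recoverable here).
        x = data[modified_array]
        y = data['click_mode']
        return x, y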
Example #9
import pandas as pd
import sys

from sklearn.metrics import mean_absolute_error

from utils import Dataset, load_prediction

df = pd.DataFrame({'loss': Dataset.load_part('train', 'loss')},
                  index=Dataset.load_part('train', 'id'))

# Bucket the target by its 20/40/60/80 % quantiles: bucket 0 holds the
# smallest losses, bucket len(edges) the largest.
edges = df['loss'].quantile([0.2, 0.4, 0.6, 0.8]).values

df['bucket'] = len(edges)
for i in reversed(range(len(edges))):
    df.loc[df['loss'] <= edges[i], 'bucket'] = i

pred = load_prediction('train', sys.argv[1])

errs = (pd.Series(pred, index=df.index) - df['loss']).abs()

print(errs.groupby(df['bucket']).mean())
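
As a sanity check (not part of the original script), the bucketing loop above is equivalent to a right-closed pd.cut:

import numpy as np

alt = pd.cut(df['loss'], [-np.inf] + list(edges) + [np.inf], labels=False)
assert (alt == df['bucket']).all()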