# -*- coding: utf-8 -*-

import pandas as pd

from utils import Dataset

# Compute combined statistical features (pairwise differences of numeric columns)
for name in ['train', 'test']:
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd

from utils import Dataset
from sklearn.preprocessing import minmax_scale

print("Loading data...")
# It took a lot of data analysis to work out which transforms make these distributions roughly normal
train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print("Scaling...")

numeric = pd.DataFrame(np.vstack((train_num, test_num)), columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)
df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"]))
df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"]))
df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"]))
df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"]))
df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"]))
df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"]))
df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"]))
df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0000.1)
df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0000.1)
df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0000.1)
df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0000.1)
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

# Example 3

def gen_profile_feas(data):
    # The head of this example is truncated in the source: `profile_data`
    # (the raw profile table) and `svd_feas` (SVD features derived from it)
    # are built in the missing lines above this point.
    svd_feas['pid'] = profile_data['pid'].values
    data['pid'] = data['pid'].fillna(-1)  # map missing (NaN) pids to -1
    data = data.merge(svd_feas, on='pid', how='left')
    limit_profile_data = profile_data[[
        'pid', 'cat_p13', 'cat_p29', 'cat_p33', 'cat_p9', 'cat_p6', 'cat_p5',
        'cat_p0'
    ]]  # these features should work well for class 0
    data = data.merge(limit_profile_data, on='pid',
                      how='left')  # merge the raw pid features back in
    return data


print("Loading data...")

data = Dataset.load_part('data', 'manual')
feature = Dataset.get_part_features('manual_data')

data_df = pd.DataFrame(data, columns=feature)

result = gen_profile_feas(data_df)
result.rename(columns={'pid': 'cat_pid'}, inplace=True)

cat_columns = [c for c in result.columns if c.startswith('cat')]
svd_columns = [c for c in result.columns if c.startswith('svd')]
print('cat_columns', cat_columns)
print('svd_columns', svd_columns)

Dataset.save_part_features('categorical_profile', cat_columns)
Dataset.save_part_features('svd_profile', svd_columns)

Dataset(categorical=result[cat_columns].values).save('profile')
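# The example is cut off here; since the 'svd_profile' feature names were
# saved above, the SVD values were presumably saved the same way (an
# assumption based on the save pattern of the other examples):
Dataset(svd=result[svd_columns].values).save('profile')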

# Example 4

import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from utils import Dataset

print("Loading data...")

min_freq = 10

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

# Turn each categorical feature into dummy columns (a single function would do);
# the output is so sparse that it is stored as a compressed sparse matrix

with tqdm(total=len(cats), desc='  Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(
            list(zip(*np.unique(train_cat[:, col], return_counts=True))))

        print(value_counts)

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
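            # The loop body is truncated in the source; a plausible completion,
            # given min_freq above, flags values that occur fewer than min_freq
            # times in train as "rare" (this is an assumption):
            if value_counts[val] < min_freq:
                train_rares[train_cat[:, col] == val] = 1
                test_rares[test_cat[:, col] == val] = 1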

# Example 5

# -*- coding: utf-8 -*-

import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from utils import Dataset

print("Loading data...")

num_features = Dataset.get_part_features('numeric')

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

train_n = train_num.shape[0]

features = []
train_res = []
test_res = []

# Just flag the special values 0 and 1 as indicator features
with tqdm(total=train_num.shape[1], desc='  Transforming',
          unit='cols') as pbar:
    for col, col_name in enumerate(num_features):
        values = np.hstack((train_num[:, col], test_num[:, col]))

        if (values == 0.0).sum() > 20:
            features.append(col_name + '_zero')
            train_res.append(
                (values[:train_n] == 0.0).astype(np.uint8).reshape(
                    (train_n, 1)))  # the source cuts off mid-line; column shape assumed
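            # A plausible continuation, mirroring the train-side zero flag for
            # the test rows and for the special value 1.0, then saving the
            # stacked flags as sparse parts. The part name 'numeric_special'
            # and the save calls are assumptions based on the other examples:
            test_res.append(
                (values[train_n:] == 0.0).astype(np.uint8).reshape((-1, 1)))

        if (values == 1.0).sum() > 20:
            features.append(col_name + '_one')
            train_res.append(
                (values[:train_n] == 1.0).astype(np.uint8).reshape((-1, 1)))
            test_res.append(
                (values[train_n:] == 1.0).astype(np.uint8).reshape((-1, 1)))

        pbar.update(1)

Dataset.save_part_features('numeric_special', features)
Dataset(numeric_special=sp.csr_matrix(np.hstack(train_res))).save('train')
Dataset(numeric_special=sp.csr_matrix(np.hstack(test_res))).save('test')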

# Example 6

import numpy as np
import pandas as pd

from tqdm import tqdm
from utils import Dataset

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8)  # one column per categorical feature
test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8)

# Integer-encode all the categorical columns (values sorted by length, then
# lexicographically, so the codes follow the natural A < Z < AA order)
with tqdm(total=train_cat.shape[1], desc='  Encoding', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        values = np.hstack((train_cat[:, col], test_cat[:, col]))
        values = np.unique(values)
        values = sorted(values, key=lambda x: (len(x), x))

        encoding = dict(zip(values, range(len(values))))  # category -> integer code

        train_cat_enc[:, col] = pd.Series(train_cat[:, col]).map(encoding).values
        test_cat_enc[:, col] = pd.Series(test_cat[:, col]).map(encoding).values

        pbar.update(1)

print("Saving...", Dataset.get_part_features('categorical'))

Dataset.save_part_features('categorical_encoded',
                           Dataset.get_part_features('categorical'))
Dataset(categorical_encoded=train_cat_enc).save('train')
Dataset(categorical_encoded=test_cat_enc).save('test')

print("Done.")

# Example 7

import numpy as np
import pandas as pd

from utils import Dataset

for name in ['train', 'test']:
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")
if False:
    print("Loading data...")

    split_list=[('data','manual'),('profile','categorical'),('profile','svd'),
                ('time','categorical'),('time','numeric'),('od','categorical'),
                ('od','numeric'),('plan','categorical'),('plan','numeric'),
                ('plan','svd')]

    feature_parts = [Dataset.load_part(ds, part) for ds,part in split_list]


    feature_names = [part+'_'+ds for ds,part in split_list]
    column_names=[]
    for name in feature_names:
        column_names += Dataset.get_part_features(name)

    print(feature_names)

    data_df = pd.DataFrame(np.hstack(feature_parts), columns=column_names)

    print(data_df.head())

    def split_train_val(data):
        modified_array = np.delete(data.columns.values, np.where(data.columns.values == 'click_mode'))
        X = data[list(modified_array)].values
        y = data[['click_mode']].values
        from sklearn.model_selection import train_test_split

        print(X.shape)
        print(y.shape)
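        # The function is truncated in the source; a plausible completion
        # (the test size and random seed below are assumptions):
        return train_test_split(X, y, test_size=0.2, random_state=42)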