import numpy as np

from tqdm import tqdm
from scipy.stats import rankdata
from scipy.special import erfinv
from sklearn.preprocessing import minmax_scale, scale

from util import Dataset

# 'lim' (a constant just below 1, keeping scaled values inside erfinv's
# open domain) is defined earlier in the original script.

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        values = minmax_scale(values, feature_range=(-lim, lim))

        # Make gaussian
        values = scale(erfinv(values))

        train_num_enc[:, col] = values[:train_num.shape[0]]
        test_num_enc[:, col] = values[train_num.shape[0]:]

        pbar.update(1)

print("Saving...")

Dataset.save_part_features('numeric_mean_rank_norm', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_rank_norm=train_num_enc).save('train')
Dataset(numeric_mean_rank_norm=test_num_enc).save('test')

print("Done.")
Example 2
import pandas as pd
from sklearn.preprocessing import scale

# 'all' (train and test rows concatenated) and 'train_cat'/'test_cat' are
# built earlier in the original script; note that 'all' shadows the builtin.
custom = pd.DataFrame()

custom["inc/lAmount"] = all.ApplicantIncome / all.LoanAmount
custom["prop_area_LAmount"] = all.Property_Area * all.LoanAmount
custom["chist_loanamnt"] = all.Credit_History * all.LoanAmount
custom["chist_loanamnt_term"] = all.Credit_History * all.LoanAmount / all.Loan_Amount_Term
custom["allIncome"] = all.ApplicantIncome + all.CoapplicantIncome
custom["netIncome"] = custom["allIncome"] - (all.LoanAmount / all.Loan_Amount_Term)
custom["depend_chist"] = all.Dependents * all.Credit_History
custom["married_chist"] = all.Credit_History * all.Married
all_scaled = scale(custom)
#all_scaled = custom

train_custom = all_scaled[:train_cat.shape[0]]
test_custom = all_scaled[train_cat.shape[0]:]
print(train_cat.head())
print(test_custom.shape)
print(test_cat.shape)
print(list(custom.columns))

Dataset.save_part_features('custom', list(custom.columns))
Dataset(custom=train_custom).save('train')
Dataset(custom=test_custom).save('test')
print("Done.")
import numpy as np

from scipy.sparse import hstack, vstack
from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD

from util import Dataset

# 'n_components' is a parameter defined earlier in the original script.

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print "Combining data..."

all_data = hstack((scale(vstack(
    (train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

del train_num, train_cat, test_num, test_cat

print "Fitting svd..."

svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)

print "Explained variance ratio: %.5f" % np.sum(svd.explained_variance_ratio_)

print "Saving..."

Dataset.save_part_features('svd', ['svd%d' % i for i in xrange(n_components)])
Dataset(svd=res[:train_cnt]).save('train')
Dataset(svd=res[train_cnt:]).save('test')

print "Done."
Example 4
# (snippet truncated by the source page: the surrounding loop walks the
# categorical columns and their values, appending a dummy column for each
# frequent value and pooling infrequent ones into a per-column rare indicator)
                            (train_cat.shape[0], 1))))
                test_cat_enc.append(
                    sp.csr_matrix(
                        (test_cat[:, col] == val).astype(np.uint8).reshape(
                            (test_cat.shape[0], 1))))
            else:
                train_rares += (train_cat[:, col] == val).astype(np.uint8)
                test_rares += (test_cat[:, col] == val).astype(np.uint8)

        if train_rares.sum() > 0 and test_rares.sum() > 0:
            features.append('%s_rare' % cat)
            train_cat_enc.append(
                sp.csr_matrix(train_rares.reshape((train_cat.shape[0], 1))))
            test_cat_enc.append(
                sp.csr_matrix(test_rares.reshape((test_cat.shape[0], 1))))

        pbar.update(1)

print "Created %d dummy vars" % len(features)

print "Saving..."

train_cat_enc = sp.hstack(train_cat_enc, format='csr')
test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('categorical_dummy', features)
Dataset(categorical_dummy=train_cat_enc).save('train')
Dataset(categorical_dummy=test_cat_enc).save('test')

print "Done."
Example 5
import numpy as np

from tqdm import tqdm
from scipy.stats import skew, boxcox

from util import Dataset

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))
        sk = skew(values)

        if sk > 0.25:
            values_enc, lam = boxcox(values+1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)

print("Saving...")

Dataset.save_part_features('numeric_mean_boxcox', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_boxcox=train_num_enc).save('train')
Dataset(numeric_mean_boxcox=test_num_enc).save('test')

print("Done.")
Example 6
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    idx = Dataset.load_part(name, 'id')

    # Load parts
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print "Done."
import pandas as pd
import numpy as np

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name
    data = pd.read_csv('../input/lin_%s.csv' % name)

    # Save column names
    if name == 'train':
        num_columns = [c for c in data.columns if c.startswith('cont')]

        Dataset.save_part_features('numeric_lin', num_columns)

    Dataset(numeric_lin=data[num_columns].values.astype(np.float32)).save(name)

print "Done."
# (snippet truncated by the source page: as in Example 4, the surrounding loop
# builds dummy columns for frequent categorical values and a rare indicator)
                            (train_cat.shape[0], 1))))
                test_cat_enc.append(
                    sp.csr_matrix(
                        (test_cat[:, col] == val).astype(np.uint8).reshape(
                            (test_cat.shape[0], 1))))
            else:
                train_rares += (train_cat[:, col] == val).astype(np.uint8)
                test_rares += (test_cat[:, col] == val).astype(np.uint8)

        if train_rares.sum() > 0 and test_rares.sum() > 0:
            features.append('%s_rare' % cat)
            train_cat_enc.append(
                sp.csr_matrix(train_rares.reshape((train_cat.shape[0], 1))))
            test_cat_enc.append(
                sp.csr_matrix(test_rares.reshape((test_cat.shape[0], 1))))

        pbar.update(1)

print("Created %d dummy vars" % len(features))

print("Saving...")

train_cat_enc = sp.hstack(train_cat_enc, format='csr')
test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('cat_man_dummy', features)
Dataset(cat_man_dummy=train_cat_enc).save('train')
Dataset(cat_man_dummy=test_cat_enc).save('test')

print("Done.")
Example 9
idx = Dataset.load_part("test", 'id')

test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)


all_nData = pd.concat((train_num, test_num))
print(all_nData.head())

all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = (np.log1p(all_nData.LoanAmount))
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
print(train_num.head())
print(test_custom.shape)
print(test_num.shape)
print(list(all_num_norm.columns))

Dataset.save_part_features('num_log1', list(all_num_norm.columns))
Dataset(num_log1=train_custom).save('train')
Dataset(num_log1=test_custom).save('test')
print("Done.")
Example 10
import numpy as np
import pandas as pd

from sklearn.preprocessing import minmax_scale

from util import Dataset

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print "Scaling..."

numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)
df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"]))
df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"]))
df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"]))
df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"]))
df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"]))
df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"]))
df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"]))
df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0000.1)
df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0000.1)
df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0000.1)
df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0000.1)
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122)**0.25

print "Saving..."

Dataset.save_part_features('numeric_unskew', list(df.columns))
Dataset(numeric_unskew=df.values[:train_num.shape[0]]).save('train')
Dataset(numeric_unskew=df.values[train_num.shape[0]:]).save('test')

print "Done."
Example 11
import numpy as np
import pandas as pd

from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_cat = Dataset.load_part('train', 'categorical_mode')
test_cat = Dataset.load_part('test', 'categorical_mode')

train_cat_counts = np.zeros(train_cat.shape, dtype=np.float32)
test_cat_counts = np.zeros(test_cat.shape, dtype=np.float32)

with tqdm(total=train_cat.shape[1], desc='  Counting', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        train_series = pd.Series(train_cat[:, col])
        test_series = pd.Series(test_cat[:, col])

        counts = pd.concat((train_series, test_series)).value_counts()
        train_cat_counts[:, col] = train_series.map(counts).values
        test_cat_counts[:, col] = test_series.map(counts).values
        pbar.update(1)

print("Saving...")

Dataset.save_part_features('categorical_counts',
                           Dataset.get_part_features('categorical'))
Dataset(categorical_counts=train_cat_counts).save('train')
Dataset(categorical_counts=test_cat_counts).save('test')

print("Done.")
print "Combining data..."

all_data = hstack((scale(vstack(
    (train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

for n_clusters in [25, 50, 75, 100, 200]:
    part_name = 'cluster_rbf_%d' % n_clusters

    print "Finding %d clusters..." % n_clusters

    kmeans = MiniBatchKMeans(n_clusters,
                             random_state=17 * n_clusters + 11,
                             n_init=5)
    kmeans.fit(all_data)

    print "Transforming data..."

    cluster_rbf = np.exp(-gamma * kmeans.transform(all_data))

    print "Saving..."

    Dataset.save_part_features(
        part_name,
        ['cluster_rbf_%d_%d' % (n_clusters, i) for i in range(n_clusters)])
    Dataset(**{part_name: cluster_rbf[:train_num.shape[0]]}).save('train')
    Dataset(**{part_name: cluster_rbf[train_num.shape[0]:]}).save('test')

print "Done."
# Note: fitting a fresh LabelEncoder on train and test separately only yields
# consistent codes if both contain the same set of levels.
train_cleaned.loc[:, cat_columns] = train_cleaned[cat_columns].apply(
    LabelEncoder().fit_transform)
test_cleaned.loc[:, cat_columns] = test_cleaned[cat_columns].apply(
    LabelEncoder().fit_transform)

Dataset(cat_manual=train_cleaned[cat_columns].values).save("train")
Dataset(num_manual=train_cleaned[num_columns].values.astype(np.float32)).save(
    "train")
Dataset(id=train_cleaned[id]).save("train")

Dataset(cat_manual=test_cleaned[cat_columns].values).save("test")
Dataset(num_manual=test_cleaned[num_columns].values.astype(np.float32)).save(
    "test")
Dataset(id=test_cleaned[id]).save("test")

Dataset.save_part_features('cat_manual',
                           Dataset.get_part_features('categorical_mode'))
Dataset.save_part_features('num_manual',
                           Dataset.get_part_features('numeric_mean'))

le = LabelEncoder()
le.fit(train_cleaned[target])
print(le.transform(train_cleaned[target]))
Dataset(target=le.transform(train_cleaned[target])).save("train")
Dataset(target_labels=le.classes_).save("train")

Example 14
# (snippet truncated by the source page: this block runs inside a per-dataset
# loop with 'data', 'target' and 'id' defined as in Example 17 below)
    # Save column names

    num_columns = [
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term'
    ]
    cat_columns = [
        c for c in data.columns if (c not in [*num_columns, target, id])
    ]

    cData_mode = data[cat_columns].copy()
    cData_mode['Credit_History'] = cData_mode['Credit_History'].fillna(1.0)
    cData_mode = cData_mode.fillna("X")

    cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

    Dataset.save_part_features('categorical_na_new',
                               Dataset.get_part_features('categorical_mode'))
    Dataset(categorical_na_new=cData_mode.values).save(name)

    Dataset(id=data[id]).save(name)

    if target in data.columns:
        le = LabelEncoder()
        le.fit(data[target])
        print(le.transform(data[target]))
        Dataset(target=le.transform(data[target])).save(name)
        Dataset(target_labels=le.classes_).save(name)

print("Done.")
Example 15
print "Loading data..."

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8)
test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8)

with tqdm(total=train_cat.shape[1], desc='  Encoding', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        values = np.hstack((train_cat[:, col], test_cat[:, col]))
        values = np.unique(values)
        values = sorted(values, key=lambda x: (len(x), x))

        encoding = dict(zip(values, range(len(values))))

        train_cat_enc[:, col] = pd.Series(train_cat[:, col]).map(encoding).values
        test_cat_enc[:, col] = pd.Series(test_cat[:, col]).map(encoding).values

        pbar.update(1)

print "Saving..."

Dataset.save_part_features('categorical_encoded',
                           Dataset.get_part_features('categorical'))
Dataset(categorical_encoded=train_cat_enc).save('train')
Dataset(categorical_encoded=test_cat_enc).save('test')

print "Done."
Example 16


# all_s["app_s"] = all_s["ApplicantIncome"] * all_s["Self_Employed"]
# all_s["ci_s"] = all_s["CoapplicantIncome"] * all_s["Self_Employed"]
# all_s["la_s"] = all_s["LoanAmount"] * all_s["Self_Employed"]
# all_s["lat_s"] = all_s["Loan_Amount_Term"] * all_s["Self_Employed"]

features_to_drop = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

all_filtered = all_s.drop(features_to_drop, axis=1)

print(train.columns)
# ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
#        'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
#        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

train_cust = all_filtered[:ntrain]
test_cust = all_filtered[ntrain:]

print(all_s.columns)

Dataset.save_part_features('fSelect', list(all_filtered.columns))
Dataset(fSelect=train_cust.values).save('train')
Dataset(fSelect=test_cust.values).save('test')
print("Done.")
Example 17
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from util import Dataset

# the per-dataset loop head is implied by the parallel snippets above
for name in ['train', 'test']:
    data = pd.read_csv('input/%s.csv' % name)
    target = "Loan_Status"
    id = "Loan_ID"
    # Save column names
    if name == 'train':
        num_columns = [
            'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
            'Loan_Amount_Term'
        ]
        cat_columns = [
            c for c in data.columns if (c not in [*num_columns, target, id])
        ]

        print(cat_columns)
        print(num_columns)
        Dataset.save_part_features('categorical_mode', cat_columns)
        Dataset.save_part_features('numeric_mean', num_columns)
        Dataset.save_part_features('numeric_median', num_columns)

    cData_mode = data[cat_columns].copy()
    cat_mode = cData_mode.mode().iloc[0]
    cData_mode.fillna(cat_mode, inplace=True)

    cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

    #print(cat_data.isnull().sum())
    numData_mean = data[num_columns].copy()
    num_mean = numData_mean.mean()
    numData_mean.fillna(num_mean, inplace=True)

    numData_median = data[num_columns].copy()
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.preprocessing import scale

from util import Dataset

# (Example 17 is cut off above by the source page; a new snippet starts here)
train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')
ntrain = train_num.shape[0]

train_test = np.vstack([train_num, test_num])
num_features = Dataset.get_part_features('numeric')
num_comb_df = pd.DataFrame()

with tqdm(total=len(num_features) * (len(num_features) - 1) // 2,
          desc='  Transforming', unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]

        # assumes the columns of train_test line up with num_features,
        # so the feature index can be used directly
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])] +
                             train_test[:, num_features.index(comb[1])])
        pbar.update(1)

print("Saving...")
print(num_comb_df.shape)
Dataset.save_part_features('numeric_comb', list(num_comb_df.columns))

num_comb_scaled = pd.DataFrame(scale(num_comb_df), columns=num_comb_df.columns)

train_num_comb = num_comb_scaled[:ntrain]
test_num_comb = num_comb_scaled[ntrain:]

Dataset(numeric_comb=train_num_comb.values).save('train')
Dataset(numeric_comb=test_num_comb.values).save('test')

print("Done.")
Example 19
import numpy as np

from util import Dataset
from sklearn.preprocessing import scale

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

print("Scaling...")

all_scaled = scale(np.vstack((train_num, test_num)))

print("Saving...")

Dataset.save_part_features('numeric_mean_scaled',
                           Dataset.get_part_features('numeric'))
Dataset(numeric_mean_scaled=all_scaled[:train_num.shape[0]]).save('train')
Dataset(numeric_mean_scaled=all_scaled[train_num.shape[0]:]).save('test')

print("Done.")
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print "Done."
Example 21
import pandas as pd
import numpy as np

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name
    data = pd.read_csv('../input/%s.csv.zip' % name)

    # Save column names
    if name == 'train':
        cat_columns = [c for c in data.columns if c.startswith('cat')]
        num_columns = [c for c in data.columns if c.startswith('cont')]

        Dataset.save_part_features('categorical', cat_columns)
        Dataset.save_part_features('numeric', num_columns)

    Dataset(categorical=data[cat_columns].values).save(name)
    Dataset(numeric=data[num_columns].values.astype(np.float32)).save(name)
    Dataset(id=data['id']).save(name)

    if 'loss' in data.columns:
        Dataset(loss=data['loss']).save(name)

print "Done."