Esempio n. 1
0
def logistic_001():
    """Cross-validate a logistic regression on every feature column separately.

    The training data comes from ``classes.get_train_data()``. The target is
    reduced to the boolean ``y > 0``, the features are passed through
    ``classes.RemoveObjectColumns`` and an ``Imputer`` (presumably
    scikit-learn's -- it is not imported in the visible code), and then a
    fresh ``LogisticRegression`` is cross-validated on each column alone.

    Returns:
        list: ``(column_index, cv_scores)`` tuples, one per column of the
        transformed feature matrix, in column order.
    """
    X, y = classes.get_train_data()
    # Binary target: True where the original value is positive.
    y = y > 0

    remove_object = classes.RemoveObjectColumns()
    X = remove_object.fit_transform(X)

    imputer = Imputer()
    X = imputer.fit_transform(X)
    scores = []

    for i in range(X.shape[1]):
        clf = LogisticRegression()
        # X[:, [i]] keeps the column 2-D, shape (n_samples, 1), which is what
        # scikit-learn estimators expect; X[:, i] would be a 1-D vector.
        # The ROC-AUC scorer is registered as 'roc_auc' ('roc' is not a
        # valid scorer name).
        s = cross_val_score(clf, X[:, [i]], y, scoring='roc_auc')
        scores.append((i, s))

    # Hand the per-column results back instead of silently discarding them.
    return scores
Esempio n. 2
0
def allfeatures_001():
    """Fit an extra-trees model on the combined features and score a hold-out.

    Steps, in order:
      1. Load the data with ``classes.get_train_data()``.
      2. Build a ``FeatureUnion`` of seven transformers behind a
         ``LastShoppingPointSelector`` filter.
      3. Split with ``classes.train_test_split``, fit the pipeline and an
         ``ExtraTreesClassifier`` on the training part, with the plan columns
         ``A``..``G`` encoded by ``classes.MultiColLabelBinarizer``.
      4. Predict on the truncated hold-out part and score the predictions
         with ``classes.score_df`` / ``classes.col_score_df``.

    The two score values are computed but neither returned nor logged; the
    function returns ``None``.
    """
    data = classes.get_train_data()

    # Named transformers that are stacked side by side by the FeatureUnion.
    feature_steps = [
        ('copy', ColumnExtractor(['group_size', 'homeowner', 'car_age',
                                  'age_oldest', 'age_youngest',
                                  'married_couple'])),
        ('day', DayTransformer()),
        ('state', StateTransformer()),
        ('car_val', FillEncoderBinarizer('car_value', 'z')),
        ('risk_factor', FillEncoderBinarizer('risk_factor', 0)),
        ('c_prev', FillEncoderBinarizer('C_previous', 0)),
        ('c_dur', FillEncoderBinarizer('duration_previous', -1)),
    ]
    # Still constructed, but deliberately left out of the union for now.
    last_plan = LastObservedPlan()
    features = FeatureUnion(feature_steps)

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features),
    ])

    train_part, test_part = classes.train_test_split(data)

    # Training side: features from the pipeline, targets from the actual plan.
    train_x = pipeline.fit_transform(train_part)
    train_plan = classes.get_actual_plan(train_part)
    train_y = classes.split_plan(train_plan)
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    # Hold-out side: the actual plans are taken before the rows are truncated.
    actuals = classes.split_plan(classes.get_actual_plan(test_part))
    test_x = pipeline.transform(classes.truncate(test_part))
    decoded = y_encoder.inverse_transform(est.predict(test_x))
    pred = classes.concatenate_plan(decoded)

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
Esempio n. 3
0
from __future__ import division
from sklearn.preprocessing import OneHotEncoder
import classes
from constants import *
import numpy as np
import pandas as pd

train_x, train_y = classes.get_train_data()

# Observed summary of train_x:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 105471 entries, 1 to 105471
# Columns: 769 entries, f1 to f778
# dtypes: float64(652), int64(97), object(20)

# The int64 columns are probably good candidates for categorical variables.
# The object columns turn out to hold really long numbers -- maybe account
# numbers or something similar?
# NOTE: compare against the builtin `object`; the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, and was only ever an alias
# for the builtin.
object_types_mask = train_x.dtypes == object
object_types = train_x.loc[:, object_types_mask]
# Bare expression: shows the first five rows when run interactively.
object_types.iloc[0:5]

# Possible leakage via the object-typed numbers?

# Actually, from the description we can see that some of these columns may be
# useful (their unique count is not very close to the total count).
desc = object_types.describe()

# IDEA - filter out columns where unique ~ count

# Select the int64 columns, to find out how many unique ints each one holds.
int_types_mask = train_x.dtypes == np.int64
int_types = train_x.loc[:, int_types_mask]
Esempio n. 4
0
from sklearn.base import BaseEstimator, TransformerMixin
import classes
import numpy as np
import pandas as pd


X, y = classes.get_train_data()

desc = X.describe()
# Bare expression: shows the first five summary columns when run interactively.
desc.iloc[:, 0:5]


# Distinct values per column. Series.unique() keeps NaN as a value, so a
# column with missing entries counts one extra.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
n_unique = [len(x.unique()) for _, x in X.items()]

# Missing values per column. This must use pd.isnull: `x == np.nan` is False
# for every element (NaN never compares equal, not even to itself), so
# summing that comparison would always give 0.
n_na = [pd.isnull(x).sum() for _, x in X.items()]



# get_train_data() is unpacked as an (X, y) pair earlier in this script, so
# unpack it here too; binding the whole return value to X would hand a tuple
# to the transformer instead of the feature frame.
X, y = classes.get_train_data()
remove_cols = RemoveNoVarianceColumn()
# Fit on the training features only, then reuse the fitted transformer for
# both the training and the test data so the two go through the same transform.
remove_cols.fit(X)
X = remove_cols.transform(X)

test_x = classes.get_test_data()
transformed_test_x = remove_cols.transform(test_x)
Esempio n. 5
0
"""
Last quoted plan benchmark
"""
import classes
from classes import logger

train = classes.get_train_data()
actuals = classes.get_actual_plan(train)

# Training scores come out a bit high, around .547-.548, while the
# leaderboard score is 0.53793. That gap of about 0.01 is pretty substantial
# in this competition.
# The truncate-and-score cycle is repeated five times and every run is logged
# (presumably truncation is randomised -- confirm in classes.truncate).
scores = []
for run_number in range(1, 6):
    shortened = classes.truncate(train)
    last_seen = classes.get_last_observed_plan(shortened)
    run_score = classes.score_df(last_seen, actuals)
    scores.append(run_score)
    logger.info("Run {}, score: {}".format(run_number, run_score))

# Build the submission from the last observed plan of the test data as-is.
test = classes.get_test_data()
pred = classes.get_last_observed_plan(test)
classes.make_submission(pred, 'benchmark_001.csv')