Example #1
0
def allfeatures_001():
    """Fit an ExtraTrees model on the unioned features and score a hold-out split.

    Loads the training data, splits it with ``classes.train_test_split``,
    fits the feature pipeline and an ``ExtraTreesClassifier`` on the train
    part, then predicts plans for the truncated hold-out part and scores
    them against the actual plans.

    Nothing is returned: the overall and per-column scores are only bound
    to locals.
    """
    dataset = classes.get_train_data()

    # Feature steps, instantiated in the same order in which they are unioned.
    feature_steps = [
        ('copy', ColumnExtractor([
            'group_size', 'homeowner', 'car_age',
            'age_oldest', 'age_youngest', 'married_couple',
        ])),
        ('day', DayTransformer()),
        ('state', StateTransformer()),
        ('car_val', FillEncoderBinarizer('car_value', 'z')),
        ('risk_factor', FillEncoderBinarizer('risk_factor', 0)),
        ('c_prev', FillEncoderBinarizer('C_previous', 0)),
        ('c_dur', FillEncoderBinarizer('duration_previous', -1)),
    ]
    # Instantiated but currently not part of the union.
    last_observed = LastObservedPlan()
    union = FeatureUnion(feature_steps)

    model_pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', union),
    ])

    fit_part, holdout = classes.train_test_split(dataset)

    # Training inputs and targets.
    fit_x = model_pipeline.fit_transform(fit_part)
    fit_plans = classes.get_actual_plan(fit_part)
    fit_targets = classes.split_plan(fit_plans)
    plan_columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    target_encoder = classes.MultiColLabelBinarizer()
    encoded_targets = target_encoder.fit_transform(fit_targets[plan_columns])

    # A single classifier is fitted on the encoded A-G targets.
    classifier = ExtraTreesClassifier(n_estimators=100, verbose=3)
    classifier.fit(fit_x, encoded_targets)

    # Hold-out evaluation: actual plans come from the untruncated hold-out
    # frame, predictions from its truncated version.
    holdout_plans = classes.get_actual_plan(holdout)
    actual_plans = classes.split_plan(holdout_plans)
    holdout_x = model_pipeline.transform(classes.truncate(holdout))
    raw_prediction = classifier.predict(holdout_x)
    decoded = target_encoder.inverse_transform(raw_prediction)
    predicted_plans = classes.concatenate_plan(decoded)

    overall_score = classes.score_df(predicted_plans, actual_plans)
    column_scores = classes.col_score_df(predicted_plans, actual_plans)
Example #2
0
Can also include some features from the train set
"""
from __future__ import division
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import classes

logger = logging.getLogger('allstate')

logger.info("Loading data")
data = classes.get_train_data()
train, test = classes.train_test_split(data)

# Feature G has the lowest accuracy, so let's use all the other features to predict G.
# G takes values of 1, 2, 3, and 4, so the response needs to be one-hot encoded.

logger.info("Transforming data")
# Transform the data into something that we can give to a learning algorithm.
# Target: column G of the rows with record_type == 1
# (presumably the purchase rows; confirm against the dataset description).
train_y = train.loc[train['record_type'] == 1, 'G']
# Inputs: columns A-G taken from the record_type == 0 rows via
# classes.get_last_observed_point.
train_data = train.loc[train['record_type'] == 0, ['customer_ID', 'shopping_pt', 'record_type'] + list('ABCDEFG')]
train_x = classes.get_last_observed_point(train_data)[list('ABCDEFG')]

# Responses need to be encoded to binary columns.
y_encoder = OneHotEncoder()
# Series.reshape is deprecated/removed in pandas, so reshape the underlying
# ndarray into the single-column 2-D array the encoder expects.
train_y = y_encoder.fit_transform(train_y.values.reshape(-1, 1)).toarray()

# train_x is a DataFrame with columns A-G