Beispiel #1
0
def allfeatures_001():
    """Train and evaluate an ExtraTrees model on the combined feature union.

    Loads the training data, splits off a hold-out set, fits the feature
    pipeline and a 100-tree ``ExtraTreesClassifier`` against the encoded
    A-G plan columns, then scores predictions made on the truncated
    hold-out set.

    Returns:
        tuple: ``(score, scores)`` -- the values returned by
        ``classes.score_df`` and ``classes.col_score_df`` for the hold-out
        predictions.
    """
    train = classes.get_train_data()

    # One transformer per feature group; the string is the FeatureUnion step name.
    # NOTE(review): the second FillEncoderBinarizer argument is presumably the
    # fill value for missing entries -- confirm against its definition.
    features = FeatureUnion([
        ('copy', ColumnExtractor(['group_size', 'homeowner', 'car_age',
                                  'age_oldest', 'age_youngest', 'married_couple'])),
        ('day', DayTransformer()),
        ('state', StateTransformer()),
        ('car_val', FillEncoderBinarizer('car_value', 'z')),
        ('risk_factor', FillEncoderBinarizer('risk_factor', 0)),
        ('c_prev', FillEncoderBinarizer('C_previous', 0)),
        ('c_dur', FillEncoderBinarizer('duration_previous', -1)),
        # ('last_plan', LastObservedPlan()),  # currently disabled
    ])

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features),
    ])

    # From here on `train` is only the training part of the split.
    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)
    train_y = classes.split_plan(classes.get_actual_plan(train))
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])

    # A single estimator is fitted against the encoded targets of all seven
    # plan columns at once.
    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    # Ground truth is taken from the full hold-out set before it is truncated.
    actuals = classes.split_plan(classes.get_actual_plan(test))
    test_x = pipeline.transform(classes.truncate(test))
    pred = classes.concatenate_plan(y_encoder.inverse_transform(est.predict(test_x)))

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
    # Hand the evaluation back to the caller instead of discarding it.
    return score, scores
Beispiel #2
0
"""
Last quoted plan benchmark
"""
import classes
from classes import logger

# Load the training data once; the actual plans are the scoring target.
train = classes.get_train_data()
actuals = classes.get_actual_plan(train)

# Score seems to be a bit high on training, about .547-.548.
# Leaderboard score is 0.53793, so there is roughly a 0.01 difference, which is
# pretty substantial in this competition.
# Repeat the truncate-and-score cycle five times and keep every score.
scores = []
for run_number in range(1, 6):
    prediction = classes.get_last_observed_plan(classes.truncate(train))
    score = classes.score_df(prediction, actuals)
    logger.info("Run {}, score: {}".format(run_number, score))
    scores.append(score)

# Build the submission from the last observed plan in the test data.
pred = classes.get_last_observed_plan(classes.get_test_data())
classes.make_submission(pred, 'benchmark_001.csv')
Beispiel #3
0
# Responses need to be encoded to binary columns.
# NOTE(review): train_y is defined earlier in the script; if it is a pandas
# Series rather than a NumPy array it needs `.values` before `.reshape`, in the
# same way as test_y further down -- confirm.
y_encoder = OneHotEncoder()
train_y = y_encoder.fit_transform(train_y.reshape((train_y.shape[0], 1))).toarray()

# train_x is a df with the plan option columns (presumably A-G, matching the
# columns selected for test_x below -- confirm).
# Encode each column of train_x as a one-hot binary column.
f_encoder = OneHotEncoder()
est = RandomForestClassifier(n_estimators=150, verbose=3, oob_score=True)
train_x = f_encoder.fit_transform(train_x).toarray()
# OOB score is 0.93
logger.info("Training classifier")
est.fit(train_x, train_y)

logger.info("Transforming test data")
# Rows with record_type == 1 supply the target column G for the test customers.
test_y = test.loc[test['record_type'] == 1, 'G']
test_data = classes.truncate(test)
test_x = classes.get_last_observed_point(test_data)[['customer_ID'] + list('ABCDEFG')]

# Establish a baseline for what the accuracy of the last-observed on a truncated set is
last_obs = classes.concatenate_plan(test_x)[['customer_ID', 'plan']]
actuals = classes.concatenate_plan(test.loc[test['record_type'] == 1])[['customer_ID', 'plan']]
score = classes.score_df(last_obs, actuals)
scores = classes.col_score_df(classes.split_plan(last_obs), classes.split_plan(actuals))

# test_y is a pandas Series (one column selected with .loc). Series.reshape is
# not available in current pandas, so reshape the underlying NumPy array to the
# (n_samples, 1) layout the encoder was fitted with.
test_y = y_encoder.transform(test_y.values.reshape((test_y.shape[0], 1))).toarray()
test_x = f_encoder.transform(test_x[list('ABCDEFG')]).toarray()

pred = est.predict(test_x)
# This gives what we want, but some rows don't have a prediction
# Iterate over each row of the prediction.  If the row doesn't have a 1 in some column, then set it to the last obs value
for i, row in enumerate(pred):