Example #1
0
def allfeatures_001():
    """Fit an ExtraTreesClassifier on the combined feature set and score it.

    The function loads the training data, splits it in two with
    ``classes.train_test_split``, fits a filter + feature-union pipeline and
    a multi-column label encoder on the first part, trains the classifier,
    and then predicts plans for the second part after passing it through
    ``classes.truncate``.

    Nothing is returned: the overall score and the per-column scores are
    computed and bound to locals only.
    NOTE(review): confirm whether the scores were meant to be logged or
    returned.
    """
    full_data = classes.get_train_data()

    # Named steps for the feature union; the transformers are instantiated
    # top to bottom.
    feature_steps = [
        ('copy', ColumnExtractor(['group_size', 'homeowner', 'car_age', 'age_oldest', 'age_youngest', 'married_couple'])),
        ('day', DayTransformer()),
        ('state', StateTransformer()),
        ('car_val', FillEncoderBinarizer('car_value', 'z')),
        ('risk_factor', FillEncoderBinarizer('risk_factor', 0)),
        ('c_prev', FillEncoderBinarizer('C_previous', 0)),
        ('c_dur', FillEncoderBinarizer('duration_previous', -1)),
    ]
    # Instantiated but intentionally not registered in the union above.
    last_plan = LastObservedPlan()

    union = FeatureUnion(feature_steps)
    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', union),
    ])

    # From here on only the first part of the split is used for fitting.
    fit_part, holdout = classes.train_test_split(full_data)
    fit_x = pipeline.fit_transform(fit_part)
    fit_plans = classes.split_plan(classes.get_actual_plan(fit_part))
    y_encoder = classes.MultiColLabelBinarizer()
    fit_y = y_encoder.fit_transform(fit_plans[list('ABCDEFG')])

    # NOTE(review): the targets are built from all of the columns A-G, not
    # from a single column.
    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(fit_x, fit_y)

    # Actual plans come from the untouched hold-out part; the features come
    # from its truncated version, transformed by the already-fitted pipeline.
    actuals = classes.split_plan(classes.get_actual_plan(holdout))
    holdout_x = pipeline.transform(classes.truncate(holdout))
    encoded_pred = est.predict(holdout_x)
    pred = classes.concatenate_plan(y_encoder.inverse_transform(encoded_pred))

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
Example #2
0
"""
Last quoted plan benchmark
"""
import classes
from classes import logger

# Load the training set once; the actual plans are the scoring reference
# for every run below.
train = classes.get_train_data()
actuals = classes.get_actual_plan(train)

# Observed results: the training score comes out a bit high, roughly
# .547-.548, while the leaderboard score is 0.53793. That gap of about 0.01
# is pretty substantial in this competition.
scores = []
for run_idx in range(5):
    # Truncate the training data afresh on every run, then take the last
    # observed plan of the truncated data as the prediction.
    last_quoted = classes.get_last_observed_plan(classes.truncate(train))
    run_score = classes.score_df(last_quoted, actuals)
    scores.append(run_score)
    logger.info("Run {}, score: {}".format(run_idx + 1, run_score))

# The submission uses the last observed plan of the test data as it is.
test = classes.get_test_data()
pred = classes.get_last_observed_plan(test)

classes.make_submission(pred, 'benchmark_001.csv')