from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import FeatureUnion, Pipeline

import classes
# The custom transformers below are assumed to live in the local classes module
from classes import (ColumnExtractor, DayTransformer, FillEncoderBinarizer,
                     LastObservedPlan, LastShoppingPointSelector,
                     StateTransformer)


def allfeatures_001():
    """Fit a multi-label ExtraTreesClassifier on one-hot features built from
    each customer's last shopping point, then score against the actual plans."""
    train = classes.get_train_data()

    # Feature transformers
    copy = ColumnExtractor(['group_size', 'homeowner', 'car_age',
                            'age_oldest', 'age_youngest', 'married_couple'])
    day = DayTransformer()
    state = StateTransformer()
    car_val = FillEncoderBinarizer('car_value', 'z')
    risk_factor = FillEncoderBinarizer('risk_factor', 0)
    c_prev = FillEncoderBinarizer('C_previous', 0)
    c_dur = FillEncoderBinarizer('duration_previous', -1)
    last_plan = LastObservedPlan()

    features = FeatureUnion([
        ('copy', copy),
        ('day', day),
        ('state', state),
        ('car_val', car_val),
        ('risk_factor', risk_factor),
        ('c_prev', c_prev),
        ('c_dur', c_dur),
        # ('last_plan', last_plan)
    ])

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features)
    ])

    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)
    train_y = classes.split_plan(classes.get_actual_plan(train))

    # Binarize all seven response columns A-G for multi-label training
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    actuals = classes.split_plan(classes.get_actual_plan(test))
    test_x = classes.truncate(test)
    test_x = pipeline.transform(test_x)
    pred = classes.concatenate_plan(y_encoder.inverse_transform(est.predict(test_x)))
    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
    return score, scores
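
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what classes.MultiColLabelBinarizer could
# look like. This is an assumption, not the project's actual implementation:
# one sklearn LabelBinarizer per column, with the binary indicator blocks
# stacked side by side and inverse_transform splitting them back apart.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer


class MultiColLabelBinarizerSketch(object):

    def fit(self, df):
        self.columns_ = list(df.columns)
        self.encoders_ = {c: LabelBinarizer().fit(df[c]) for c in self.columns_}
        return self

    def transform(self, df):
        # Horizontally stack one indicator block per column
        return np.hstack([self.encoders_[c].transform(df[c])
                          for c in self.columns_])

    def fit_transform(self, df):
        return self.fit(df).transform(df)

    def inverse_transform(self, y):
        # Walk the wide 0/1 matrix, handing each block back to its encoder.
        # Note LabelBinarizer emits a single column for binary targets.
        out, start = {}, 0
        for c in self.columns_:
            width = len(self.encoders_[c].classes_)
            width = 1 if width == 2 else width
            out[c] = self.encoders_[c].inverse_transform(y[:, start:start + width])
            start += width
        return pd.DataFrame(out)
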
""" Last quoted plan benchmark """ import classes from classes import logger train = classes.get_train_data() actuals = classes.get_actual_plan(train) scores = [] # Score seems to be a bit high on training, about .547-.548 # Leaderboard score is 0.53793, so seems like 0.01 difference, which is pretty substantial in this competition for n in range(5): truncated = classes.truncate(train) prediction = classes.get_last_observed_plan(truncated) score = classes.score_df(prediction, actuals) scores.append(score) logger.info("Run {}, score: {}".format(n+1, score)) test = classes.get_test_data() pred = classes.get_last_observed_plan(test) classes.make_submission(pred, 'benchmark_001.csv')
# Fragment from a larger experiment script; train_x, train_y and test are
# assumed to be defined earlier.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import classes
from classes import logger

# Responses need to be encoded to binary columns
y_encoder = OneHotEncoder()
train_y = y_encoder.fit_transform(np.asarray(train_y).reshape(-1, 1)).toarray()

# train_x is a DataFrame with columns A-G.
# Encode each column of train_x as a one-hot binary column.
f_encoder = OneHotEncoder()
train_x = f_encoder.fit_transform(train_x).toarray()

est = RandomForestClassifier(n_estimators=150, verbose=3, oob_score=True)
logger.info("Training classifier")
# OOB score is 0.93
est.fit(train_x, train_y)

logger.info("Transforming test data")
test_y = test.loc[test['record_type'] == 1, 'G']
test_data = classes.truncate(test)
test_x = classes.get_last_observed_point(test_data)[['customer_ID'] + list('ABCDEFG')]

# Establish a baseline: the accuracy of the last observed plan on a truncated set
last_obs = classes.concatenate_plan(test_x)[['customer_ID', 'plan']]
actuals = classes.concatenate_plan(test.loc[test['record_type'] == 1])[['customer_ID', 'plan']]
score = classes.score_df(last_obs, actuals)
scores = classes.col_score_df(classes.split_plan(last_obs), classes.split_plan(actuals))

# Keep the binarized last observed G around so it can back-fill empty predictions
last_obs_g = y_encoder.transform(test_x['G'].values.reshape(-1, 1)).toarray()

test_y = y_encoder.transform(test_y.values.reshape(-1, 1)).toarray()
test_x = f_encoder.transform(test_x[list('ABCDEFG')]).toarray()
pred = est.predict(test_x)

# This gives what we want, but some rows don't have a prediction.
# Iterate over each row of the prediction; if the row doesn't have a 1 in
# any column, set it to the last observed value.
for i, row in enumerate(pred):
    if row.sum() == 0:
        pred[i] = last_obs_g[i]
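
# The row-by-row back-fill above can also be written vectorized; a small
# equivalent sketch, assuming pred and last_obs_g are aligned 0/1 arrays of
# the same shape:
empty = pred.sum(axis=1) == 0      # rows where the forest predicted no class
pred[empty] = last_obs_g[empty]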