def allfeatures_001():
    """Fit an ExtraTrees model on the combined feature set and score a hold-out.

    The training data from ``classes.get_train_data`` is split with
    ``classes.train_test_split``. A feature pipeline (last-shopping-point
    filter followed by a ``FeatureUnion`` of the column transformers below)
    is fitted on the training part, and a 100-tree ``ExtraTreesClassifier``
    is trained against the binarized A-G plan columns. The hold-out part is
    truncated with ``classes.truncate``, transformed with the fitted
    pipeline, and the decoded predictions are compared with the actual plans.

    Returns:
        tuple: ``(score, scores)`` -- the results of ``classes.score_df``
        and ``classes.col_score_df`` for the hold-out predictions. These
        values were previously computed and then thrown away.
    """
    train = classes.get_train_data()

    # Individual feature transformers.
    copy = ColumnExtractor([
        'group_size',
        'homeowner',
        'car_age',
        'age_oldest',
        'age_youngest',
        'married_couple',
    ])
    day = DayTransformer()
    state = StateTransformer()
    car_val = FillEncoderBinarizer('car_value', 'z')
    risk_factor = FillEncoderBinarizer('risk_factor', 0)
    c_prev = FillEncoderBinarizer('C_previous', 0)
    c_dur = FillEncoderBinarizer('duration_previous', -1)

    # The LastObservedPlan feature is currently left out of the union.
    features = FeatureUnion([
        ('copy', copy),
        ('day', day),
        ('state', state),
        ('car_val', car_val),
        ('risk_factor', risk_factor),
        ('c_prev', c_prev),
        ('c_dur', c_dur),
    ])

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features),
    ])

    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)
    train_y = classes.split_plan(classes.get_actual_plan(train))

    # Targets: all seven plan columns (A-G) are binarized and fitted together.
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    # Evaluate on the hold-out: features come from the truncated records,
    # the ground truth from the untruncated ones.
    actuals = classes.split_plan(classes.get_actual_plan(test))
    test_x = pipeline.transform(classes.truncate(test))
    pred = classes.concatenate_plan(
        y_encoder.inverse_transform(est.predict(test_x))
    )

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
    return score, scores
# NOTE(review): this run of statements is a fragment -- the enclosing
# definition and the setup of train_x, train_y, test, f_encoder and y_encoder
# are not visible in this chunk; confirm placement/indentation against the
# full file.
est = RandomForestClassifier(n_estimators=150, verbose=3, oob_score=True)
train_x = f_encoder.fit_transform(train_x).toarray()

# OOB score is 0.93
logger.info("Training classifier")
est.fit(train_x, train_y)

logger.info("Transforming test data")
test_y = test.loc[test['record_type'] == 1, 'G']
test_data = classes.truncate(test)
test_x = classes.get_last_observed_point(test_data)[['customer_ID'] + list('ABCDEFG')]

# Establish a baseline for what the accuracy of the last-observed on a
# truncated set is
last_obs = classes.concatenate_plan(test_x)[['customer_ID', 'plan']]
actuals = classes.concatenate_plan(test.loc[test['record_type'] == 1])[['customer_ID', 'plan']]
score = classes.score_df(last_obs, actuals)
scores = classes.col_score_df(classes.split_plan(last_obs), classes.split_plan(actuals))

# Go through a plain ndarray before reshaping into a single column:
# pandas Series.reshape was deprecated and later removed, while
# np.asarray(...).reshape yields the same (n, 1) array on every version.
test_y = y_encoder.transform(np.asarray(test_y).reshape((-1, 1))).toarray()
test_x = f_encoder.transform(test_x[list('ABCDEFG')]).toarray()
pred = est.predict(test_x)

# This gives what we want, but some rows don't have a prediction.
# Wherever a prediction row has no 1 in any column, fall back to the last
# observed value: the trailing columns of the encoded features. The slice
# width follows pred's column count, which is the same slice as the former
# hard-coded -4 whenever that assignment was shape-valid.
# NOTE(review): presumably those trailing columns are the one-hot block for
# G -- confirm against f_encoder's column order.
no_prediction = pred.sum(axis=1) == 0
pred[no_prediction] = test_x[no_prediction, -pred.shape[1]:]

# Now recode into a single column, and update the last_obs
tmp = np.tile(y_encoder.active_features_, (pred.shape[0], 1))
pred = tmp[pred == 1]