from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer

import classes


def logistic_001():
    # Univariate screen: cross-validated AUC of a logistic regression
    # fit on each feature individually against the binarized target
    X, y = classes.get_train_data()
    y = y > 0
    remove_object = classes.RemoveObjectColumns()
    X = remove_object.fit_transform(X)
    imputer = Imputer()
    X = imputer.fit_transform(X)
    scores = []
    for i in range(X.shape[1]):
        clf = LogisticRegression()
        # Slice as a 2D single-column array; 'roc_auc' is the valid
        # scorer name ('roc' is not a recognized scoring string)
        s = cross_val_score(clf, X[:, i:i + 1], y, scoring='roc_auc')
        scores.append((i, s))
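# RemoveObjectColumns is project code whose definition isn't shown here. A
# minimal sketch of what it plausibly does -- dropping object-dtype columns so
# the Imputer only sees numeric data. Everything below is an assumption based
# on how it is used above, not the project's actual implementation.
from sklearn.base import BaseEstimator, TransformerMixin


class RemoveObjectColumns(BaseEstimator, TransformerMixin):
    """Drop all object-dtype columns from a DataFrame (sketch, assumed behavior)."""

    def fit(self, X, y=None):
        # Remember the non-object columns seen during fit
        self.keep_cols_ = X.columns[X.dtypes != object]
        return self

    def transform(self, X):
        return X[self.keep_cols_]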
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import FeatureUnion, Pipeline

import classes

# ColumnExtractor, DayTransformer, StateTransformer, FillEncoderBinarizer,
# LastObservedPlan and LastShoppingPointSelector are the project's custom
# transformers, assumed to be defined in this module or imported alongside it


def allfeatures_001():
    train = classes.get_train_data()

    # One transformer per feature group, combined with a FeatureUnion
    copy = ColumnExtractor(['group_size', 'homeowner', 'car_age',
                            'age_oldest', 'age_youngest', 'married_couple'])
    day = DayTransformer()
    state = StateTransformer()
    car_val = FillEncoderBinarizer('car_value', 'z')
    risk_factor = FillEncoderBinarizer('risk_factor', 0)
    c_prev = FillEncoderBinarizer('C_previous', 0)
    c_dur = FillEncoderBinarizer('duration_previous', -1)
    last_plan = LastObservedPlan()

    features = FeatureUnion([
        ('copy', copy),
        ('day', day),
        ('state', state),
        ('car_val', car_val),
        ('risk_factor', risk_factor),
        ('c_prev', c_prev),
        ('c_dur', c_dur),
        # ('last_plan', last_plan)
    ])

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features)
    ])

    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)
    train_y = classes.split_plan(classes.get_actual_plan(train))
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])  # Just on one col

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    actuals = classes.split_plan(classes.get_actual_plan(test))
    test_x = classes.truncate(test)
    test_x = pipeline.transform(test_x)
    pred = classes.concatenate_plan(
        y_encoder.inverse_transform(est.predict(test_x)))
    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
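# FillEncoderBinarizer is one of the project's custom transformers. Judging
# only by its constructor arguments above (a column name plus a fill value for
# missing entries), a plausible sketch is: fill NAs with the sentinel, then
# one-hot encode the column. This is an assumption, not the project's code.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer


class FillEncoderBinarizer(BaseEstimator, TransformerMixin):
    """Fill NAs in one column with a sentinel, then one-hot encode it (sketch)."""

    def __init__(self, column, fill_value):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        self.binarizer_ = LabelBinarizer()
        self.binarizer_.fit(X[self.column].fillna(self.fill_value))
        return self

    def transform(self, X):
        return self.binarizer_.transform(X[self.column].fillna(self.fill_value))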
from __future__ import division

from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

import classes
from constants import *

train_x, train_y = classes.get_train_data()

# train_x is of type:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 105471 entries, 1 to 105471
# Columns: 769 entries, f1 to f778
# dtypes: float64(652), int64(97), object(20)
# The int64s are probably good candidates for categorical variables.

# The object data types turn out to be really long numbers.
# Maybe account numbers or something?
object_types_mask = train_x.dtypes == np.object
object_types = train_x.loc[:, object_types_mask]
object_types.iloc[0:5]

# Possible leakage via the object-type numbers?
# Actually, from the description we can see that some of these columns
# may be useful (the unique count is not very close to the total count)
desc = object_types.describe()

# IDEA - filter out columns where unique ~ count; see the sketch below

# Find out how many unique ints there are in each int64 column
int_types_mask = train_x.dtypes == np.int64
int_types = train_x.loc[:, int_types_mask]
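# One way to act on the IDEA above: compute the ratio of unique values to row
# count per object column and keep only columns well below 1.0, since columns
# where unique ~ count behave like row IDs and carry no signal. The 0.9
# threshold is an arbitrary choice for illustration.
uniqueness = object_types.apply(lambda col: col.nunique() / len(col))
useful_object_cols = uniqueness[uniqueness < 0.9].index
filtered_object_types = object_types[useful_object_cols]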
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

import classes

X, y = classes.get_train_data()
desc = X.describe()
desc.iloc[:, 0:5]

# Per-column summaries: unique value counts and missing value counts.
# Note that sum(x == np.nan) always yields 0 because NaN != NaN;
# pd.isnull is the correct way to count missing values.
n_unique = [len(x.unique()) for n, x in X.iteritems()]
n_na = [sum(pd.isnull(x)) for n, x in X.iteritems()]

X, y = classes.get_train_data()
remove_cols = RemoveNoVarianceColumn()
remove_cols.fit(X)
X = remove_cols.transform(X)

test_x = classes.get_test_data()
transformed_test_x = remove_cols.transform(test_x)
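# RemoveNoVarianceColumn is used above but its definition isn't shown; the
# BaseEstimator/TransformerMixin imports suggest it was defined in this file.
# A minimal sketch of what such a transformer could look like -- dropping
# columns with a single unique value -- written under that assumption.


class RemoveNoVarianceColumn(BaseEstimator, TransformerMixin):
    """Drop columns that contain only one unique value (sketch, assumed behavior)."""

    def fit(self, X, y=None):
        # A column whose non-null values are all identical carries no signal
        self.keep_cols_ = [n for n, col in X.iteritems()
                           if col.dropna().nunique() > 1]
        return self

    def transform(self, X):
        return X[self.keep_cols_]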
""" Last quoted plan benchmark """ import classes from classes import logger train = classes.get_train_data() actuals = classes.get_actual_plan(train) scores = [] # Score seems to be a bit high on training, about .547-.548 # Leaderboard score is 0.53793, so seems like 0.01 difference, which is pretty substantial in this competition for n in range(5): truncated = classes.truncate(train) prediction = classes.get_last_observed_plan(truncated) score = classes.score_df(prediction, actuals) scores.append(score) logger.info("Run {}, score: {}".format(n+1, score)) test = classes.get_test_data() pred = classes.get_last_observed_plan(test) classes.make_submission(pred, 'benchmark_001.csv')