def get_interactions_features_pipeline():
    """Build the preprocessing pipeline that expands categorical features
    into higher-order (interaction) one-hot encoded features.

    Steps:
      1. Extract the configured feature columns from the raw frame.
      2. Map each categorical column to integer codes (``StringToInt``);
         converted columns get a ``_C`` suffix via DataFrameMapper.
      3. Generate higher-order (interaction) combinations of those codes.
      4. One-hot encode the resulting interaction features.

    Returns:
        Pipeline: an unfitted preprocessing pipeline.
    """
    # One (column, suffix, transformer) entry per categorical feature,
    # in the tuple format DataFrameMapper expects.
    categorical_preprocessors = [
        (feature, 'C', Pipeline([('string-to-int', StringToInt())]))
        for feature in settings.CATEGORICAL
    ]

    pipeline = Pipeline([
        ('original', FeatureColumnsExtractor(settings.FEATURES)),
        # rest_unchanged=False drops the non-categorical columns here: only
        # the converted categorical columns feed the interaction expansion.
        ('string_to_int->one_hot', DataFrameMapper(categorical_preprocessors,
                                                   return_df=True,
                                                   rest_unchanged=False)),
        ('higher-order', HighOrderFeatures()),
        ('one-hot', OneHotEncoder()),
        # ('high-correlations', HighCorrelationFilter(threshold=0.95)),  # 0.3507
    ])
    return pipeline
if __name__ == '__main__': orig_dataset = pd.read_csv(settings.TRAIN_FILE) # sample_mask = np.zeros((orig_dataset.shape[0],), dtype=np.bool_) # sample_idx = sample_without_replacement(orig_dataset.shape[0], orig_dataset.shape[0] * 1.0, random_state=42) # sample_mask[sample_idx] = True before = time.time() fcols = [col for col in orig_dataset.columns if col in settings.FEATURES] catconversion = FeatureUnion([feature_sets.CATEGORICAL_CONVERSION], n_jobs=1) dataset = pd.DataFrame(data=catconversion.fit_transform(orig_dataset), columns=fcols, index=orig_dataset.index) target = FeatureColumnsExtractor( settings.TARGET).fit_transform(orig_dataset).apply(nonlinearity) print('original dataset shape:', dataset.shape) # union = get_feature_union() # dataset = union.fit_transform(dataset, target) print('preprocessed dataset shape:', dataset.shape) print('preprocessing time: ', time.time() - before) # cv = KFold(len(target), n_folds=10, random_state=2, shuffle=False) cv = KFold(len(target), n_folds=4, random_state=2, shuffle=False) # from src.cross_validation import RepeatedKFold # cv = RepeatedKFold(len(target), n_folds=4, n_repeats=2, random_state=3) estimators_pipeline = get_estimation_pipeline()
random_state=22, loss='squared_epsilon_insensitive'))])
    return pipeline


def overall_pipeline():
    """End-to-end pipeline: feature extraction/union followed by the
    estimation stage.  The filter stage is currently disabled."""
    return Pipeline([
        ('features', get_feature_union()),
        # ('filters', get_filters()),
        ('estimators', get_estimation_pipeline())
    ])


if __name__ == '__main__':
    original_dataset = pd.read_csv(settings.TRAIN_FILE)
    # The model is trained on sqrt-transformed Hazard values.
    target = FeatureColumnsExtractor(settings.TARGET).fit_transform(
        original_dataset).apply(lambda x: np.sqrt(x))

    pipeline = overall_pipeline()
    pipeline.fit(original_dataset, target)

    original_test_set = pd.read_csv(settings.TEST_FILE)
    # NOTE(review): predictions are written out in sqrt space — they are
    # never squared back to the original Hazard scale.  Confirm this is
    # intentional (rank-based scoring would make it harmless).
    predictions = pipeline.predict(original_test_set)
    output = pd.DataFrame({
        'Id': original_test_set['Id'],
        'Hazard': predictions
    })
    output.to_csv(settings.SUBMIT_SVM_REDUCED, index=False, header=True,
                  columns=['Id', 'Hazard'])
from __future__ import division, print_function # noinspection PyUnresolvedReferences from py3compatibility import * from kaggle_tools.feature_extraction import FeatureColumnsExtractor import pandas as pd import numpy as np import settings submittion_files = [settings.SUBMIT_RIDGE_SQRT_REDUCED, settings.SUBMIT_RIDGE_LOG]#, settings.SUBMIT_RIDGE_DIRECT] weights = [0.5, 0.5]#, 0.3] dfs = [] for f in submittion_files: df = pd.read_csv(f) dfs.append(FeatureColumnsExtractor(settings.TARGET).fit_transform(df).values) submittions = np.array(dfs).T print(submittions) stacked_predictions = np.average(submittions, axis=1) print(stacked_predictions) output = pd.DataFrame({'Id': df['Id'], 'Hazard': stacked_predictions}) output.to_csv(settings.SUBMIT_FILE_STACKED, index=False, header=True, columns=['Id', 'Hazard'])
from __future__ import division from __future__ import print_function import pandas as pd import numpy as np import matplotlib.pyplot as plt from kaggle_tools.feature_extraction import FeatureColumnsExtractor import settings from src.main import get_preprocessing_pipeline, get_estimation_pipeline, get_interactions_features_pipeline dataset = pd.read_csv(settings.TRAIN_FILE) target = FeatureColumnsExtractor(settings.TARGET).fit_transform(dataset) # dataset = get_preprocessing_pipeline().fit_transform(dataset) dataset = get_interactions_features_pipeline().fit_transform(dataset) if hasattr(dataset, 'toarray'): dataset = dataset.toarray() print(dataset.shape) devs = np.std(dataset, axis=0) plt.bar(np.arange(devs.shape[0]), np.sort(devs)) plt.show()
from sklearn.base import BaseEstimator, TransformerMixin


class Identity(BaseEstimator, TransformerMixin):
    """No-op transformer: returns its input unchanged.

    Useful as a placeholder inside TransformFeatureSet / FeatureUnion when a
    feature subset should pass through untouched.
    """

    def fit(self, X, y=None):
        """Stateless — nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return X unchanged."""
        return X


# Extract the configured feature columns and convert the categorical ones
# from strings to integer codes.
CATEGORICAL_CONVERSION = ('Categorical-Conversion', Pipeline([
    ('original', FeatureColumnsExtractor(settings.FEATURES)),
    ('StringToInt', TransformFeatureSet(settings.CATEGORICAL,
                                        transformer=StringToInt())),
]))

# Pass all configured features through unchanged.
DIRECT = (
    'Direct', Pipeline([
        ('identity', TransformFeatureSet(settings.FEATURES, Identity()))
        # ('original', FeatureColumnsExtractor(settings.FEATURES)),
        # ('StringToInt', TransformFeatureSet(settings.CATEGORICAL, transformer=StringToInt())),
    ]))
from __future__ import division, print_function

import pandas as pd
import numpy as np

from kaggle_tools.feature_extraction import FeatureColumnsExtractor
from src.main import get_whole_dataset, get_feature_union
import settings

orig_dataset = pd.read_csv(settings.TRAIN_FILE)
target = FeatureColumnsExtractor(settings.TARGET).fit_transform(orig_dataset)
# NOTE(review): np.empty(()) is passed as a dummy `y` — presumably the
# feature union ignores the target during fit; confirm against get_feature_union.
dataset = get_feature_union().fit_transform(get_whole_dataset(), np.empty(()))

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import settings
from matplotlib.backends.backend_pdf import PdfPages

# One summary figure per feature column, collected into a multi-page PDF.
pp = PdfPages('summary.pdf')
for column in sorted(dataset.columns):
    # if column == settings.TARGET:
    #     continue
    # column = 'Age'
    fig = plt.figure()
    title_string = "'{0}' summary".format(column)
    # Flag categorical columns in the page title.
    if column in settings.CATEGORICAL:
        title_string += ' CATEGORICAL'
    fig.suptitle(title_string, fontsize=14)