class Transformer(object):
    """
    Takes a dataframe and transforms it into a numpy-array-compatible format.
    """

    def __init__(self, config):
        self.__config = config
        self.__mapper = None
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test and
        returns the train and test sets as numpy arrays. If the problem is
        supervised, the target column is the last one of the returned arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(
            dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        model = self.__config.get_data_model()
        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)
        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks the target column out of original_data and appends it as a column
        to transformed_data. Both arguments are expected to be np.arrays.
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()
        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))
        target = target[..., None]
        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        return self.__mapper.transform(dataframe)
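# A minimal, self-contained sketch of the pattern the Transformer class above wraps:
# fit a DataFrameMapper on the training split, reuse it on the test split, and append
# the (label-encoded) target as the last column. The toy column names and the 3/1 row
# split are illustrative assumptions, not part of the original class.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({"age": [22.0, 35.0, 58.0, 41.0],
                   "income": [30000.0, 52000.0, 61000.0, 45000.0],
                   "label": ["no", "yes", "yes", "no"]})
train, test = df.iloc[:3], df.iloc[3:]

mapper = DataFrameMapper([(["age"], StandardScaler()),
                          (["income"], StandardScaler())])
train_X = mapper.fit_transform(train)   # fit only on the training rows
test_X = mapper.transform(test)         # reuse the fitted mapper on the test rows

encoder = LabelEncoder().fit(train["label"])
train_arr = np.hstack((train_X, encoder.transform(train["label"])[..., None]))
test_arr = np.hstack((test_X, encoder.transform(test["label"])[..., None]))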
def transform_features(self):
    totransform = []
    for index, item in enumerate(self.feat_head):
        field = item[0]
        func_name = item[1]
        transform = item[2]
        is_enable = item[3]
        if is_enable:
            if field not in self.stumble_data.get_features():
                print('field not in feature..generating: ' + field)
                func_name(field)
            totransform.append((field, transform))
    if len(totransform):
        mapper = DataFrameMapper(totransform)
        mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_train = mapper.transform(
            self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_test = mapper.transform(
            self.stumble_data.all_pd[self.stumble_data.len_train:])
        for index, item in enumerate(self.feat_head):
            field = item[0]
            is_enable = item[3]
            if is_enable and field in self.stumble_data.get_features():
                del self.stumble_data.all_pd[field]
        X_train = X_transformed_train
        X_test = X_transformed_test
        y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
        # print('Dumping train in SVMLight.')
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file)
        # print('Dumping test in SVMLight.')
        # dump_svmlight_file(X_test, pred, output_test_libsvm_file)
    else:
        X_train = self.stumble_data.all_pd[:self.stumble_data.len_train]
        X_test = self.stumble_data.all_pd[self.stumble_data.len_train:]
        y_train = X_train['label']
        X_train = X_train.values
        X_test = X_test.values
    return X_train, y_train, X_test
def prepare_data(df_train, df_test, name):
    """
    Define the input and output sets formatted for use by the neural network model.
    # Arguments
        df_train: training set with all input variables, survival time and censoring status
        df_test: test set with all input variables, survival time and censoring status
        name: name of the model (CoxCC, CoxTime or DeepHit)
    # Returns
        x_train: input variables for the training set
        y_train: output variables for the training set
        x_test: input variables for the test set
        duration_test: survival time for the test set
        event_test: censoring indicator for the test set
        labtrans: output variables transformed for specific models (DeepHit and CoxTime)
    """
    col_list = list(df_train.columns)
    cols_standardize = [e for e in col_list if e not in ['yy', 'status']]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize)
    x_train = x_mapper.fit_transform(df_train).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    get_target = lambda df: (df['yy'].values, df['status'].values)
    if name == "DeepHit":
        num_durations = 10
        labtrans = DeepHitSingle.label_transform(num_durations)
        y_train = labtrans.fit_transform(*get_target(df_train))
    elif name == "CoxTime":
        labtrans = CoxTime.label_transform()
        y_train = labtrans.fit_transform(*get_target(df_train))
    else:
        labtrans = ""
        y_train = get_target(df_train)
    duration_test, event_test = get_target(df_test)
    return x_train, y_train, x_test, duration_test, event_test, labtrans
def scale_vars(df, mapper): warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning) if mapper is None: map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])] mapper = DataFrameMapper(map_f).fit(df) df[mapper.transformed_names_] = mapper.transform(df) return mapper
def train_fn(args): print("loading data") train_df = pd.read_csv(args.train_data + "/train.csv", engine='python') test_df = pd.read_csv(args.test_data + "/test.csv", engine='python') TARGET = 'SeriousDlqin2yrs' X_train = train_df.drop(TARGET, axis=1) y_train = train_df[TARGET] X_test = test_df.drop(TARGET, axis=1) y_test = test_df[TARGET] print("Imputing missing values") transformer = DataFrameMapper([ (['MonthlyIncome'], DFImputer(strategy="constant", fill_value=-1)), (['age'], DFImputer(strategy="median")), (['NumberOfDependents'], DFImputer(strategy="median")), (['DebtRatio'], DFImputer(strategy="median")), (['RevolvingUtilizationOfUnsecuredLines' ], DFImputer(strategy="median")), (['NumberRealEstateLoansOrLines'], DFImputer(strategy="median")), (['NumberOfOpenCreditLinesAndLoans'], DFImputer(strategy="median")), (['NumberOfTime30-59DaysPastDueNotWorse' ], DFImputer(strategy="median")), (['NumberOfTime60-89DaysPastDueNotWorse' ], DFImputer(strategy="median")), (['NumberOfTimes90DaysLate'], DFImputer(strategy="median")), ], input_df=True, df_out=True) transformer.fit(X_train) X_train = transformer.transform(X_train) X_test = transformer.transform(X_test) print("Building model...") model = RandomForestClassifier(n_estimators=50, max_depth=6, max_leaf_nodes=30) model.fit(X_train, y_train) explainer = shap.TreeExplainer(model) print("Saving artifacts...") model_dir = Path(args.model_dir) model_dir.mkdir(exist_ok=True, parents=True) joblib.dump(transformer, open(str(model_dir / "transformer.joblib"), "wb")) joblib.dump(model, open(str(model_dir / "model.joblib"), "wb")) joblib.dump(explainer, open(str(model_dir / "explainer.joblib"), "wb"))
def scale_vars(df, mapper):
    # warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    # subtract the empirical mean 𝜇 from the data and divide by the standard deviation 𝛿
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
def execute(self, data, transforms=None, target=None, mapper=None, drop=None, default=None, **kwargs): try: if not data.empty: features, labels = data, None if mapper is None: mapping = [] if transforms: mapping = [(feature, transform_method(**kwargs)) for feature, transform_method in transforms] mapper = DataFrameMapper(mapping, df_out=True, default=default) if target is not None: features, labels = data.drop(target, axis=1), data[target] if labels is None: features = mapper.fit_transform(features.copy()) else: features = mapper.fit_transform( features.copy(), labels.copy()) else: features = mapper.transform(features.copy()) if drop: features.drop( [col for col in drop if col in features.columns], axis=1, inplace=True) data = features if labels is not None: data[target] = labels else: raise AttributeError("No data provided") except Exception: print(traceback.format_exc()) logging.error(traceback.format_exc()) return data, mapper
def test_exception_column_context_transform(simple_dataframe): """ If an exception is raised when transforming a column, the exception includes the name of the column being transformed """ class FailingTransformer(object): def fit(self, X): pass def transform(self, X): raise Exception('Some exception') df = simple_dataframe mapper = DataFrameMapper([('a', FailingTransformer())]) mapper.fit(df) with pytest.raises(Exception, match='a: Some exception'): mapper.transform(df)
def transform_cat_to_cont(df, cat_features, cont_features): feature_defs = [] for col_name in cat_features: feature_defs.append((col_name, MyLabelBinarizer())) for col_name in cont_features: feature_defs.append((col_name, None)) mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True) mapper.fit(df) return mapper.transform(df)
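# A minimal, runnable sketch of the same idea as transform_cat_to_cont above, using
# sklearn's LabelBinarizer as a stand-in for the project-specific MyLabelBinarizer
# (which is not defined in this snippet). The toy frame is illustrative only.
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({"color": ["red", "blue", "red", "green"],
                   "size": [1.0, 2.5, 3.0, 0.5]})
mapper = DataFrameMapper([("color", LabelBinarizer()), ("size", None)],
                         input_df=True, df_out=True)
encoded = mapper.fit_transform(df)
# 'color' is expanded into one indicator column per category; 'size' passes through.
print(encoded.head())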
def scale_vars(df, mapper): """ Standardize numerical features by removing the mean and scaling to unit variance. """ warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning) if mapper is None: map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])] mapper = DataFrameMapper(map_f).fit(df) df[mapper.transformed_names_] = mapper.transform(df) return mapper
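# Hedged usage sketch for the scale_vars helper defined above: fit the mapper on the
# training frame, then reuse it so the test frame is scaled with the training
# statistics. The toy frames are illustrative only.
import pandas as pd

train_df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
test_df = pd.DataFrame({"a": [4.0, 5.0], "b": [40.0, 50.0]})

mapper = scale_vars(train_df, None)   # fits one StandardScaler per numeric column, scales in place
scale_vars(test_df, mapper)           # reuses the fitted mapper on the test frame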
def test_numerical_transformer_serialization(simple_dataset):
    """
    Test that the transformer can be serialized and reloaded.
    """
    transformer = DataFrameMapper([('feat1', NumericalTransformer('log'))])
    df = simple_dataset
    transformer.fit(df)
    f = tempfile.NamedTemporaryFile(delete=True)
    joblib.dump(transformer, f.name)
    transformer2 = joblib.load(f.name)
    assert np.array_equal(transformer.transform(df), transformer2.transform(df))
    f.close()
def pre_processing(self): self.__numeric_header = [i for i in self.__train_feature.columns if i not in self.__categorical_header] self.__train_categorical = self.__train_feature[self.__categorical_header] self.__train_numeric = self.__train_feature[self.__numeric_header] self.__test_categorical = self.__test_feature[self.__categorical_header] self.__test_numeric = self.__test_feature[self.__numeric_header] self.__train_categorical = self.__train_categorical.astype(str) self.__test_categorical = self.__test_categorical.astype(str) self.__train_categorical = self.__train_categorical.fillna("missing") self.__test_categorical = self.__test_categorical.fillna("missing") mapper = DataFrameMapper([(i, LabelEncoder()) for i in self.__train_categorical.columns]) mapper.fit(self.__train_categorical) self.__train_categorical = pd.DataFrame(mapper.transform(self.__train_categorical), columns=self.__train_categorical.columns) self.__test_categorical = pd.DataFrame(mapper.transform(self.__test_categorical), columns=self.__test_categorical.columns) self.__train_numeric = self.__train_numeric.fillna(-999) self.__test_numeric = self.__test_numeric.fillna(-999) self.__train_feature = pd.concat([self.__train_numeric, self.__train_categorical], axis=1) self.__test_feature = pd.concat([self.__test_numeric, self.__test_categorical], axis=1) self.__train_feature = self.__train_feature.values self.__test_feature = self.__test_feature.values
def test_mapper(self): data = lib.load_titanic() transformation_list = [(['name'], [EmbeddingVectorizer(max_sequence_length=12)])] mapper = DataFrameMapper(transformation_list, df_out=True) mapper.fit(data) data_transformed = mapper.transform(data) assert_array_equal([2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1], data_transformed.values[0, :])
class Preprocessor: mapper: DataFrameMapper def __init__(self): self.mapper = DataFrameMapper([(encoding_fields, [ SimpleImputer(strategy="most_frequent"), preprocessing.OrdinalEncoder() ]), (scale_fields, preprocessing.StandardScaler())]) def train(self, x: pd.DataFrame): self.mapper.fit(x) def transform(self, x: pd.DataFrame): return self.mapper.transform(x)
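# Hedged usage sketch for the Preprocessor class above. It reads the module-level lists
# encoding_fields and scale_fields; the column names and values below are hypothetical
# placeholders, assumed to be defined next to the class.
import pandas as pd

encoding_fields = ["city", "product"]   # categorical columns: impute mode, then ordinal-encode
scale_fields = ["age", "income"]        # numeric columns: standard-scale

frame = pd.DataFrame({"city": ["NY", "LA", "SF", "NY"],
                      "product": ["a", "b", "a", "b"],
                      "age": [25, 32, 47, 51],
                      "income": [40000, 52000, 61000, 58000]})

pre = Preprocessor()
pre.train(frame)                 # fits the imputer/encoder/scaler on the training frame
features = pre.transform(frame)  # numpy array: ordinal-encoded categoricals + scaled numerics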
class imbalanceOversampleProcess:
    '''
    numericFeature: list of numeric feature names
    OversampleParamDict: dict {key: value}; key must be one of ['RandomSample', 'Smote',
        'ADASYN', 'SMOTEENN', 'SMOTETomek'] and value is the parameter dict for that sampler
    estimator: the model to be trained, with its parameters already initialized
    '''

    def __init__(self, numericFeature, OversampleParamDict, estimator):
        self.numericFeature = numericFeature
        self.OversampleParamDict = OversampleParamDict
        self.estimator = estimator
        self.dataTranformer = DataFrameMapper([(self.numericFeature,
                                                [ContinuousDomain(), SimpleImputer(strategy='mean'), StandardScaler()])])

    def _generateModel(self, key, paramDict):
        if key == 'RandomSample':
            self.model = RandomOverSampler(**paramDict)
        elif key == 'Smote':
            self.model = SMOTE(**paramDict)
        elif key == 'ADASYN':
            self.model = ADASYN(**paramDict)
        elif key == 'SMOTEENN':
            self.model = SMOTEENN(**paramDict)
        elif key == 'SMOTETomek':
            self.model = SMOTETomek(**paramDict)
        else:
            raise ValueError('key must be one of RandomSample, Smote, ADASYN, SMOTEENN, SMOTETomek!')

    def _fitSample(self, X, y):
        XTransform = self.dataTranformer.fit_transform(X)
        assert len(self.OversampleParamDict) == 1, \
            'Only a single sampler is supported; the dict must contain exactly one parameter set!'
        for key, value in self.OversampleParamDict.items():
            self._generateModel(key, value)
            X_train, y_train = self.model.fit_sample(XTransform, y)
        self.X_train_sample = pd.DataFrame(data=X_train, columns=self.numericFeature)
        self.y_train_sample = y_train

    def fit(self, X, y):
        self._fitSample(X, y)
        self.estimator.fit(self.X_train_sample, self.y_train_sample)

    def predict_proba(self, X):
        XTransformTest = self.dataTranformer.transform(X)
        X_test = pd.DataFrame(data=XTransformTest, columns=self.numericFeature)
        self.predictResult = self.estimator.predict_proba(X_test)
        return self.predictResult
def run_pipeline( data, onehot_cols, ordinal_cols, batch_size, validate=True, ): X = data.drop(columns=["fraction_recovered"]) y = data["fraction_recovered"] X_train, X_valid, y_train, y_valid = ( train_test_split(X, y, test_size=0.2, random_state=0) if validate else (X, None, y, None) ) transformer = DataFrameMapper( [ (onehot_cols, OneHotEncoder(drop="if_binary")), ( list(ordinal_cols.keys()), OrdinalEncoder(categories=list(ordinal_cols.values())), ), ], default=StandardScaler(), ) X_train = transformer.fit_transform(X_train) X_valid = transformer.transform(X_valid) if validate else None input_nodes = X_train.shape[1] output_nodes = 1 model = Sequential() model.add(Input((input_nodes,))) model.add(Dense(64, activation="relu")) model.add(Dropout(0.3, seed=0)) model.add(Dense(32, activation="relu")) model.add(Dropout(0.3, seed=1)) model.add(Dense(16, activation="relu")) model.add(Dropout(0.3, seed=2)) model.add(Dense(output_nodes)) model.compile(optimizer="adam", loss="mean_squared_error") history = model.fit( X_train, y_train, batch_size=batch_size, epochs=100, validation_data=(X_valid, y_valid) if validate else None, verbose=1, ) return history.history, model, transformer
def scale_vars(df: pd.DataFrame, mapper: DataFrameMapper = None) -> skp.DataFrameMapper: """ Returns a mapper to scale variables. """ warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning) if mapper is None: # apply standardscaler to columns map_f = [( [column], StandardScaler() ) for column in df.columns if is_numeric_dtype(df[column])] mapper = DataFrameMapper(map_f).fit(df) df[mapper.transformed_names_] = mapper.transform(df) return mapper
def make_predictions(PATH_TO_DATA, *arg, **kwargs): try: df = pd.read_csv(PATH_TO_DATA, usecols=FEATURES) except InputError: print('Not valid data for predictions') df = feature_engineering(df) for n_model in MODEL_NAMES: mapper = DataFrameMapper(map_features(df.columns), df_out=True) preds = MODELS[n_model].predict(mapper.transform(df)) df['Preds ' + n_model] = np.exp(preds) return df
def normed_data(df_train, df_test):
    """
    Standardize the explanatory variables used as input for Cox-MLP (CC), CoxTime and DeepHit.
    # Arguments
        df_train: Training set of simulated data with 20 input variables, survival status and survival time.
        df_test: Test set of simulated data with 20 input variables, survival status and survival time.
    # Returns
        x_train: array with the normalized explanatory variables of the training set.
        x_test: array with the normalized explanatory variables of the test set.
    """
    col_list = list(df_train.columns)
    cols_standardize = [e for e in col_list if e not in ['yy', 'status']]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize)
    x_train = x_mapper.fit_transform(df_train).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    return x_train, x_test
def _scale(df, mapper=None):
    '''
    =============== ====================================================================
    **Argument**    **Description**
    --------------- --------------------------------------------------------------------
    df              DataFrame to be scaled (scaled in place).
    mapper          Fitted mapper holding the scaling parameters; a new one is created
                    and fitted when None is passed.
    =============== ====================================================================
    :return: the fitted mapper (newly created if None was passed)
    '''
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
def transform(observations): """ - Convert Sex to boolean male indicator - Create train / test split - Create SKLearn-Pandas mapper - Train SKLearn - Transform train and test data :param observations: :type observations: pandas.DataFrame :return: """ logging.info('Begin transform') # Convert Sex field into boolean male indicator observations['male'] = observations['Sex'] == 'male' logging.info('Converted Sex to binary class. Value counts: {}'.format( observations['male'].value_counts())) # Split into train / test split mask = numpy.random.rand(len(observations)) < 0.8 observations_train = observations[mask] observations_test = observations[~mask] logging.info('Creating dataframe mapper') mapper = DataFrameMapper([(['Age'], [Imputer(), StandardScaler()]), (['SibSp'], [Imputer(), StandardScaler()]), (['Parch'], [Imputer(), StandardScaler()]), (['male'], [Imputer(strategy='most_frequent')])]) logging.info('Fitting and transforming training data set') x_train = mapper.fit_transform(observations_train) y_train = observations_train['Survived'].values logging.info('Transforming response data set') x_test = mapper.transform(observations_test) y_test = observations_test['Survived'].values # Archive & return lib.archive_dataset_schemas('transform', locals(), globals()) logging.info('End transform') return x_train, x_test, y_train, y_test, mapper
def test_fit_with_required_y_arg(complex_dataframe): """ Transformers with a required y argument in the fit method are handled and perform correctly """ df = complex_dataframe mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))]) # fit, doesn't fail ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target']) # fit_transform ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target']) assert_array_equal(ft_arr, df[['feat1']].values) # transform t_arr = mapper.transform(df[['feat1', 'feat2']]) assert_array_equal(t_arr, df[['feat1']].values)
def prepare_pseudobs_simu(df_train, y_train, df_test, name):
    """
    Prepare the data for training. The input data is formatted so that one line
    corresponds to one subject at a particular time point.
    # Arguments
        df_train: the entire dataset (input + survival times + event status)
        y_train: the pseudo-values computed according to the method chosen.
        df_test: the entire dataset (input + survival times + event status)
    # Returns
        x_train_all: input data with all input variables + time variable, where one line represents one subject at one time point.
        y_train_all: pseudo-values computed according to the method chosen.
        x_test_all: input data with all input variables + time variable, where one line represents one subject at one time point.
        y_test_all: survival time and event status.
        n_picktime: the number of time points at which the pseudo-observations are computed.
    """
    y_test_all = df_test[['yy', 'status']]
    n_picktime = int(y_train[['s']].apply(pd.Series.nunique))
    x_test = df_test.drop(['yy', 'status'], axis=1)
    x_test_all = pd.concat([x_test] * n_picktime)
    time_test = pd.DataFrame(np.repeat(np.unique(y_train[['s']]), len(x_test)))
    x_test_all.reset_index(inplace=True, drop=True)
    x_test_all = pd.concat([x_test_all, time_test], axis=1)
    if name != "pseudo_discrete":
        x_train = df_train.drop(['yy', 'status'], axis=1)
        x_train_all = pd.concat([x_train] * n_picktime)
        x_train_all.reset_index(inplace=True, drop=True)
        x_train_all = pd.concat([x_train_all, y_train[['s']]], axis=1)
        y_train_all = y_train[['pseudost']]
    else:
        x_train = df_train.drop(['yy', 'status'], axis=1)
        x_train['id'] = np.arange(len(x_train)) + 1
        x_train = x_train.merge(y_train, left_on='id', right_on='id')
        x_train_all = x_train.drop(['id', 'pseudost'], axis=1)
        y_train_all = x_train['pseudost']
    # Data normalization
    col_list = list(x_train_all.columns)
    x_test_all.columns = col_list
    cols_standardize = [e for e in col_list]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize, df_out=True)
    x_train_all = x_mapper.fit_transform(x_train_all).astype('float32')
    x_test_all = x_mapper.transform(x_test_all).astype('float32')
    return (x_train_all, y_train_all, x_test_all, y_test_all, n_picktime)
def data_simple_imputer(data_train, numeric_feature, category_feature,
                        numeric_strategy='mean', category_strategy='most_frequent',
                        data_test=None):
    '''
    Simple missing-value imputation with a DataFrameMapper.
    Categorical and numeric variables are listed explicitly, each with its own fill strategy.
    data_train: training set to transform
    numeric_feature: numeric variables to process
    category_feature: categorical variables to process
    numeric_strategy: fill strategy for numeric variables (default: mean)
    category_strategy: fill strategy for categorical variables (default: most frequent)
    data_test: optional test set; if omitted, no test transformation is done
    return: X_train_imputed  the imputed training data
            miss_transfer    the fitted DataFrameMapper
            X_test_imputed   the imputed test data (only returned when data_test is given)
    '''
    print('Starting missing-value imputation'.center(50, '='))
    # Report the feature counts
    print('Number of categorical features:', len(category_feature))
    print('Number of numeric features:', len(numeric_feature))
    # Fill numeric and categorical columns with the requested strategies
    miss_transfer = DataFrameMapper([
        (numeric_feature, [SimpleImputer(strategy=numeric_strategy)]),
        (category_feature, [SimpleImputer(strategy=category_strategy)])
    ])
    # Fit and transform the training data
    X_train_imputed = miss_transfer.fit_transform(data_train[numeric_feature + category_feature])
    X_train_imputed = pd.DataFrame(X_train_imputed, columns=numeric_feature + category_feature)
    print('train mapper done:', X_train_imputed.shape)
    # If test data is provided, transform it as well and return it
    if data_test is not None:
        X_test_imputed = miss_transfer.transform(data_test[numeric_feature + category_feature])
        X_test_imputed = pd.DataFrame(X_test_imputed, columns=numeric_feature + category_feature)
        return X_train_imputed, miss_transfer, X_test_imputed
    return X_train_imputed, miss_transfer
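# Hedged usage sketch for data_simple_imputer above on a toy frame; the column names
# and values are illustrative only.
import numpy as np
import pandas as pd

train = pd.DataFrame({"age": [25.0, np.nan, 40.0, 33.0],
                      "income": [50000.0, 62000.0, np.nan, 45000.0],
                      "city": ["NY", "LA", np.nan, "NY"]})
test = pd.DataFrame({"age": [np.nan, 29.0],
                     "income": [48000.0, np.nan],
                     "city": [np.nan, "LA"]})

X_train_imp, fitted_mapper, X_test_imp = data_simple_imputer(
    train, numeric_feature=["age", "income"], category_feature=["city"],
    data_test=test)
# Missing numerics are filled with the training means, missing categories with the training mode.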
def readDataset(self): train_df = pd.read_csv(self.trainFile) test_df = pd.read_csv(self.testFile) #print(train_df.columns) #print(train_df.head()) #print(test_df.columns) self.test_index = test_df.Id train_df = train_df.astype(float) test_df = test_df.astype(float) #print(train_df.iloc[0].values) mapper = DataFrameMapper([ ([ 'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points' ], MinMaxScaler()), ([ 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40' ], None) ]) self.X_train = mapper.fit_transform(train_df) # print(X_train[0:2,:]) self.y_train = train_df.Cover_Type.values # print(y_train[0:10]) self.X_test = mapper.transform(test_df)
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch): """ If input_df is True, the subsequent transformers get passed pandas objects instead of numpy arrays (given the previous transformers output pandas objects as well) """ df = simple_dataframe monkeypatch.setattr(MockTClassifier, 'fit', Mock()) monkeypatch.setattr(MockTClassifier, 'transform', Mock(return_value=pd.Series([1, 2, 3]))) mapper = DataFrameMapper( [('a', [MockXTransformer(), MockTClassifier()])], input_df=True) mapper.fit(df) out = mapper.transform(df) args, _ = MockTClassifier().fit.call_args assert isinstance(args[0], pd.Series) assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
class scale_vars(TBStep): def __init__(self, features=None): warnings.filterwarnings( 'ignore', category=sklearn.exceptions.DataConversionWarning) self.features = features def __repr__(self): return 'scale features' def fit(self, df): if self.features is None: self.features = df.columns self.features = [i for i in self.features if is_numeric_dtype(df[i])] map_f = [([n], StandardScaler()) for n in df[self.features].columns] self.mapper = DataFrameMapper(map_f).fit( df[self.features].dropna(axis=0)) def transform(self, df): df = df.copy() df[self.mapper.transformed_names_] = self.mapper.transform( df[self.features]) return df
def test_integration(none_value): df = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'], 'num': [1, 2, 3, 4, 5]}) mapper = DataFrameMapper([ ('cat', CategoricalImputer()), ('num', None) ], df_out=True).fit(df) df_t = mapper.transform(df) assert pd.notnull(df_t).all().all() val_idx = pd.notnull(df['cat']) nan_idx = ~val_idx assert (df['num'] == df_t['num']).all() assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all() assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
class MyMapper(): def __init__(self): pass def fit(self, X, y=None): self.ncols = [] self.scols = [] # print("mapping features") for col in X: if X[col].dtype == float: # print("numerical col: %s" % col) self.ncols.append([col]) else: # print("categorical col: %s" % col) self.scols.append([col]) nfeats = gen_features(columns=self.ncols, classes=[{ 'class': sklearn.preprocessing.MinMaxScaler, }]) sfeats = gen_features(columns=self.scols, classes=[{ 'class': LabelBinarizer2 }]) self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True) self.mapper.fit(X) # print("features mapped") return self def transform(self, X, y=None): X = X.copy() X = self.mapper.transform(X) return X def fit_transform(self, X, y=None): self.fit(X) return self.transform(X)
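# A short, hedged sketch of the gen_features helper that MyMapper relies on.
# LabelBinarizer2 in the class above is a project-specific wrapper that is not shown;
# sklearn's LabelBinarizer is used here as a stand-in on a toy frame.
import pandas as pd
import sklearn.preprocessing
from sklearn_pandas import DataFrameMapper, gen_features

df = pd.DataFrame({"height": [1.6, 1.7, 1.8, 1.9],
                   "team": ["red", "blue", "blue", "red"]})

numeric = gen_features(columns=[["height"]],
                       classes=[{"class": sklearn.preprocessing.MinMaxScaler}])
categorical = gen_features(columns=["team"],
                           classes=[{"class": sklearn.preprocessing.LabelBinarizer}])
mapper = DataFrameMapper(numeric + categorical, df_out=True)
out = mapper.fit_transform(df)   # scaled 'height' plus a 0/1 indicator for 'team'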
df_train = df_train.drop(df_test.index) df_val = df_train.sample(frac=0.2) df_train = df_train.drop(df_val.index) standardize = [([col], StandardScaler()) for col in cols_standardize] leave = [(col, None) for col in cols_leave] categorical = [(col, OrderedCategoricalLong()) for col in cols_categorical] x_mapper_float = DataFrameMapper(standardize + leave) x_mapper_long = DataFrameMapper(categorical) x_fit_transform = lambda df: tt.tuplefy( x_mapper_float.fit_transform(df).astype(np.float32), x_mapper_long.fit_transform(df)) x_transform = lambda df: tt.tuplefy( x_mapper_float.transform(df).astype(np.float32), x_mapper_long.transform(df)) x_train = x_fit_transform(df_train) x_val = x_transform(df_val) x_test = x_transform(df_test) num_embeddings = x_train[1].max(0) + 1 embedding_dims = num_embeddings // 2 get_target = lambda df: (df['duration'].values, df['event'].values) y_train = get_target(df_train) y_val = get_target(df_val) durations_test, events_test = get_target(df_test) val = x_val, y_val # Model preparation =============================================================
class Dataprocess(object): datafile = "data.csv" def __init__(self, datadir="/Users/shintaro/work/kaggle-kobe/data/"): self.datadir = datadir def read(self): self.df_orig = pd.read_csv(self.datadir + self.datafile) self.df = self.df_orig.copy() def process(self): self.read() self.preproc() self.set_mapper() self.split_df() train_X = self.vec_X(self.train_df) train_y = self.vec_y(self.train_df) test_X = self.mapper_X.transform(self.test_df) return train_X, train_y, test_X def preproc(self): self.df["time_remaining"] = self.df["minutes_remaining"] * 60 + self.df["seconds_remaining"] self.df['last_5_sec'] = self.df['time_remaining'] < 5 self.df['latter_half'] = self.df['time_remaining'] < 360 self.df['first_period'] = self.df['period'] == 1 self.df['latter_period'] = self.df['period'] > 2 self.df['last_period'] = self.df['period'] == 4 self.df['last_quarter'] = self.df['time_remaining'] < 180 threshold = 3 anomaly = 14 self.df['last_moment'] = self.df.apply(lambda row: row['time_remaining'] < threshold or row['time_remaining'] == anomaly, axis=1) self.df['away'] = self.df.matchup.str.contains('@') self.df['secondsFromStart'] = 60 * (11 - self.df['minutes_remaining']) + (60 - self.df['seconds_remaining']) self.df['secondsFromGameStart'] = (self.df['period'] <= 4).astype(int) * (self.df['period'] - 1) * 12 * 60 + (self.df['period'] > 4).astype(int) * ((self.df['period'] - 4) * 5 * 60 + 3 * 12 * 60) + self.df['secondsFromStart'] numGaussians = 13 gaussianMixtureModel = mixture.GMM(n_components=numGaussians, covariance_type='full', params='wmc', init_params='wmc', random_state=1, n_init=3, verbose=0) gaussianMixtureModel.fit(self.df.ix[:,['loc_x','loc_y']]) self.df['shotLocationCluster'] = gaussianMixtureModel.predict(self.df.ix[:,['loc_x','loc_y']]) self.df['homeGame'] = self.df['matchup'].apply(lambda x: 1 if (x.find('@') < 0) else 0) self.df["game_year"] = pd.Series([int(self.df["game_date"][i][:4]) for i in range(0, len(self.df))]) self.df["game_month"] = pd.Series([int(self.df["game_date"][i][5:7]) for i in range(0, len(self.df))]) self.df["game_day"] = pd.Series([int(self.df["game_date"][i][-2:]) for i in range(0, len(self.df))]) action_type_list = list(set(self.df["action_type"].tolist())) self.df["action_type_num"] = pd.Series([action_type_list.index(self.df["action_type"][i]) for i in range(0, len(self.df))]) combined_shot_type_list = list(set(self.df["combined_shot_type"].tolist())) self.df["combined_shot_type_num"] = pd.Series([combined_shot_type_list.index(self.df["combined_shot_type"][i]) for i in range(0, len(self.df))]) opponent_list = list(set(self.df["opponent"].tolist())) self.df["opponent_num"] = pd.Series([opponent_list.index(self.df["opponent"][i]) for i in range(0, len(self.df))]) game_id_list = list(set(self.df["game_id"].tolist())) self.df["game_id_num"] = pd.Series([game_id_list.index(self.df["game_id"][i]) for i in range(0, len(self.df))]) season_list = list(set(self.df["season"].tolist())) season_list.sort() self.df["season_num"] = pd.Series([season_list.index(self.df["season"][i]) for i in range(0, len(self.df))]) self.df["shot_distance"][self.df["shot_distance"] > 45] = 45 # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"] # return self.df def set_mapper(self): self.mapper_X = DataFrameMapper([ (u'action_type', LabelBinarizer()), (u'combined_shot_type', LabelBinarizer()), (u'loc_x', None), (u'loc_y', None), (u'minutes_remaining', None), (u'period', LabelBinarizer()), (u'playoffs', LabelBinarizer()), (u'season', 
LabelBinarizer()), (u'seconds_remaining', None), (u'shot_distance', None), (u'shot_type', LabelBinarizer()), (u'shot_zone_area', LabelBinarizer()), (u'shot_zone_basic', LabelBinarizer()), (u'shot_zone_range', LabelBinarizer()), (u'matchup', LabelBinarizer()), (u'shot_id', None), (u'season_num', None), (u'game_year', None), (u'game_month', None), (u'game_day', None), (u'first_period', LabelBinarizer()), (u'latter_period', LabelBinarizer()), (u'last_period', LabelBinarizer()), (u'last_quarter', LabelBinarizer()), (u'time_remaining', None), (u'latter_half', LabelBinarizer()), (u'last_5_sec', LabelBinarizer()), (u'opponent_num', LabelBinarizer()), (u'game_id_num', LabelBinarizer()), (u'last_moment', LabelBinarizer()), (u'away', LabelBinarizer()), (u'secondsFromStart', None), (u'secondsFromGameStart', None), (u'shotLocationCluster', LabelBinarizer()), (u'homeGame', LabelBinarizer()), ]) self.mapper_y = DataFrameMapper([(u'shot_made_flag', None),]) self.mapper_X.fit(self.df) self.mapper_y.fit(self.df) def split_df(self): self.train_df = self.df[~np.isnan(self.df["shot_made_flag"])] self.test_df = self.df[np.isnan(self.df["shot_made_flag"])] def vec_X(self, df): return self.mapper_X.transform(df.copy()) def vec_y(self, df): return self.mapper_y.transform(df.copy())
def create_LabelBinarized_files() : """ Apply LabelBinarizing to the data to create: A file with test coupon information (and LabelBinarizing of categorical variables) A file which aggregates coupon_detail and user information (and LabelBinarizing of categorical variables) A file which aggregates coupon_visit and user information (and LabelBinarizing of categorical variables) These files will be used in the similarity_distance.py script """ print "Create Label Binarized files" def get_unix_time(row): """Convert to unix time. Neglect time of the day """ row = row.split(" ") row = row[0].split("-") y,m,d = int(row[0]), int(row[1]), int(row[2]) return calendar.timegm(date(y,m,d).timetuple()) #read in all the input data cpdtr = pd.read_csv("../Data/Data_translated/coupon_detail_train_translated.csv") cpltr = pd.read_csv("../Data/Data_translated/coupon_list_train_translated.csv") cplte = pd.read_csv("../Data/Data_translated/coupon_list_test_translated.csv") ulist = pd.read_csv("../Data/Data_translated/user_list_translated.csv") ulist["REG_DATE_UNIX"] = ulist["REG_DATE"].apply(get_unix_time) # List of unbinarized features list_col_unbin = ["COUPON_ID_hash","USER_ID_hash", "GENRE_NAME", "large_area_name", "small_area_name", "PRICE_RATE", "CATALOG_PRICE", "DISCOUNT_PRICE", "DISPFROM", "DISPEND", "DISPPERIOD", "VALIDFROM", "VALIDEND", "VALIDPERIOD", "USABLE_DATE_MON", "USABLE_DATE_TUE", "USABLE_DATE_WED", "USABLE_DATE_THU", "USABLE_DATE_FRI", "USABLE_DATE_SAT", "USABLE_DATE_SUN", "USABLE_DATE_HOLIDAY", "USABLE_DATE_BEFORE_HOLIDAY", "ITEM_COUNT", "AGE", "SEX_ID", "REG_DATE_UNIX"] #making of the train set train = pd.merge(cpdtr, cpltr) train = pd.merge(train, ulist, left_on = "USER_ID_hash", right_on = "USER_ID_hash") train = train[list_col_unbin] # Format the test set as the train set cplte["USER_ID_hash"] = np.array(["dummyuser"]*len(cplte)) for col in ["ITEM_COUNT", "AGE", "SEX_ID", "REG_DATE_UNIX"] : cplte[col] = 0 #Then combine test and train cpchar = cplte[list_col_unbin] train = pd.concat([train, cpchar]) # Binarize features now list_to_binarize = ["GENRE_NAME", "large_area_name", "small_area_name"] # After binarisation, we obtain more features. 
We store the name of those features in d_bin d_bin = {} for feat in list_to_binarize: if feat == "GENRE_NAME" : cardinal = sorted(set(train[feat].values)) d_bin["GENRE_NAME"] = [feat + "_" + str(i) for i in cardinal] if feat == "large_area_name" : cardinal = sorted(set(train[feat].values)) d_bin["large_area_name"] = [feat + "_" + str(i) for i in cardinal] if feat == "small_area_name" : cardinal = sorted(set(train[feat].values)) d_bin["small_area_name"] = [feat + "_" + str(i) for i in cardinal] # Use a sklearn_pandas mapper for binarization list_mapper = [] # Store binaried col names in new list list_col_bin = [] for feat in list_col_unbin : if feat in list_to_binarize : list_col_bin += d_bin[feat] list_mapper.append((feat, preprocessing.LabelBinarizer())) else : list_col_bin.append(feat) list_mapper.append((feat, None)) mapper = DataFrameMapper(list_mapper) # Fit binarizer of full matrix and save train = mapper.fit_transform(train) # Incorporate binarized feature in train train = pd.DataFrame(train, index = None, columns = list_col_bin ) #separate the test from train test = train[train["USER_ID_hash"]=="dummyuser"] train = train[train["USER_ID_hash"] !="dummyuser"] #Save the test data test.to_csv("../Data/Data_translated/coupon_list_test_LB_translated.csv", index = False) #Free memory del test #Save the train data train.to_csv("../Data/Data_translated/coupon_train_aggregated_LB_translated.csv", index = False) #Free memory del train #Load visit data frame in chunks because it is too large for index, cpvtr in enumerate(pd.read_csv("../Data/Data_translated/coupon_visit_train_translated.csv", chunksize=100000)) : sys.stdout.write("\rProcessing row " + str(index*100000)+" to row "+str((index+1)*100000)) sys.stdout.flush() cpvtr = cpvtr[cpvtr["PURCHASE_FLG"]!=1][["VIEW_COUPON_ID_hash","USER_ID_hash"]] trainv = pd.merge(cpvtr, cpltr, left_on = "VIEW_COUPON_ID_hash", right_on = "COUPON_ID_hash") trainv = pd.merge(trainv, ulist, left_on = "USER_ID_hash", right_on = "USER_ID_hash") trainv["ITEM_COUNT"] = 0 trainv = trainv[list_col_unbin] #Binarize trainv = mapper.transform(trainv) trainv = pd.DataFrame(trainv, index = None, columns = list_col_bin ) # Add trainv to trainvisit if index == 0: with open("../Data/Data_translated/coupon_trainv_aggregated_LB_translated.csv", "w") as f : trainv.to_csv(f, index = False) else : with open("../Data/Data_translated/coupon_trainv_aggregated_LB_translated.csv", "a") as f : trainv.to_csv(f, index = False, header=False) print
Test = Test.groupby(['borough', 'month'])['complaints'].mean().reset_index()
X_train = Train[['borough', 'month']]
y_train = Train['complaints']
X_test = Test[['borough', 'month']]
y_test = Test['complaints']

# Features: convert the categorical variable to binary indicator variables
mapper = DataFrameMapper([('month', None), ('borough', LabelBinarizer())], df_out=True)

# Preprocess the feature data sets
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

model = LinearRegression(normalize=True)
model.fit(Z_train, y_train)
model.score(Z_train, y_train)
y_pred = model.predict(Z_test)
RSS = ((y_test - y_pred)**2).sum()
TSS = ((y_train.mean() - y_test)**2).sum()
R2 = 1.0 - RSS / TSS
print("Model performance R^2 = {}".format(R2))
print("Baseline model prediction {}".format(y_train.mean()))
class trainingData(object): def __init__(self, data): self.__df = data self.__col_list = self.__df.columns.values.tolist() self.__del_col_list=[] self.__cat_col_list=[] self.__final_col_list = [] def plot_corr(self): corMat = DataFrame(self.__df.corr()) plot.pcolor(corMat) plot.show() def getColNmaes(self): return self.__col_list def preprocess(self,predictors, target, unique = 3): assert str(target) """ :param target: variable to be predicted :param unique: tolerance for number of unique values in categorial columns. \ If unique count in categorial columns is greater than this count those particular columns are dropped :return: none """ for name in self.__col_list: if name == target: self.__temp = name if self.__df[name].dtype == 'O': if len(self.__df[name].unique()) > unique: self.__del_col_list.append(self.__col_list.index(name)) else: self.__cat_col_list.append(name) if self.__df[name].dtype == 'int64': self.__df[name] = self.__df[name].astype(float) #templist = [] for value in self.__col_list: if value not in predictors and self.__col_list.index(value) not in self.__del_col_list and value != target: self.__del_col_list.append(self.__col_list.index(value)) #drop unwqanted columns self.__df.drop(self.__df.columns[self.__del_col_list],axis=1,inplace=True) #drop null values self.__df.dropna(axis=1,how='any',inplace = True) #prepare target df self.__target_df= self.__df[self.__temp] self.__df.drop(self.__temp,axis = 1, inplace = True) #train test split self.trainX ,self.testX, self.trainY, self.testY = sklearn.cross_validation.train_test_split(self.__df,self.__target_df,test_size=0.30) #get final column list for mappers self.__final_col_list = self.__df.columns.values.tolist() self.__num_col_list = [item for item in self.__final_col_list if item not in self.__cat_col_list] #print self.num_col_list self.mapfunc = [] for name in self.__final_col_list: if self.__df[name].dtype == "O": self.mapfunc.append(([name],sklearn.preprocessing.LabelBinarizer())) else: self.mapfunc.append(([name], sklearn.preprocessing.StandardScaler(copy=False))) #io mappers self.in_mapper = DataFrameMapper(self.mapfunc) self.out_mapper = sklearn.preprocessing.RobustScaler(with_centering=False,copy=False) self.trainX = np.array(self.in_mapper.fit_transform(self.trainX),np.float32) self.trainY = np.array(self.out_mapper.fit_transform(self.trainY.reshape(-1,1)),np.float32) self.testX = np.array(self.in_mapper.transform(self.testX),np.float32) self.testY = np.array(self.out_mapper.transform(self.testY.reshape(-1,1)),np.float32) self.tindex = self.trainX.shape[0] def expt(self,name): """Export train or test Files...for debugging purposes """ if name == "trainX": __df = pd.DataFrame(self.trainX) __df.to_csv("trainX.csv") elif name == "trainY": __df = pd.DataFrame(self.trainY) __df.to_csv("trainY.csv") elif name == "testX": __df = pd.DataFrame(self.testX) __df.to_csv("testX.csv") elif name == "testY": __df = pd.DataFrame(self.testX) __df.to_csv("testX.csv") else: raise ValueError
x=train[feasible_columns].drop(columns_to_drop, axis=1).fillna(0) x_test=test[feasible_columns].drop(columns_to_drop, axis=1).fillna(0) x_result=predict[feasible_columns].drop(columns_to_drop, axis=1).fillna(0) x_result=x_result.loc[[i in [46,58,70,82,94,106,118,130,142] for i in x_result.time_id],:] x_final=x_result.reset_index(drop=True) a=x.columns mapper=[] for j in a: if j in ['district_id', 'Day', 'Weekday', 'Workday', 'Yesterday_Workday','Twoday_ago_Workday', 'time_id']: mapper.append((j,None)) else: mapper.append((j,StandardScaler())) b=DataFrameMapper(mapper) b.fit(pd.concat([x, x_test, x_result])) x=b.transform(x) x_test=b.transform(x_test) x_result_before = x_result x_result=b.transform(x_result) #Random Forest clf = ensemble.RandomForestClassifier(n_estimators=20,max_features=min(len(feasible_columns) - len(columns_to_drop), 25)) clf.fit(x,y) clf_predict=clf.predict(x_test) clf_score=clf.score(x_test, y_test) clf_predict.fill(1) diff=clf_predict-y_test MAPE=sum(abs(diff[y_test!=0]/y_test[y_test!=0])/len(y_test))
def __load_coupons(self, validation_timedelta): train_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_train.csv"), parse_dates=["DISPFROM","DISPEND"]) test_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_test.csv")) train_coupon_df["DISPFROM"].fillna(pd.Timestamp("19000101"), inplace=True) train_coupon_df = train_coupon_df.sort(columns=["DISPFROM"]).reset_index(drop=True) if validation_timedelta: max_date = train_coupon_df["DISPFROM"].max() valid_start = max_date - validation_timedelta valid_coupon_df = train_coupon_df[(train_coupon_df["DISPFROM"] > valid_start)] train_coupon_df = train_coupon_df[~ (train_coupon_df["DISPFROM"] > valid_start)] else: valid_coupon_df = train_coupon_df[np.zeros(len(train_coupon_df), dtype=np.bool)].copy() # remove outlier data from the validation-set if len(valid_coupon_df) > 0: very_low_price = valid_coupon_df[valid_coupon_df.DISCOUNT_PRICE <= 100].COUPON_ID_hash very_long_time_display = valid_coupon_df[valid_coupon_df.DISPPERIOD > 20].COUPON_ID_hash valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_long_time_display)] valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_low_price)].reset_index(drop=True) # remove outlier data from the training-set very_long_time_display = train_coupon_df[train_coupon_df.DISPPERIOD > 20].COUPON_ID_hash train_coupon_df = train_coupon_df[~train_coupon_df.COUPON_ID_hash.isin(very_long_time_display)].reset_index(drop=True) # coupon features coupon_mapper = DataFrameMapper([ ('CATEGORY_NAME', LabelBinarizer()), ('PRICE_RATE', None), ('CATALOG_PRICE_LOG', None), ('DISCOUNT_PRICE_LOG', None), ('REDUCE_PRICE_LOG', None), ('DISPPERIOD_C', LabelBinarizer()), ('VALIDPERIOD_NA', LabelBinarizer()), ('USABLE_DATE_SUM', None), ('LARGE_AREA_NAME', LabelBinarizer()), ('PREF_NAME', LabelBinarizer()), ('SMALL_AREA_NAME', LabelBinarizer()), ]) config = {} self.__coupon_preproc(train_coupon_df) self.__coupon_preproc(valid_coupon_df) self.__coupon_preproc(test_coupon_df) coupon_mapper.fit(pd.concat([train_coupon_df, valid_coupon_df, test_coupon_df])) train_coupon_vec = coupon_mapper.transform(train_coupon_df.copy()) if len(valid_coupon_df) > 0: valid_coupon_vec = coupon_mapper.transform(valid_coupon_df.copy()) else: valid_coupon_vec = np.array([]) test_coupon_vec = coupon_mapper.transform(test_coupon_df.copy()) self.train_coupon_vec = train_coupon_vec self.valid_coupon_vec = valid_coupon_vec self.test_coupon_vec = test_coupon_vec self.train_coupon_df = train_coupon_df self.valid_coupon_df = valid_coupon_df self.test_coupon_df = test_coupon_df
# Types of features
binary_features = ['mouseovers', 'viewable']
cat_features = ['placement_id', 'browser_id', 'os_id', 'region', 'country', 'campaign', 'creative_asset_id']
numeric_features = ['hour', 'max_duration', 'video_length']

# Preprocess each feature group accordingly
mapper = DataFrameMapper([(binary_features, None),
                          (cat_features, OneHotEncoder(handle_unknown='ignore')),
                          (numeric_features, MaxAbsScaler())],
                         sparse=True)

# Fit to training data only
X_train = np.round(mapper.fit_transform(df_train.copy()), 2)
# Use the same mapper to transform test data
X_test = np.round(mapper.transform(df_test.copy()), 2)

# Begin cross validation
cv = cross_validation.StratifiedKFold(y_train, 10)
parameters = {'alpha': [1e-6, 1e-5, 1e-4, 1e-3]}
grid_search = GridSearchCV(SGDClassifier(loss='log', penalty='l1', n_iter=10, shuffle=True),
                           parameters, cv=cv, verbose=True, scoring='f1')
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_
print("Best parameters set:")
best_parameters = clf.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Preprocess X_train
feature_def = gen_features(
    columns=[[c] for c in X_train.columns[:7]],
    classes=[MinMaxScaler]
)
feature_def += ((pos_col, [LabelBinarizer()]),)
svc_preprocessor = DataFrameMapper(feature_def)
X_train = svc_preprocessor.fit_transform(X_train)
svc_preprocessor_fn = os.path.join('../model/tmp/svc_preprocessor.%s.pkl' % (nrows,))
joblib.dump(svc_preprocessor, open(svc_preprocessor_fn, 'wb'))
X_test = svc_preprocessor.transform(X_test)

##### Didn't help!!
# X_train, y_train = downsample_negatives(X_train, y_train)

for cv in [1, 10, 20]:
    print("Training, sample_count: %s\tcv:%s" % (nrows, cv))
    clf = svm.SVC(kernel='linear', class_weight={1: cv})
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    measures = metrics.precision_recall_fscore_support(y_test, y_pred, average='binary')
    model_file_name = os.path.join('../model/tmp/svc_mentions_unbalanced.%s.%s.pkl' % (nrows, cv))
    joblib.dump(clf, open(model_file_name, 'wb'))
    print("measures: ", measures)
    sys.stdout.flush()
def create_mapper(df, cat_vars=list(), cont_vars=list(), date_vars=list(), no_transform_vars=list(),
                  response_vars=list()):
    logging.info('Creating mapper')

    # TODO Add support for datetime variables
    # Reference variables
    transformation_list = list()

    # Copy df, to avoid 'inplace' transformation
    df = df.copy(deep=True)

    # TODO Check if any df variables are not listed in cat_vars or cont_vars. If so, remove them.

    # Check if any df variables are listed in both cat_vars and cont_vars. If so, raise an error.
    intersection = [x for x in cont_vars if x in cat_vars]
    if len(intersection) > 0:
        raise AssertionError('Columns appear in both cat_vars and cont_vars: {}'.format(intersection))

    # Convert continuous variables to float32
    for cont_var in cont_vars + response_vars:
        logging.debug('Converting cont_var data type: {}'.format(cont_var))
        df[cont_var] = df[cont_var].astype(numpy.float32)

    for date_var in date_vars:
        logging.info('Enriching for datetime var: {}'.format(date_var))
        df, date_cat_vars, date_cont_vars = add_datetime_vars(df, date_var)
        cat_vars.extend(date_cat_vars)
        cont_vars.extend(date_cont_vars)

    # Add continuous variable transformations for cont_vars
    for cont_var in cont_vars + response_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(cont_var))
        transformations = [Imputer(strategy='mean'), StandardScaler()]
        var_tuple = ([cont_var], transformations)
        transformation_list.append(var_tuple)

    # Add categorical variable transformations for cat_vars
    for cat_var in cat_vars:
        logging.debug('Creating transformation list for cat_var: {}'.format(cat_var))
        # TODO Replace LabelEncoder with CategoricalEncoder, to better handle unseen cases
        transformations = [LabelEncoder()]
        var_tuple = (cat_var, transformations)
        transformation_list.append(var_tuple)

    for no_transform_var in no_transform_vars:
        logging.debug('Creating transformation list for no_transform_var: {}'.format(no_transform_var))
        transformations = [Imputer(strategy='most_frequent')]
        var_tuple = ([no_transform_var], transformations)
        transformation_list.append(var_tuple)

    # Create mapper
    logging.info('Creating mapper')
    mapper = DataFrameMapper(features=transformation_list, df_out=True)

    # Train mapper
    logging.info('Training newly created mapper')
    mapper.fit(df)

    # Throw-away transformation, to initialize the mapper's internals
    logging.info('Transforming data set with newly created mapper, to initialize mapper internals')
    mapper.transform(df.sample(min(1000, len(df))))

    return mapper
import pandas as pd
import numpy as np
import seaborn as sn
import sklearn.preprocessing
from sklearn_pandas import DataFrameMapper, cross_val_score
import re

df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

feats = [key for key in df_train.keys() if re.match('.*feat.*', key)]
mapper = DataFrameMapper([(feats, sklearn.preprocessing.StandardScaler())])
data_train_scaled = mapper.fit_transform(df_train)
data_test_scaled = mapper.transform(df_test)
data_test = df_test[feats]
data_train = df_train[feats]