def get_categorical_imputers(df, features):
    """Fit and return a DataFrameMapper that imputes each listed
    categorical column with its own CategoricalImputer."""
    # One imputer instance per column, built in a single comprehension.
    feature_defs = [(col_name, CategoricalImputer()) for col_name in features]
    multi_imputer = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    multi_imputer.fit(df[features])
    return multi_imputer
def mapper(df):
    """Fit and return the (X, y) DataFrameMappers for the raw shot data.

    Each feature is either passed through unchanged (False) or one-hot
    encoded via LabelBinarizer (True); the order below fixes the output
    column order.
    """
    feature_spec = [
        (u'action_type', True), (u'combined_shot_type', True),
        (u'loc_x', False), (u'loc_y', False),
        (u'minutes_remaining', False), (u'period', True),
        (u'playoffs', True), (u'seconds_remaining', False),
        (u'shot_distance', False), (u'shot_type', True),
        (u'shot_zone_area', True), (u'shot_zone_basic', True),
        (u'shot_zone_range', True), (u'matchup', True),
        (u'shot_id', False), (u'time_remaining', False),
        (u'opponent_num', True), (u'game_id_num', True),
    ]
    x_mapper = DataFrameMapper(
        [(name, LabelBinarizer() if binarize else None)
         for name, binarize in feature_spec])
    x_mapper.fit(df)
    y_mapper = DataFrameMapper([(u'shot_made_flag', None)])
    y_mapper.fit(df)
    return x_mapper, y_mapper
def fg_categorical_categorical(train_categorical, test_categorical):
    """Generate interaction features from every r-way combination of the
    original categorical columns, then label-encode all columns.

    Both frames are mutated in place (new columns are added); freshly
    encoded copies are returned.
    """
    # Snapshot the original column set as constants: the frames gain new
    # columns inside the loop, so the live .columns attribute changes.
    columns_length = len(train_categorical.columns)
    columns_header = list(train_categorical.columns)
    for r in range(2, columns_length + 1):
        # All r-way combinations of the original columns.
        for combination in itertools.combinations(columns_header, r):
            # FIX: hoist the combined column name — the original
            # recomputed "_".join(combination) on every access.
            new_col = "_".join(combination)
            train_categorical[new_col] = train_categorical[combination[0]].astype(str)
            test_categorical[new_col] = test_categorical[combination[0]].astype(str)
            for c in combination[1:]:
                train_categorical[new_col] = (
                    train_categorical[new_col] + "_" + train_categorical[c].astype(str))
                test_categorical[new_col] = (
                    test_categorical[new_col] + "_" + test_categorical[c].astype(str))
    # NOTE(review): LabelEncoder is fitted on train only; combined levels
    # that appear only in the test frame will make transform() raise.
    mapper = DataFrameMapper([(i, LabelEncoder()) for i in train_categorical.columns])
    mapper.fit(train_categorical)
    train_categorical_new = pd.DataFrame(mapper.transform(train_categorical),
                                         columns=train_categorical.columns)
    test_categorical_new = pd.DataFrame(mapper.transform(test_categorical),
                                        columns=test_categorical.columns)
    return train_categorical_new, test_categorical_new
def mapper(df):
    """Derive time_remaining, then fit and return the (X, y)
    DataFrameMappers for the engineered shot features."""
    df["time_remaining"] = df["minutes_remaining"] * 60 + df[
        "seconds_remaining"]
    # The spec is assembled from runs of pass-through columns with the
    # two LabelBinarizer columns interleaved; order fixes output order.
    feature_spec = (
        [(name, None) for name in (
            u'loc_x', u'loc_y', u'minutes_remaining', u'period',
            u'seconds_remaining', u'shot_distance')]
        + [(u'playoffs', LabelBinarizer())]
        + [(name, None) for name in (
            u'action_type_num', u'combined_shot_type_num', u'season_num',
            u'shot_type_num', u'shot_zone_area_num', u'shot_zone_basic_num',
            u'shot_zone_range_num')]
        + [(u'matchup', LabelBinarizer())]
        + [(name, None) for name in (
            u'shot_id', u'opponent_num', u'time_remaining', u'last_moment',
            u'hna', u'action1', u'action2', u'action3', u'action4',
            u'action5', u'action6', u'action8', u'action9', u'action10',
            u'action11', u'action13', u'action14', u'action15',
            u'action16')]
    )
    x_mapper = DataFrameMapper(feature_spec)
    x_mapper.fit(df)
    y_mapper = DataFrameMapper([(u'shot_made_flag', None)])
    y_mapper.fit(df)
    return x_mapper, y_mapper
def get_mapper(data_all):
    """Fit and return a DataFrameMapper over every model feature column.

    Pass-through columns (None) are already numeric/ordinal; the rest are
    label-encoded.  Entry order defines the output column order.
    """
    param_list = [
        ('id', None), ('major', LabelEncoder()), ('age', None),
        ('gender', LabelEncoder()), ('isenglish', None), ('isjunior', None),
        ('isbachelor', None), ('ismaster', None), ('isintern', None),
        ('total_previous_job', None), ('last_type', LabelEncoder()),
        ('last_type1', LabelEncoder()), ('last_department', LabelEncoder()),
        ('last_size', None), ('last_salary', None),
        ('last_industry', LabelEncoder()),
        ('last_position_name', LabelEncoder()), ('last_start_year', None),
        ('last_start_month', None), ('last_end_year', None),
        ('last_end_month', None), ('last_interval_month', None),
        ('third_type', LabelEncoder()), ('third_type1', LabelEncoder()),
        ('third_department', LabelEncoder()), ('third_size', None),
        ('third_salary', None), ('third_industry', LabelEncoder()),
        ('third_position_name', LabelEncoder()), ('third_start_year', None),
        ('third_start_month', None), ('third_end_year', None),
        ('third_end_month', None), ('third_interval_month', None),
        ('first_type', LabelEncoder()), ('first_type1', LabelEncoder()),
        ('first_department', LabelEncoder()), ('first_size', None),
        ('first_salary', None), ('first_industry', LabelEncoder()),
        ('first_position_name', LabelEncoder()), ('first_start_year', None),
        ('first_start_month', None), ('first_end_year', None),
        ('first_end_month', None), ('first_interval_month', None),
        ('last3_interval_month', None),
        ('diff_last3_salary', LabelEncoder()),
        ('diff_last3_size', LabelEncoder()),
        ('diff_last3_industry', LabelEncoder()),
        ('diff_last3_position_name', LabelEncoder()),
        ('total_interval_month', None), ('diff_salary', LabelEncoder()),
        ('diff_size', LabelEncoder()), ('diff_industry', LabelEncoder()),
        ('diff_position_name', LabelEncoder()), ('major_1', LabelEncoder()),
        ('last_position_name_1', LabelEncoder()),
        ('last_department_1', LabelEncoder()),
        ('third_position_name_1', LabelEncoder()),
        ('third_department_1', LabelEncoder()),
        ('first_position_name_1', LabelEncoder()),
        ('first_department_1', LabelEncoder()),
        ('major_2', LabelEncoder()),
        ('last_position_name_2', LabelEncoder()),
        ('last_department_2', LabelEncoder()),
        ('third_position_name_2', LabelEncoder()),
        ('third_department_2', LabelEncoder()),
        ('first_position_name_2', LabelEncoder()),
        ('first_department_2', LabelEncoder()),
        ('start_working_age', None), ('rev_working_age', None),
        ('pre_working_month', None), ('pre_interval_month', None),
        ("pre_largest_size", None), ("pre_largest_salary", None),
        ("pre_least_size", None), ("pre_least_salary", None),
        ("pre_size1", None), ("pre_size2", None), ("pre_size3", None),
        ("pre_size4", None), ("pre_size5", None), ("pre_size6", None),
        ("pre_size7", None), ("pre_salary1", None), ("pre_salary2", None),
        ("pre_salary3", None), ("pre_salary4", None), ("pre_salary5", None),
        ("pre_salary6", None), ("pre_salary7", None),
        ("promotion_size", None), ("promotion_salary", None),
        ("decrease_size", None),
        # NOTE(review): 'decrease_salar' looks like a typo for
        # 'decrease_salary' but must match the real column name — confirm.
        ("decrease_salar", None),
    ]
    # FIX: a parenthesized single-argument print produces identical output
    # under Python 2 and 3 (the original used a Py2-only print statement).
    print("the mapper's param list is %s" % len(param_list))
    mapper = DataFrameMapper(param_list)
    mapper.fit(data_all)
    return mapper
def test_fit_with_optional_y_arg(complex_dataframe):
    """A transformer whose fit() declares an optional ``y`` argument must
    accept the target without error."""
    frame = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    mapper.fit(frame[['feat1', 'feat2']], frame['target'])  # must not raise
def test_fit_with_optional_y_arg(complex_dataframe):
    """Transformers with an optional ``y`` in fit() are handled correctly."""
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # Supplying the target alongside X must not raise.
    mapper.fit(complex_dataframe[['feat1', 'feat2']],
               complex_dataframe['target'])
def pre_processing(self):
    """Split features into categorical and numeric halves, label-encode
    the categoricals (encoders fitted on train only) and re-assemble the
    train/validation/test feature matrices as numpy arrays."""
    # Every column not declared categorical is treated as numeric.
    self.__numeric_header = [
        i for i in self.__train_feature.columns
        if i not in self.__categorical_header
    ]
    # Slice each split into its categorical and numeric parts.
    self.__train_categorical = self.__train_feature[
        self.__categorical_header]
    self.__train_numeric = self.__train_feature[self.__numeric_header]
    self.__validation_categorical = self.__validation_feature[
        self.__categorical_header]
    self.__validation_numeric = self.__validation_feature[
        self.__numeric_header]
    self.__test_categorical = self.__test_feature[
        self.__categorical_header]
    self.__test_numeric = self.__test_feature[self.__numeric_header]
    # Cast categoricals to string so LabelEncoder sees a uniform dtype.
    self.__train_categorical = self.__train_categorical.astype(str)
    self.__validation_categorical = self.__validation_categorical.astype(
        str)
    self.__test_categorical = self.__test_categorical.astype(str)
    # NOTE(review): astype(str) already turns NaN into the string "nan",
    # so these fillna("missing") calls are likely no-ops — confirm intent.
    self.__train_categorical = self.__train_categorical.fillna("missing")
    self.__validation_categorical = self.__validation_categorical.fillna(
        "missing")
    self.__test_categorical = self.__test_categorical.fillna("missing")
    # DataFrames produced by DataFrameMapper lose the original index, so
    # re-apply it via set_index below.  Encoders are fit on train only;
    # unseen validation/test levels would make transform() raise.
    mapper = DataFrameMapper([(i, LabelEncoder())
                              for i in self.__train_categorical.columns])
    mapper.fit(self.__train_categorical)
    self.__train_categorical = pd.DataFrame(
        mapper.transform(self.__train_categorical),
        columns=self.__train_categorical.columns).set_index(
            self.__train_categorical.index)
    self.__validation_categorical = pd.DataFrame(
        mapper.transform(self.__validation_categorical),
        columns=self.__validation_categorical.columns).set_index(
            self.__validation_categorical.index)
    self.__test_categorical = pd.DataFrame(
        mapper.transform(self.__test_categorical),
        columns=self.__test_categorical.columns).set_index(
            self.__test_categorical.index)
    # Numeric NaNs become a -999 sentinel.
    self.__train_numeric = self.__train_numeric.fillna(-999)
    self.__validation_numeric = self.__validation_numeric.fillna(-999)
    self.__test_numeric = self.__test_numeric.fillna(-999)
    # Re-join the halves and drop down to raw ndarrays.
    self.__train_feature = pd.concat(
        [self.__train_categorical, self.__train_numeric],
        axis=1).values
    self.__validation_feature = pd.concat(
        [self.__validation_categorical, self.__validation_numeric],
        axis=1).values
    self.__test_feature = pd.concat(
        [self.__test_categorical, self.__test_numeric],
        axis=1).values
def transform_cat_to_cont(df, cat_features, cont_features):
    """Binarize the categorical columns, pass the continuous ones through
    unchanged, and return the transformed frame."""
    feature_defs = ([(name, MyLabelBinarizer()) for name in cat_features]
                    + [(name, None) for name in cont_features])
    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    # fit() returns the mapper itself, so fit/transform can be chained.
    return mapper.fit(df).transform(df)
def test_cols_list_column_vector(simple_dataframe):
    """A one-element list selector must feed the transformer a column
    vector (2-d input of shape (n, 1))."""
    fake_transformer = Mock()
    DataFrameMapper([(["a"], fake_transformer)]).fit(simple_dataframe)
    call_args, _ = fake_transformer.fit.call_args
    assert call_args[0].shape == (3, 1)
def test_cols_string_array(simple_dataframe):
    """A bare-string selector must feed the transformer a 1-d array."""
    fake_transformer = Mock()
    DataFrameMapper([("a", fake_transformer)]).fit(simple_dataframe)
    call_args, _ = fake_transformer.fit.call_args
    assert call_args[0].shape == (3,)
def test_cols_list_column_vector(simple_dataframe):
    """Selecting columns with a one-element list yields a (n, 1) column
    vector for the transformer's fit()."""
    spy = Mock()
    mapper = DataFrameMapper([(["a"], spy)])
    mapper.fit(simple_dataframe)
    positional, _ = spy.fit.call_args
    assert positional[0].shape == (3, 1)
def test_cols_string_array(simple_dataframe):
    """Selecting a column with a plain string yields a 1-d array for the
    transformer's fit()."""
    spy = Mock()
    mapper = DataFrameMapper([("a", spy)])
    mapper.fit(simple_dataframe)
    positional, _ = spy.fit.call_args
    assert positional[0].shape == (3,)
def test_numerical_transformer_serialization(simple_dataset):
    """A fitted NumericalTransformer mapper must survive a joblib
    round-trip and produce identical output."""
    transformer = DataFrameMapper([('feat1', NumericalTransformer('log'))])
    df = simple_dataset
    transformer.fit(df)
    f = tempfile.NamedTemporaryFile(delete=True)
    joblib.dump(transformer, f.name)
    reloaded = joblib.load(f.name)
    # BUG FIX: the original computed np.array_equal(...) but discarded the
    # result, so the test could never fail; assert on it instead.
    assert np.array_equal(transformer.transform(df), reloaded.transform(df))
    f.close()
def test_exception_column_context_fit(simple_dataframe):
    """An exception raised while fitting a column is re-raised with the
    column name prepended to the message."""
    class BrokenFitter(object):
        def fit(self, X):
            raise Exception('Some exception')

    mapper = DataFrameMapper([('a', BrokenFitter())])
    with pytest.raises(Exception, match='a: Some exception'):
        mapper.fit(simple_dataframe)
def test_mapper(self):
    """EmbeddingVectorizer inside a DataFrameMapper encodes and pads the
    'name' column to the configured sequence length."""
    titanic = lib.load_titanic()
    steps = [(['name'], [EmbeddingVectorizer(max_sequence_length=12)])]
    fitted = DataFrameMapper(steps, df_out=True)
    fitted.fit(titanic)
    transformed = fitted.transform(titanic)
    assert_array_equal([2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1],
                       transformed.values[0, :])
def test_exception_column_context_fit(simple_dataframe):
    """Fitting errors must carry the offending column's name."""
    class AlwaysFails(object):
        def fit(self, X):
            raise Exception('Some exception')

    failing_mapper = DataFrameMapper([('a', AlwaysFails())])
    # The column name 'a' must be prefixed onto the original message.
    with pytest.raises(Exception, match='a: Some exception'):
        failing_mapper.fit(simple_dataframe)
def transform_features(self):
    """Build the enabled per-field transformer list, fit a DataFrameMapper
    on the train slice, dump the train split to SVMLight format and return
    the train/test matrices plus the train labels."""
    totransform = []
    for item in self.feat_head:
        field, func_name, transform, is_enable = item[0], item[1], item[2], item[3]
        if is_enable:
            # Generate the raw column on demand if it does not exist yet.
            if not field in self.stumble_data.get_features():
                print('field not in feature..generating:' + field)
                func_name(field)
            totransform.append((field, transform))
    if len(totransform):
        mapper = DataFrameMapper(totransform)
        # Fit on the train rows only; transform train and test separately.
        mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_train = mapper.transform(
            self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_test = mapper.transform(
            self.stumble_data.all_pd[self.stumble_data.len_train:])
        # Drop the raw columns that the mapper consumed.
        for item in self.feat_head:
            field = item[0]
            is_enable = item[3]
            if is_enable and field in self.stumble_data.get_features():
                del self.stumble_data.all_pd[field]
        # BUG FIX: removed the leftover `import pdb; pdb.set_trace()`
        # breakpoint (it halted every run here) and the unused
        # `from scipy.sparse import hstack` import.
        X_train = X_transformed_train
        X_test = X_transformed_test
        y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file)
        # dump_svmlight_file(X_test, pred, output_test_libsvm_file)
    else:
        # NOTE(review): X_train / X_test / y_train are unbound on this
        # branch, so it raises NameError — confirm the intended fallback.
        X_train = X_train.as_matrix()
        X_test = X_test.as_matrix()
    return X_train, y_train, X_test
class Preprocessor:
    """Fits a column mapper on training data and applies it to new frames."""
    mapper: DataFrameMapper

    def __init__(self):
        # Categorical fields: impute the most frequent value, then
        # ordinal-encode.  Numeric fields: standard-scale.
        categorical_step = (encoding_fields, [
            SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(),
        ])
        numeric_step = (scale_fields, preprocessing.StandardScaler())
        self.mapper = DataFrameMapper([categorical_step, numeric_step])

    def train(self, x: pd.DataFrame):
        """Fit the mapper on the training frame *x*."""
        self.mapper.fit(x)

    def transform(self, x: pd.DataFrame):
        """Apply the fitted mapper to *x* and return the result."""
        return self.mapper.transform(x)
def transform_features(self):
    """Apply the enabled per-field transformers via DataFrameMapper, dump
    the train split in SVMLight format, and return (X_train, y_train,
    X_test)."""
    totransform = []
    for item in self.feat_head:
        field, func_name, transform, is_enable = item[0], item[1], item[2], item[3]
        if is_enable:
            if not field in self.stumble_data.get_features():
                # Derive the missing column before transforming it.
                print('field not in feature..generating:' + field)
                func_name(field)
            totransform.append((field, transform))
    if len(totransform):
        mapper = DataFrameMapper(totransform)
        mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_train = mapper.transform(
            self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_test = mapper.transform(
            self.stumble_data.all_pd[self.stumble_data.len_train:])
        # Remove the consumed raw columns from the shared frame.
        for item in self.feat_head:
            field = item[0]
            is_enable = item[3]
            if is_enable and field in self.stumble_data.get_features():
                del self.stumble_data.all_pd[field]
        # BUG FIX: removed the leftover `import pdb; pdb.set_trace()`
        # debugger breakpoint and the unused
        # `from scipy.sparse import hstack` import.
        X_train = X_transformed_train
        X_test = X_transformed_test
        y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file)
        # dump_svmlight_file(X_test, pred, output_test_libsvm_file)
    else:
        # NOTE(review): X_train / X_test / y_train are unbound on this
        # branch, so it raises NameError — confirm the intended fallback.
        X_train = X_train.as_matrix()
        X_test = X_test.as_matrix()
    return X_train, y_train, X_test
def test_build_features_old_unpickle(simple_dataframe):
    """Mappers pickled before built_features/built_default existed must
    still transform correctly after unpickling."""
    mapper = DataFrameMapper([('a', None)])
    mapper.fit(simple_dataframe)
    # Simulate a pickle produced by an older library version.
    del mapper.built_features
    del mapper.built_default
    restored = pickle.loads(pickle.dumps(mapper))
    restored.transform(simple_dataframe)  # must not raise
def test_build_features_old_unpickle(simple_dataframe):
    """An old-format pickle (without built_features / built_default) is
    still usable for transform after loading."""
    fitted = DataFrameMapper([('a', None)])
    fitted.fit(simple_dataframe)
    # Strip the attributes introduced after the old pickle format.
    del fitted.built_features
    del fitted.built_default
    payload = pickle.dumps(fitted)
    revived = pickle.loads(payload)
    revived.transform(simple_dataframe)  # doesn't fail
def train_fn(args):
    """Impute the credit-risk features, train a RandomForest and a SHAP
    explainer, and save all artifacts under args.model_dir."""
    print("loading data")
    train_df = pd.read_csv(args.train_data + "/train.csv", engine='python')
    test_df = pd.read_csv(args.test_data + "/test.csv", engine='python')
    TARGET = 'SeriousDlqin2yrs'
    X_train = train_df.drop(TARGET, axis=1)
    y_train = train_df[TARGET]
    X_test = test_df.drop(TARGET, axis=1)
    y_test = test_df[TARGET]
    print("Imputing missing values")
    # MonthlyIncome gets a -1 sentinel; every other column is imputed with
    # its median.  List order fixes the mapper's output column order.
    median_cols = [
        'age', 'NumberOfDependents', 'DebtRatio',
        'RevolvingUtilizationOfUnsecuredLines',
        'NumberRealEstateLoansOrLines', 'NumberOfOpenCreditLinesAndLoans',
        'NumberOfTime30-59DaysPastDueNotWorse',
        'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
    ]
    transformer = DataFrameMapper(
        [(['MonthlyIncome'], DFImputer(strategy="constant", fill_value=-1))]
        + [([col], DFImputer(strategy="median")) for col in median_cols],
        input_df=True, df_out=True)
    transformer.fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)
    print("Building model...")
    model = RandomForestClassifier(n_estimators=50, max_depth=6,
                                   max_leaf_nodes=30)
    model.fit(X_train, y_train)
    explainer = shap.TreeExplainer(model)
    print("Saving artifacts...")
    model_dir = Path(args.model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)
    # BUG FIX: joblib.dump accepts a filesystem path; the original passed
    # open(..., "wb") handles that were never closed (fd leak).
    joblib.dump(transformer, str(model_dir / "transformer.joblib"))
    joblib.dump(model, str(model_dir / "model.joblib"))
    joblib.dump(explainer, str(model_dir / "explainer.joblib"))
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """With input_df=True, downstream transformers receive pandas objects
    whenever the upstream transformers emit pandas objects."""
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    mapper = DataFrameMapper([('a', [MockXTransformer(), MockTClassifier()])],
                             input_df=True)
    mapper.fit(simple_dataframe)
    result = mapper.transform(simple_dataframe)
    fit_args, _ = MockTClassifier().fit.call_args
    assert isinstance(fit_args[0], pd.Series)
    assert_array_equal(result, np.array([1, 2, 3]).reshape(-1, 1))
def test_fit_transform_equiv_mock(simple_dataframe):
    """fit_transform must equal fit followed by transform when the
    transformer has no custom fit_transform of its own."""
    mapper = DataFrameMapper([('a', MockXTransformer())])
    combined = mapper.fit_transform(simple_dataframe)
    separate = mapper.fit(simple_dataframe).transform(simple_dataframe)
    assert np.all(combined == separate)
def test_fit_transform_equiv_mock(simple_dataframe):
    """Both code paths — fit_transform, and fit then transform — agree
    for a transformer without a custom fit_transform."""
    frame = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    via_fit_transform = mapper.fit_transform(frame)
    mapper.fit(frame)
    via_two_steps = mapper.transform(frame)
    assert np.all(via_fit_transform == via_two_steps)
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """input_df=True keeps pandas objects flowing between chained
    transformers (as long as earlier steps output pandas objects)."""
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    chained = [MockXTransformer(), MockTClassifier()]
    mapper = DataFrameMapper([('a', chained)], input_df=True)
    mapper.fit(simple_dataframe)
    out = mapper.transform(simple_dataframe)
    first_positional = MockTClassifier().fit.call_args[0]
    assert isinstance(first_positional[0], pd.Series)
    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
def encode_cat_var(df, cat_var_dict):
    """Fit one LabelEncoder per categorical column and return the fitted
    DataFrameMapper; columns are ordered by descending dict value."""
    ordered = sorted(cat_var_dict.items(), key=operator.itemgetter(1),
                     reverse=True)
    cat_maps = [(name, LabelEncoder()) for name, _ in ordered]
    return DataFrameMapper(cat_maps).fit(df)
def extract_features(dataset1, vect, prom_filename1, prom_filename2):
    """Add count- and similarity-based features to *dataset1* and return
    the stacked feature matrix from a fitted DataFrameMapper."""
    # Adding 3 more features: tf-idf vector and relevance scores.
    dataset1['relevance'] = np.nan
    dataset1['relevance_quesn'] = np.nan
    dataset1['word_count'] = np.nan
    # Adding tf-idf vectors as features; currently set to [essay], to be
    # transformed using the mapper.  Kept as a copy since tf-idf needs an
    # iterable over text.
    dataset1['tf-idf'] = dataset1['essay']
    tokenizer_words = TweetTokenizer()
    # Simple surface statistics of each essay.
    dataset1['word_count'] = list(
        map(len, dataset1['essay'].apply(tokenizer_words.tokenize)))
    dataset1['distinct_word_count'] = list(
        map(len, list(map(set, dataset1['essay'].apply(tokenizer_words.tokenize)))))
    dataset1['sentence_count'] = list(
        map(len, dataset1['essay'].apply(nltk.sent_tokenize)))
    dataset1['charslength'] = list(
        map(len, list(map("".join, dataset1['essay'].apply(tokenizer_words.tokenize)))))
    dataset1['avg_word_length'] = np.divide(dataset1['charslength'],
                                            dataset1['word_count'])
    # Similarity between prompt and essay becomes the 'relevance' features.
    prm1_vect, prm1_quesn_vect = extract_prompt(prom_filename1,
                                                prom_filename2, vect)
    essay_vect = vect.transform(dataset1['essay'])
    # Don't compute element-wise: broadcasting reduces time.
    sim1 = cosine_similarity(essay_vect, prm1_vect)
    sim2 = cosine_similarity(essay_vect, prm1_quesn_vect)
    dataset1['relevance'] = sim1
    dataset1['relevance_quesn'] = sim2
    # NOTE: DataFrameMapper raises a 'NoneType not iterable' error if it
    # is not fitted to data; it must be fitted before transforming,
    # otherwise it receives no data — hence the explicit fit below.
    from sklearn_pandas import DataFrameMapper
    mapper = DataFrameMapper([
        # ('tf-idf', Vectorize(vect), {'input_df': True}),  # transforming tf-idf using vect (disabled)
        ('relevance', None),          # no transformation required
        ('relevance_quesn', None),    # no transformation required
        ('essay_set', None),
        ('word_count', None),
        ('distinct_word_count', None),
        ('sentence_count', None),
        ('avg_word_length', None),
        # If we included essay it would break the final stacking of
        # columns: tf-idf is sparse and text cannot be converted to the
        # sparse format.  The essay text is also not required.
    ])
    # Combine the tf-idf and relevance features as defined in the mapper.
    mapp = mapper.fit(dataset1)  # applies individual column transformation
    features = mapp.transform(dataset1)
    return features
def pre_processing(self):
    """Split features into categorical and numeric parts, label-encode the
    categoricals (encoders fitted on train only) and rebuild the train and
    test matrices as numpy arrays."""
    # Every column not declared categorical is treated as numeric.
    self.__numeric_header = [i for i in self.__train_feature.columns
                             if i not in self.__categorical_header]
    self.__train_categorical = self.__train_feature[self.__categorical_header]
    self.__train_numeric = self.__train_feature[self.__numeric_header]
    self.__test_categorical = self.__test_feature[self.__categorical_header]
    self.__test_numeric = self.__test_feature[self.__numeric_header]
    # Uniform string dtype for the encoders.
    self.__train_categorical = self.__train_categorical.astype(str)
    self.__test_categorical = self.__test_categorical.astype(str)
    # NOTE(review): astype(str) already turns NaN into the string "nan",
    # so these fillna("missing") calls are likely no-ops — confirm intent.
    self.__train_categorical = self.__train_categorical.fillna("missing")
    self.__test_categorical = self.__test_categorical.fillna("missing")
    # One LabelEncoder per categorical column, fitted on train only;
    # unseen test levels would make transform() raise below.
    mapper = DataFrameMapper([(i, LabelEncoder())
                              for i in self.__train_categorical.columns])
    mapper.fit(self.__train_categorical)
    self.__train_categorical = pd.DataFrame(
        mapper.transform(self.__train_categorical),
        columns=self.__train_categorical.columns)
    self.__test_categorical = pd.DataFrame(
        mapper.transform(self.__test_categorical),
        columns=self.__test_categorical.columns)
    # Numeric NaNs become a -999 sentinel.
    self.__train_numeric = self.__train_numeric.fillna(-999)
    self.__test_numeric = self.__test_numeric.fillna(-999)
    # NOTE(review): unlike the sibling variant of this method, the encoded
    # frames keep a fresh RangeIndex while the numeric frames keep their
    # original index; this concat aligns on index and can misalign or
    # NaN-pad when the original index is not 0..n-1 — confirm.
    self.__train_feature = pd.concat([self.__train_numeric,
                                      self.__train_categorical], axis=1)
    self.__test_feature = pd.concat([self.__test_numeric,
                                     self.__test_categorical], axis=1)
    self.__train_feature = self.__train_feature.values
    self.__test_feature = self.__test_feature.values
class MyMapper():
    """Sklearn-style wrapper: min-max scales float columns and binarizes
    every other column via a DataFrameMapper built at fit time."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Inspect X's dtypes, build the DataFrameMapper and fit it."""
        self.ncols = []
        self.scols = []
        for col in X:
            # Float columns are numeric; everything else is categorical.
            bucket = self.ncols if X[col].dtype == float else self.scols
            bucket.append([col])
        nfeats = gen_features(
            columns=self.ncols,
            classes=[{'class': sklearn.preprocessing.MinMaxScaler}])
        sfeats = gen_features(columns=self.scols,
                              classes=[{'class': LabelBinarizer2}])
        self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        self.mapper.fit(X)
        return self

    def transform(self, X, y=None):
        """Apply the fitted mapper to a defensive copy of X."""
        return self.mapper.transform(X.copy())

    def fit_transform(self, X, y=None):
        # fit() returns self, so the two steps chain naturally.
        return self.fit(X).transform(X)
def type_map(data_df, cat_vars, contin_vars, cache_dir, use_cache=True):
    """Fit (or load cached) LabelEncoder / StandardScaler mappers for the
    categorical and continuous variables.

    Returns (cat_map_fit, contin_map_fit, contin_cols).
    """
    if use_cache:
        # BUG FIX: the original opened these files without ever closing
        # them; context managers release the handles deterministically.
        with open('{}/cat_maps.pickle'.format(cache_dir), 'rb') as fh:
            cat_map_fit = pickle.load(fh)
        with open('{}/contin_maps.pickle'.format(cache_dir), 'rb') as fh:
            contin_map_fit = pickle.load(fh)
    else:
        cat_maps = [(o, LabelEncoder()) for o in cat_vars]
        contin_maps = [([o], StandardScaler()) for o in contin_vars]
        cat_mapper = DataFrameMapper(cat_maps)
        cat_map_fit = cat_mapper.fit(data_df)
        contin_mapper = DataFrameMapper(contin_maps)
        contin_map_fit = contin_mapper.fit(data_df)
        with open('{}/contin_maps.pickle'.format(cache_dir), 'wb') as fh:
            pickle.dump(contin_map_fit, fh)
        with open('{}/cat_maps.pickle'.format(cache_dir), 'wb') as fh:
            pickle.dump(cat_map_fit, fh)
    cat_cols = len(cat_map_fit.features)
    contin_cols = len(contin_map_fit.features)
    logger.info("cat_cols: {}, contin_cols: {}".format(cat_cols, contin_cols))
    logger.info("cat_map_fit.features: {}".format(
        [len(o[1].classes_) for o in cat_map_fit.features]))
    return cat_map_fit, contin_map_fit, contin_cols
def test_fit_transform_equiv_pca(complex_dataframe):
    """fit_transform and fit().transform() must agree when the wrapped
    transformer (PCA) defines its own fit_transform.  Equivalence inside
    the transformer itself is covered by the sklearn PCA test suite."""
    mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], df_out=True)
    combined = mapper.fit_transform(complex_dataframe)
    separate = mapper.fit(complex_dataframe).transform(complex_dataframe)
    assert np.allclose(combined, separate)
def test_fit_transform_equiv_pca(complex_dataframe):
    """Both mapper code paths agree for a transformer with a custom
    fit_transform (PCA); the transformer-internal equivalence itself is
    asserted by scikit-learn's own PCA tests."""
    frame = complex_dataframe
    mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], df_out=True)
    one_shot = mapper.fit_transform(frame)
    mapper.fit(frame)
    two_step = mapper.transform(frame)
    assert np.allclose(one_shot, two_step)
def test_fit_with_required_y_arg(complex_dataframe):
    """Transformers that require ``y`` in fit (SelectKBest) must work for
    fit, fit_transform and transform alike."""
    X = complex_dataframe[['feat1', 'feat2']]
    y = complex_dataframe['target']
    expected = complex_dataframe[['feat1']].values
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])
    mapper.fit(X, y)  # must not raise
    assert_array_equal(mapper.fit_transform(X, y), expected)
    assert_array_equal(mapper.transform(X), expected)
def test_fit_with_required_y_arg(complex_dataframe):
    """A y-requiring transformer (SelectKBest) is supported end to end."""
    frame = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])
    # Plain fit must accept the target without failing.
    mapper.fit(frame[['feat1', 'feat2']], frame['target'])
    # fit_transform keeps only the single best feature (feat1).
    selected = mapper.fit_transform(frame[['feat1', 'feat2']], frame['target'])
    assert_array_equal(selected, frame[['feat1']].values)
    # transform on fresh data applies the same selection.
    assert_array_equal(mapper.transform(frame[['feat1', 'feat2']]),
                       frame[['feat1']].values)
def test_make_column_selector(iris_dataframe):
    """make_column_selector works inside DataFrameMapper, its output is
    aliased, and the fitted mapper survives pickling."""
    t = DataFrameMapper([
        (make_column_selector(dtype_include=float), None, {'alias': 'x'}),
        ('sepal length (cm)', None),
    ], df_out=True, default=False)
    xt = t.fit(iris_dataframe).transform(iris_dataframe)
    assert list(xt.columns) == ['x_0', 'x_1', 'x_2', 'x_3',
                                'sepal length (cm)']
    clone = pickle.loads(pickle.dumps(t))
    assert np.array_equal(xt.values, clone.transform(iris_dataframe).values)
def Standard_(self, dfa, scale='S'):
    """Fit a DataFrameMapper that applies the scaler chain named by
    *scale* (one code letter per step) to every column of *dfa* and
    return the fitted mapper."""
    scaler_by_code = {
        'S': StandardScaler(),
        'R': RobustScaler(quantile_range=tuple(self.arg.QuantileRange)),
        'M': MinMaxScaler(),
        'MA': MaxAbsScaler(),
        'OE': OrdinalEncoder(),
        'OH': OneHotEncoder(),
        'NL': Normalizer(),
        'QT': QuantileTransformer(),
        'PT': PowerTransformer(),
        'N': FunctionTransformer(validate=False),
    }
    # One transformer per code letter, applied to all columns in order.
    pipeline = [scaler_by_code[code] for code in scale]
    all_columns = list(dfa.columns)
    clfit = DataFrameMapper([(all_columns, pipeline)]).fit(dfa)
    self.log.CIF('Standardization Pocessing'.center(45, '-'))
    self.log.NIF('Scale paramaters:\n%s' % clfit)
    self.log.CIF(45 * '-')
    return clfit
def create_mapper(df, cat_vars=None, cont_vars=None, date_vars=None,
                  no_transform_vars=None, response_vars=None):
    """Build, fit and initialize a DataFrameMapper for the given variable
    groups and return it.

    BUG FIXES vs the original:
      - mutable default arguments (``list()``) were shared across calls
        and mutated via ``cat_vars.extend(...)``; defaults are now None.
      - ``len(filter(...))`` fails on Python 3 (filter is lazy); replaced
        with a list comprehension.
      - ``df.sample(1000)`` crashed for frames with fewer than 1000 rows;
        the sample size is now clamped.
    """
    logging.info('Creating mapper')
    # TODO Add support for datetime variables
    cat_vars = list() if cat_vars is None else cat_vars
    cont_vars = list() if cont_vars is None else cont_vars
    date_vars = list() if date_vars is None else date_vars
    no_transform_vars = list() if no_transform_vars is None else no_transform_vars
    response_vars = list() if response_vars is None else response_vars
    # Reference variables
    transformation_list = list()
    # Copy df, to avoid 'inplace' transformation
    df = df.copy(deep=True)
    # TODO Check if any df variables are not listed in cat_vars or cont_vars. If so, remove them.
    # A column listed as both categorical and continuous is ambiguous.
    intersection = [var for var in cont_vars if var in cat_vars]
    if len(intersection) > 0:
        raise AssertionError('Columns appear in both cat_vars and cont_vars: {}'.format(intersection))
    # Convert continuous variables to float32
    for cont_var in cont_vars + response_vars:
        logging.debug('Converting cont_var data type: {}'.format(cont_var))
        df[cont_var] = df[cont_var].astype(numpy.float32)
    for date_var in date_vars:
        logging.info('Enriching for datetime var: {}'.format(date_var))
        df, date_cat_vars, date_cont_vars = add_datetime_vars(df, date_var)
        cat_vars.extend(date_cat_vars)
        cont_vars.extend(date_cont_vars)
    # Continuous / response variables: mean-impute then standardize.
    for cont_var in cont_vars + response_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(cont_var))
        transformation_list.append(
            ([cont_var], [Imputer(strategy='mean'), StandardScaler()]))
    # Categorical variables: label-encode.
    # TODO Replace LabelEncoder with CategoricalEncoder, to better handle unseen cases
    for cat_var in cat_vars:
        logging.debug('Creating transformation list for cat_var: {}'.format(cat_var))
        transformation_list.append((cat_var, [LabelEncoder()]))
    # No-transform variables: most-frequent imputation only.
    for no_transform_var in no_transform_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(no_transform_var))
        transformation_list.append(
            ([no_transform_var], [Imputer(strategy='most_frequent')]))
    # Create mapper
    logging.info('Creating mapper')
    mapper = DataFrameMapper(features=transformation_list, df_out=True)
    # Train mapper
    logging.info('Training newly created mapper')
    mapper.fit(df)
    # Throw-away transformation, to set up mapper internals.
    logging.info('Transforming data set with newly created mapper, to initialize mapper internals')
    mapper.transform(df.sample(min(1000, len(df))))
    return mapper
(["CompetitionOpen"], LabelBinarizer()), (["WillClosedTomorrow_TodayIsSat"], LabelBinarizer()), (["WillClosedTomorrow_TodayIsNotSat"], LabelBinarizer()), (["WasClosedYesterday_TodayIsMon"], LabelBinarizer()), (["WasClosedYesterday_TodayIsNotMon"], LabelBinarizer()), (["SchoolHoliday"], LabelBinarizer()), (["StateHoliday"], OneHotEncoder()), (["Year"], OneHotEncoder()), (["Month"], OneHotEncoder()), (["Tenday"], OneHotEncoder()), (["Day"], OneHotEncoder()), (["DayStr"], OneHotEncoder()), (["DayOfWeek"], OneHotEncoder()), (["WeekOfYearStr"], OneHotEncoder()), (["DayOfYearOutlier"], OneHotEncoder()), (["DayOfYearSlopeStr"], OneHotEncoder()) ], default=False) mapper_fit = mapper.fit(df_train) df_train_transform = mapper_fit.transform(df_train) df_valid_transform = mapper_fit.transform(df_valid) df_test_transform = mapper_fit.transform(df_test) x_train = df_train_transform[:, 1:] y_train = df_train_transform[:, 0] y_train = np.log1p(y_train) x_valid = df_valid_transform[:, 1:] y_valid = df_valid_transform[:, 0] y_valid = np.log1p(y_valid) x_test = df_test_transform[:, 1:] # 模型训练、验证、测试
def get_mapper(data_all):
    """Build and fit a DataFrameMapper over the resume/job-history features.

    Columns mapped to ``None`` pass through unchanged; string-valued columns
    are integer-encoded with LabelEncoder. The explicit tuple order below is
    significant: it fixes the column order of the mapper's output matrix.

    Args:
        data_all: DataFrame containing every column listed below (typically
            train and test concatenated, so the encoders see all levels).

    Returns:
        The fitted DataFrameMapper.
    """
    param_list = [
        ('id', None),
        ('major', LabelEncoder()),
        ('age', None),
        ('gender', LabelEncoder()),
        ('isenglish', None),
        ('isjunior', None),
        ('isbachelor', None),
        ('ismaster', None),
        ('isintern', None),
        ('total_previous_job', None),
        ('last_type', LabelEncoder()),
        ('last_type1', LabelEncoder()),
        ('last_department', LabelEncoder()),
        ('last_size', None),
        ('last_salary', None),
        ('last_industry', LabelEncoder()),
        ('last_position_name', LabelEncoder()),
        ('last_start_year', None),
        ('last_start_month', None),
        ('last_end_year', None),
        ('last_end_month', None),
        ('last_interval_month', None),
        ('third_type', LabelEncoder()),
        ('third_type1', LabelEncoder()),
        ('third_department', LabelEncoder()),
        ('third_size', None),
        ('third_salary', None),
        ('third_industry', LabelEncoder()),
        ('third_position_name', LabelEncoder()),
        ('third_start_year', None),
        ('third_start_month', None),
        ('third_end_year', None),
        ('third_end_month', None),
        ('third_interval_month', None),
        ('first_type', LabelEncoder()),
        ('first_type1', LabelEncoder()),
        ('first_department', LabelEncoder()),
        ('first_size', None),
        ('first_salary', None),
        ('first_industry', LabelEncoder()),
        ('first_position_name', LabelEncoder()),
        ('first_start_year', None),
        ('first_start_month', None),
        ('first_end_year', None),
        ('first_end_month', None),
        ('first_interval_month', None),
        ('last3_interval_month', None),
        ('diff_last3_salary', LabelEncoder()),
        ('diff_last3_size', LabelEncoder()),
        ('diff_last3_industry', LabelEncoder()),
        ('diff_last3_position_name', LabelEncoder()),
        ('total_interval_month', None),
        ('diff_salary', LabelEncoder()),
        ('diff_size', LabelEncoder()),
        ('diff_industry', LabelEncoder()),
        ('diff_position_name', LabelEncoder()),
        ('major_1', LabelEncoder()),
        ('last_position_name_1', LabelEncoder()),
        ('last_department_1', LabelEncoder()),
        ('third_position_name_1', LabelEncoder()),
        ('third_department_1', LabelEncoder()),
        ('first_position_name_1', LabelEncoder()),
        ('first_department_1', LabelEncoder()),
        ('major_2', LabelEncoder()),
        ('last_position_name_2', LabelEncoder()),
        ('last_department_2', LabelEncoder()),
        ('third_position_name_2', LabelEncoder()),
        ('third_department_2', LabelEncoder()),
        ('first_position_name_2', LabelEncoder()),
        ('first_department_2', LabelEncoder()),
        ('start_working_age', None),
        ('rev_working_age', None),
        ('pre_working_month', None),
        ('pre_interval_month', None),
        ("pre_largest_size", None),
        ("pre_largest_salary", None),
        ("pre_least_size", None),
        ("pre_least_salary", None),
        ("pre_size1", None),
        ("pre_size2", None),
        ("pre_size3", None),
        ("pre_size4", None),
        ("pre_size5", None),
        ("pre_size6", None),
        ("pre_size7", None),
        ("pre_salary1", None),
        ("pre_salary2", None),
        ("pre_salary3", None),
        ("pre_salary4", None),
        ("pre_salary5", None),
        ("pre_salary6", None),
        ("pre_salary7", None),
        ("promotion_size", None),
        ("promotion_salary", None),
        ("decrease_size", None),
        # NOTE(review): "decrease_salar" looks like a typo for
        # "decrease_salary", but it must match the actual column name in
        # data_all — verify upstream before renaming.
        ("decrease_salar", None)
    ]
    # Parenthesized single-argument print works under both Python 2 and
    # Python 3; the original bare "print" statement is a SyntaxError on 3.
    print("the mapper's param list is %s" % len(param_list))
    mapper = DataFrameMapper(param_list)
    mapper.fit(data_all)
    return mapper
def __load_coupons(self, validation_timedelta):
    """Load coupon CSVs, split off a validation window, drop outliers, and vectorize.

    Populates ``self.{train,valid,test}_coupon_df`` and the corresponding
    ``*_vec`` feature matrices.

    Args:
        validation_timedelta: width of the validation window taken from the
            end of the training display period, or a falsy value to skip the
            validation split entirely.
    """
    train_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_train.csv"),
                                  parse_dates=["DISPFROM", "DISPEND"])
    test_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_test.csv"))
    # Missing display-start dates sort to the front via a sentinel.
    train_coupon_df["DISPFROM"].fillna(pd.Timestamp("19000101"), inplace=True)
    # DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values
    # is the supported equivalent.
    train_coupon_df = train_coupon_df.sort_values(by=["DISPFROM"]).reset_index(drop=True)
    if validation_timedelta:
        # Validation set = coupons whose display starts in the final window.
        max_date = train_coupon_df["DISPFROM"].max()
        valid_start = max_date - validation_timedelta
        valid_coupon_df = train_coupon_df[(train_coupon_df["DISPFROM"] > valid_start)]
        train_coupon_df = train_coupon_df[~ (train_coupon_df["DISPFROM"] > valid_start)]
    else:
        # Empty frame with the same columns. np.bool was removed in
        # numpy 1.24; the builtin bool is the correct dtype here.
        valid_coupon_df = train_coupon_df[np.zeros(len(train_coupon_df), dtype=bool)].copy()

    # remove outlier data from the validation-set
    if len(valid_coupon_df) > 0:
        very_low_price = valid_coupon_df[valid_coupon_df.DISCOUNT_PRICE <= 100].COUPON_ID_hash
        very_long_time_display = valid_coupon_df[valid_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
        valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_long_time_display)]
        valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_low_price)].reset_index(drop=True)

    # remove outlier data from the training-set
    very_long_time_display = train_coupon_df[train_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
    train_coupon_df = train_coupon_df[~train_coupon_df.COUPON_ID_hash.isin(very_long_time_display)].reset_index(drop=True)

    # coupon features
    coupon_mapper = DataFrameMapper([
        ('CATEGORY_NAME', LabelBinarizer()),
        ('PRICE_RATE', None),
        ('CATALOG_PRICE_LOG', None),
        ('DISCOUNT_PRICE_LOG', None),
        ('REDUCE_PRICE_LOG', None),
        ('DISPPERIOD_C', LabelBinarizer()),
        ('VALIDPERIOD_NA', LabelBinarizer()),
        ('USABLE_DATE_SUM', None),
        ('LARGE_AREA_NAME', LabelBinarizer()),
        ('PREF_NAME', LabelBinarizer()),
        ('SMALL_AREA_NAME', LabelBinarizer()),
    ])
    # Derive the columns referenced by the mapper on every split in place.
    self.__coupon_preproc(train_coupon_df)
    self.__coupon_preproc(valid_coupon_df)
    self.__coupon_preproc(test_coupon_df)
    # Fit on all splits concatenated so LabelBinarizer sees every level.
    coupon_mapper.fit(pd.concat([train_coupon_df, valid_coupon_df, test_coupon_df]))
    train_coupon_vec = coupon_mapper.transform(train_coupon_df.copy())
    if len(valid_coupon_df) > 0:
        valid_coupon_vec = coupon_mapper.transform(valid_coupon_df.copy())
    else:
        valid_coupon_vec = np.array([])
    test_coupon_vec = coupon_mapper.transform(test_coupon_df.copy())
    self.train_coupon_vec = train_coupon_vec
    self.valid_coupon_vec = valid_coupon_vec
    self.test_coupon_vec = test_coupon_vec
    self.train_coupon_df = train_coupon_df
    self.valid_coupon_df = valid_coupon_df
    self.test_coupon_df = test_coupon_df
class Dataprocess(object):
    """Feature pipeline for the Kobe Bryant shot-selection dataset.

    Reads the raw CSV, engineers time/location/opponent features, fits
    DataFrameMappers for X and y, and splits rows into train (labelled)
    and test (shot_made_flag is NaN).
    """

    # Name of the raw CSV inside datadir.
    datafile = "data.csv"

    def __init__(self, datadir="/Users/shintaro/work/kaggle-kobe/data/"):
        self.datadir = datadir

    def read(self):
        """Load the raw CSV; keep a pristine copy in df_orig."""
        self.df_orig = pd.read_csv(self.datadir + self.datafile)
        self.df = self.df_orig.copy()

    def process(self):
        """Run the full pipeline; return (train_X, train_y, test_X)."""
        self.read()
        self.preproc()
        self.set_mapper()
        self.split_df()
        train_X = self.vec_X(self.train_df)
        train_y = self.vec_y(self.train_df)
        test_X = self.mapper_X.transform(self.test_df)
        return train_X, train_y, test_X

    def preproc(self):
        """Engineer derived features on self.df in place."""
        df = self.df  # alias: all assignments below mutate self.df

        # Clock-derived features.
        df["time_remaining"] = df["minutes_remaining"] * 60 + df["seconds_remaining"]
        df['last_5_sec'] = df['time_remaining'] < 5
        df['latter_half'] = df['time_remaining'] < 360
        df['first_period'] = df['period'] == 1
        df['latter_period'] = df['period'] > 2
        df['last_period'] = df['period'] == 4
        df['last_quarter'] = df['time_remaining'] < 180
        threshold = 3
        anomaly = 14
        # Vectorized equivalent of the original row-wise apply: buzzer shots
        # plus the single anomalous time_remaining == 14 case.
        df['last_moment'] = (df['time_remaining'] < threshold) | (df['time_remaining'] == anomaly)
        df['away'] = df.matchup.str.contains('@')
        df['secondsFromStart'] = 60 * (11 - df['minutes_remaining']) + (60 - df['seconds_remaining'])
        # Regulation periods are 12 min; overtime periods (period > 4) 5 min.
        df['secondsFromGameStart'] = (
            (df['period'] <= 4).astype(int) * (df['period'] - 1) * 12 * 60
            + (df['period'] > 4).astype(int) * ((df['period'] - 4) * 5 * 60 + 3 * 12 * 60)
            + df['secondsFromStart'])

        # Cluster shot locations. sklearn 0.20 removed mixture.GMM;
        # GaussianMixture is its replacement (the old params/init_params
        # arguments no longer exist).
        numGaussians = 13
        gaussianMixtureModel = mixture.GaussianMixture(
            n_components=numGaussians, covariance_type='full',
            random_state=1, n_init=3)
        # .ix was removed in pandas 1.0; .loc is the label-based equivalent.
        locations = df.loc[:, ['loc_x', 'loc_y']]
        gaussianMixtureModel.fit(locations)
        df['shotLocationCluster'] = gaussianMixtureModel.predict(locations)
        df['homeGame'] = df['matchup'].apply(lambda x: 1 if (x.find('@') < 0) else 0)

        # Vectorized date parts (replaces per-row string slicing loops).
        # assumes game_date is 'YYYY-MM-DD' strings — TODO confirm.
        df["game_year"] = df["game_date"].str[:4].astype(int)
        df["game_month"] = df["game_date"].str[5:7].astype(int)
        df["game_day"] = df["game_date"].str[-2:].astype(int)

        # Integer codes per category: O(n) dict lookups instead of the
        # original O(n^2) list.index scans. The (arbitrary) set order
        # matches what list(set(...)) would produce in the same run.
        for col in ["action_type", "combined_shot_type", "opponent", "game_id"]:
            values = list(set(df[col].tolist()))
            codes = dict(zip(values, range(len(values))))
            df[col + "_num"] = df[col].map(codes)

        # Seasons are coded in sorted (chronological) order.
        season_list = list(set(df["season"].tolist()))
        season_list.sort()
        df["season_num"] = df["season"].map(dict(zip(season_list, range(len(season_list)))))

        # Clip extreme distances; .loc mask assignment avoids the original
        # chained assignment (SettingWithCopy warning / silent no-op risk).
        df.loc[df["shot_distance"] > 45, "shot_distance"] = 45
        # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"]
        # return self.df

    def set_mapper(self):
        """Fit the X and y DataFrameMappers on the full (train+test) frame."""
        self.mapper_X = DataFrameMapper([
            (u'action_type', LabelBinarizer()),
            (u'combined_shot_type', LabelBinarizer()),
            (u'loc_x', None),
            (u'loc_y', None),
            (u'minutes_remaining', None),
            (u'period', LabelBinarizer()),
            (u'playoffs', LabelBinarizer()),
            (u'season', LabelBinarizer()),
            (u'seconds_remaining', None),
            (u'shot_distance', None),
            (u'shot_type', LabelBinarizer()),
            (u'shot_zone_area', LabelBinarizer()),
            (u'shot_zone_basic', LabelBinarizer()),
            (u'shot_zone_range', LabelBinarizer()),
            (u'matchup', LabelBinarizer()),
            (u'shot_id', None),
            (u'season_num', None),
            (u'game_year', None),
            (u'game_month', None),
            (u'game_day', None),
            (u'first_period', LabelBinarizer()),
            (u'latter_period', LabelBinarizer()),
            (u'last_period', LabelBinarizer()),
            (u'last_quarter', LabelBinarizer()),
            (u'time_remaining', None),
            (u'latter_half', LabelBinarizer()),
            (u'last_5_sec', LabelBinarizer()),
            (u'opponent_num', LabelBinarizer()),
            (u'game_id_num', LabelBinarizer()),
            (u'last_moment', LabelBinarizer()),
            (u'away', LabelBinarizer()),
            (u'secondsFromStart', None),
            (u'secondsFromGameStart', None),
            (u'shotLocationCluster', LabelBinarizer()),
            (u'homeGame', LabelBinarizer()),
        ])
        self.mapper_y = DataFrameMapper([(u'shot_made_flag', None), ])
        self.mapper_X.fit(self.df)
        self.mapper_y.fit(self.df)

    def split_df(self):
        """Split into labelled train rows and unlabelled (NaN flag) test rows."""
        self.train_df = self.df[~np.isnan(self.df["shot_made_flag"])]
        self.test_df = self.df[np.isnan(self.df["shot_made_flag"])]

    def vec_X(self, df):
        """Vectorize features for the given frame via the fitted X mapper."""
        return self.mapper_X.transform(df.copy())

    def vec_y(self, df):
        """Vectorize the target for the given frame via the fitted y mapper."""
        return self.mapper_y.transform(df.copy())
# NOTE(review): tail of a script — `train`, `test`, `predict`, `y`,
# `feasible_columns` and `columns_to_drop` are defined earlier, outside
# this chunk.
# Target for the test split: gap counts, missing values treated as 0.
y_test=test.Gap_cnt.fillna(0)
# Feature matrices: keep the feasible columns, drop the excluded ones,
# zero-fill missing values.
x=train[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_test=test[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_result=predict[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
# Keep only the prediction rows whose time slot is in the listed set.
x_result=x_result.loc[[i in [46,58,70,82,94,106,118,130,142] for i in x_result.time_id],:]
x_final=x_result.reset_index(drop=True)
a=x.columns
mapper=[]
for j in a:
    # Identifier/calendar columns pass through untouched; all remaining
    # numeric features are standardized.
    if j in ['district_id', 'Day', 'Weekday', 'Workday', 'Yesterday_Workday','Twoday_ago_Workday', 'time_id']:
        mapper.append((j,None))
    else:
        mapper.append((j,StandardScaler()))
b=DataFrameMapper(mapper)
# Fit the scaler on train+test+predict so all splits share one scaling.
b.fit(pd.concat([x, x_test, x_result]))
x=b.transform(x)
x_test=b.transform(x_test)
x_result_before = x_result  # keep the unscaled frame for later reference
x_result=b.transform(x_result)
#Random Forest
clf = ensemble.RandomForestClassifier(n_estimators=20,max_features=min(len(feasible_columns) - len(columns_to_drop), 25))
clf.fit(x,y)
clf_predict=clf.predict(x_test)
clf_score=clf.score(x_test, y_test)  # score computed before predictions are clobbered below
# NOTE(review): fill(1) overwrites every prediction with 1 in place, so
# `diff` compares a constant-1 baseline (not the model) against y_test —
# confirm this is intentional.
clf_predict.fill(1)
diff=clf_predict-y_test