def test_list_transformers_single_arg(simple_dataframe):
    """
    A transformer list may mix transformers that accept only X with ones
    accepting (X, y); fitting must still succeed.
    """
    features = [("a", [MockXTransformer()])]
    single_arg_mapper = DataFrameMapper(features)
    # must not raise
    single_arg_mapper.fit_transform(simple_dataframe)
def test_mapper(self):
    """The fitted CategoricalDomain records the sorted unique values of X."""
    domain = CategoricalDomain()
    frame = DataFrame([{"X" : "2", "y" : 2}, {"X" : "1"}, {"X" : "3"}])
    features = [
        ("X", [domain, LabelBinarizer()]),
        ("y", None),
    ]
    DataFrameMapper(features).fit_transform(frame)
    self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
def test_transformed_names_simple(simple_dataframe):
    """
    After a trivial (identity) mapping, `transformed_names_` holds the
    original column name.
    """
    identity_mapper = DataFrameMapper([('a', None)])
    identity_mapper.fit_transform(simple_dataframe)
    assert ['a'] == identity_mapper.transformed_names_
def test_default_none_names():
    """
    With default=None, pass-through columns keep their original names.
    """
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    passthrough_mapper = DataFrameMapper([], default=None)
    passthrough_mapper.fit_transform(frame)
    assert ['a', 'b'] == passthrough_mapper.transformed_names_
def test_transformed_names_simple_alias(simple_dataframe):
    """
    An alias given for a single-output column becomes the reported
    transformed name.
    """
    aliased_mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})])
    aliased_mapper.fit_transform(simple_dataframe)
    assert ['new_name'] == aliased_mapper.transformed_names_
def test_transformed_names_complex_alias(complex_dataframe):
    """
    An alias for a multi-output column is used as the prefix for every
    generated output name.
    """
    binarizing_mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})])
    binarizing_mapper.fit_transform(complex_dataframe)
    assert ['new_a', 'new_b', 'new_c'] == binarizing_mapper.transformed_names_
def test_transformed_names_binarizer(complex_dataframe):
    """
    A transformer that expands one column into several yields one
    transformed name per output column.
    """
    expanding_mapper = DataFrameMapper([('target', LabelBinarizer())])
    expanding_mapper.fit_transform(complex_dataframe)
    assert ['target_a', 'target_b', 'target_c'] == expanding_mapper.transformed_names_
def test_mapper(self):
    """The fitted ContinuousDomain records per-column minima and maxima."""
    domain = ContinuousDomain()
    rows = [
        {"X1" : 2.0, "X2" : 2, "y" : 2.0},
        {"X1" : 1.0, "X2" : 0.5},
        {"X1" : 3.0, "X2" : 3.5},
    ]
    frame = DataFrame(rows)
    features = [
        (["X1", "X2"], [domain, StandardScaler()]),
        ("y", None),
    ]
    DataFrameMapper(features).fit_transform(frame)
    self.assertEqual(numpy.array([1.0, 0.5]).tolist(), domain.data_min_.tolist())
    self.assertEqual(numpy.array([3.0, 3.5]).tolist(), domain.data_max_.tolist())
def test_fit_transform(simple_dataframe):
    """
    A transformer's own fit_transform method (when defined) is what the
    mapper invokes.
    """
    fake = Mock()
    # something of measurable length that does no real work
    fake.fit_transform.return_value = np.array([1, 2, 3])
    DataFrameMapper([("a", fake)]).fit_transform(simple_dataframe)
    assert fake.fit_transform.called
def test_transformed_names_transformers_list(complex_dataframe):
    """
    With a transformer list, names are derived by walking the list from
    the last transformer backwards.
    """
    chained_mapper = DataFrameMapper([
        ('target', [LabelBinarizer(), MockXTransformer()]),
    ])
    chained_mapper.fit_transform(complex_dataframe)
    assert ['target_a', 'target_b', 'target_c'] == chained_mapper.transformed_names_
def test_cols_string_array(simple_dataframe):
    """
    Selecting a column by a plain string hands the transformer a 1-d
    array.
    """
    fake = Mock()
    fake.transform.return_value = np.array([1, 2, 3])  # no-op output
    DataFrameMapper([("a", fake)]).fit_transform(simple_dataframe)
    fit_args, _ = fake.fit.call_args
    assert (3,) == fit_args[0].shape
def test_cols_list_column_vector(simple_dataframe):
    """
    Selecting a column via a one-element list hands the transformer a
    column vector (n, 1) rather than a flat array.
    """
    fake = Mock()
    fake.transform.return_value = np.array([1, 2, 3])  # no-op output
    DataFrameMapper([(["a"], fake)]).fit_transform(simple_dataframe)
    fit_args, _ = fake.fit.call_args
    assert (3, 1) == fit_args[0].shape
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True, nFold=10):
    """
    Run stratified K-fold cross-validation, scoring each classifier on
    each fold.

    :param df: dataframe with a dict-valued 'features' column and an
        `expected_class` column used as the target
    :param clfs: iterable of classifiers to evaluate
    :param preprocess_scaling: forwarded to `preprocess` to control scaling
    :param nFold: number of stratified folds
    :return: (scores, classification_results), each a flat list with one
        entry per (fold, classifier) pair
    """
    # Vectorize the dict-valued 'features' column into a feature matrix.
    to_sklearn_features = DataFrameMapper([('features', sklearn.feature_extraction.DictVectorizer())])
    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class
    # NOTE(review): cross_validation.StratifiedKFold(y, n_folds=...) is the
    # pre-0.18 scikit-learn API -- confirm the pinned sklearn version.
    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train{}, Len test{}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test, Y_train, Y_test)
        # correlation-based filtering is deliberately disabled here
        cross_valid_data = preprocess(cross_valid_data, preprocess_scaling=preprocess_scaling, preprocess_correlation=False)
        for clf in clfs:
            score, classification = generate_score(clf, cross_valid_data, fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
def preprocess_train(train):
    """
    Turn the raw training frame into a model-ready feature matrix.

    Returns (train_X, train_y, train_y1, train_y2, mapper): the encoded
    features, the total/casual/registered count targets, and the fitted
    mapper for reuse at prediction time.
    """
    train_y = train['count']
    train_y1 = train['casual']
    train_y2 = train['registered']
    preprocess_data(train)
    # column order matters: it fixes the output column order of the mapper
    ordered_columns = ['hour', 'season', 'holiday', 'workingday', 'weather',
                       'temp', 'atemp', 'humidity', 'windspeed', 'weekday',
                       'is_sunday', 'bad_weather', 'year']
    binarized = {'season', 'weather'}
    feature_spec = [
        (name, preprocessing.LabelBinarizer() if name in binarized else None)
        for name in ordered_columns
    ]
    mapper = DataFrameMapper(feature_spec)
    train_X = mapper.fit_transform(train)
    return train_X, train_y, train_y1, train_y2, mapper
class Transformer(object):
    """
    Turn a cleaned dataframe into numpy-array form ready for training.

    Column transformations are derived from the configuration (via
    DataFrameMapCreator); for supervised problems the target column is
    appended as the last column of the returned arrays.
    """
    def __init__(self, config):
        # project configuration driving the mapping and the train/test split
        self.__config = config
        # fitted DataFrameMapper; created lazily in prepare()
        self.__mapper = None
        # encodes categorical targets, tolerating missing values
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test
        and returns the train and test as numpy arrays. If the problem is
        supervised, the target column will be the last one of the returned
        arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        # Fit on train only and re-apply to test, so no test information
        # leaks into the fitted mapper.
        model = self.__config.get_data_model()
        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)
        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks up the target data from the original_data and appends it as
        a column to the transformed_data (an np.array).
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()
        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))
        # promote to a column vector so it hstacks as the last column
        target = target[..., None]
        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        # Transform new data with the mapper fitted during prepare().
        return self.__mapper.transform(dataframe)
def test_simple_df(simple_dataframe):
    """
    With df_out=True the result is a pandas DataFrame of the same length
    as the input.
    """
    frame_mapper = DataFrameMapper([('a', None)], df_out=True)
    result = frame_mapper.fit_transform(simple_dataframe)
    assert type(result) == pd.DataFrame
    assert len(result["a"]) == len(simple_dataframe["a"])
def test_binarizer2_df():
    """
    A two-level LabelBinarizer collapses to one output column that keeps
    the original name.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    two_level_mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    out_cols = two_level_mapper.fit_transform(frame).columns
    assert 1 == len(out_cols)
    assert 'target' == out_cols[0]
def test_default_transformer():
    """
    If default=Transformer, non explicitly selected columns are run
    through that default transformer (here: mean imputation of NaN).
    """
    df = pd.DataFrame({'a': [1, np.nan, 3], })
    mapper = DataFrameMapper([], default=Imputer())
    transformed = mapper.fit_transform(df)
    # BUG FIX: the original asserted on `transformed[: 0]`, an *empty* row
    # slice, so `.all()` was vacuously true and the test could never fail.
    # Check the first (and only) output column instead.
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
def test_sparse_off(simple_dataframe):
    """
    sparse=False forces a dense result even when the extracted features
    are sparse.
    """
    dense_mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=False)
    matrix = dense_mapper.fit_transform(simple_dataframe)
    assert type(matrix) != sparse.csr.csr_matrix
def test_sparse_features(simple_dataframe):
    """
    If any extracted feature is sparse and sparse=True, the hstacked
    result is sparse as well.
    """
    sparse_mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=True)
    matrix = sparse_mapper.fit_transform(simple_dataframe)
    assert type(matrix) == sparse.csr.csr_matrix
def scale_X(X, dataset):
    """
    Standardize the continuous feature columns of X.

    For the 'noYelp' dataset every column is scaled; otherwise only the
    continuous inspection features are scaled and the (already 0/1)
    category-indicator columns pass through unchanged.

    :param X: feature dataframe
    :param dataset: dataset variant name; 'noYelp' selects the scale-all path
    :return: scaled copy of X with the original column names
    """
    if dataset == 'noYelp':
        X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    else:
        # use sklearn-pandas DataFrameMapper to scale only non-binary columns;
        # the order (continuous first, then indicators) fixes the output order
        continuous_cols = ['yelp_rating', 'yelp_reviews', 'risk', 'insp_badge',
                           'crime_count', '311_count', 'construction_count',
                           'avg_high_temp', 'time_diff', 'prev_crit_viol']
        binary_cols = [
            'Burgers', 'Convenience Stores', 'Sandwiches', 'Wine & Spirits',
            'adultentertainment', 'afghani', 'african', 'apartments',
            'asianfusion', 'bagels', 'bakeries', 'bangladeshi', 'bars', 'bbq',
            'beerbar', 'beergardens', 'belgian', 'brasseries',
            'breakfast_brunch', 'breweries', 'british', 'buffets', 'burgers',
            'burmese', 'cafes', 'cafeteria', 'cajun', 'catering',
            'cheesesteaks', 'chicken_wings', 'chinese', 'chocolate',
            'churches', 'cocktailbars', 'coffee', 'coffeeroasteries',
            'comfortfood', 'cookingschools', 'creperies', 'cuban', 'cupcakes',
            'danceclubs', 'delis', 'desserts', 'diners', 'discountstore',
            'divebars', 'donuts', 'drugstores', 'ethiopian', 'ethnicmarkets',
            'falafel', 'foodtrucks', 'french', 'gastropubs', 'gelato',
            'german', 'gluten_free', 'golf', 'gourmet', 'greek', 'grocery',
            'gyms', 'halal', 'healthtrainers', 'hookah_bars', 'hotdog',
            'hotdogs', 'hotels', 'icecream', 'indpak', 'irish', 'irish_pubs',
            'italian', 'japanese', 'jazzandblues', 'juicebars', 'korean',
            'landmarks', 'latin', 'lawyers', 'lebanese', 'libraries',
            'lounges', 'mediterranean', 'mexican', 'mideastern', 'mini_golf',
            'modern_european', 'musicvenues', 'newamerican', 'nonprofit',
            'pakistani', 'peruvian', 'pianobars', 'pizza',
            'publicservicesgovt', 'pubs', 'puertorican', 'restaurants',
            'salad', 'salvadoran', 'sandwiches', 'seafood', 'social_clubs',
            'soulfood', 'soup', 'southern', 'spanish', 'sports_clubs',
            'sportsbars', 'steak', 'sushi', 'tapas', 'tapasmallplates', 'tea',
            'tex-mex', 'thai', 'tobaccoshops', 'tradamerican', 'turkish',
            'vegetarian', 'venues', 'vietnamese', 'wholesale_stores',
            'wine_bars',
        ]
        mapper = DataFrameMapper(
            [([col], StandardScaler()) for col in continuous_cols]
            + [(col, None) for col in binary_cols])
        X_scaled = pd.DataFrame(mapper.fit_transform(X.copy()), columns=X.columns)
    # BUG FIX: was a Python 2 `print` statement, a syntax error on Python 3;
    # the rest of this codebase uses the print() function.
    print("\n data scaled\n")
    return X_scaled
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Mapping a multiindex dataframe with missing data back to a dataframe
    keeps the row count and per-column lengths.
    """
    frame = multiindex_dataframe_incomplete
    imputing_mapper = DataFrameMapper(
        [([col], Imputer()) for col in frame.columns], df_out=True)
    result = imputing_mapper.fit_transform(frame)
    assert len(result) == len(multiindex_dataframe_incomplete)
    for col in frame.columns:
        assert len(result[str(col)]) == len(frame[col])
def test_default_false():
    """
    default=False drops every column that is not explicitly mapped.
    """
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    selective_mapper = DataFrameMapper([('b', None)], default=False)
    result = selective_mapper.fit_transform(frame)
    assert (3, 1) == result.shape
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    fit_transform and fit().transform() agree for a transformer that has
    no custom fit_transform of its own.
    """
    frame = simple_dataframe
    plain_mapper = DataFrameMapper([('a', MockXTransformer())])
    combined = plain_mapper.fit_transform(frame)
    separate = plain_mapper.fit(frame).transform(frame)
    assert np.all(combined == separate)
def test_onehot_df():
    """
    One-hot output columns are named after the source column plus the
    level id.
    """
    frame = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    onehot_mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
    out_cols = onehot_mapper.fit_transform(frame).columns
    assert 4 == len(out_cols)
    assert 'target_0' == out_cols[0]
    assert 'target_3' == out_cols[3]
def test_default_none():
    """
    default=None appends unmapped columns, untransformed, after the
    mapped outputs.
    """
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mixed_mapper = DataFrameMapper([(['a'], OneHotEncoder())], default=None)
    result = mixed_mapper.fit_transform(frame)
    # column 'b' lands after the 3 one-hot columns of 'a'
    assert (result[:, 3] == np.array([3, 5, 7]).T).all()
def test_preserve_df_index():
    """
    With df_out=True the output dataframe keeps the input's index.
    """
    frame = pd.DataFrame({'target': [1, 2, 3]}, index=['a', 'b', 'c'])
    indexed_mapper = DataFrameMapper([('target', None)], df_out=True)
    result = indexed_mapper.fit_transform(frame)
    assert_array_equal(result.index, frame.index)
def test_customtransform_df():
    """
    When a transformer's classes_ count differs from its output width,
    column naming follows the outputs, not the classes.
    """
    frame = pd.DataFrame({'target': [6, 5, 7, 5, 4, 8, 8]})
    custom_mapper = DataFrameMapper([(['target'], CustomTransformer())], df_out=True)
    out_cols = custom_mapper.fit_transform(frame).columns
    assert 5 == len(custom_mapper.features[0][1].classes_)
    assert 1 == len(out_cols)
    assert 'target' == out_cols[0]
def test_binarizer_int_df():
    """
    LabelBinarizer on a numeric column names the outputs after the
    numeric levels.
    """
    frame = pd.DataFrame({'target': [5, 5, 6, 6, 7, 5]})
    int_mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    out_cols = int_mapper.fit_transform(frame).columns
    assert 3 == len(out_cols)
    for position, expected in enumerate(['target_5', 'target_6', 'target_7']):
        assert expected == out_cols[position]
def test_binarizer_df():
    """
    LabelBinarizer output columns are named source column + level value.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    binarizing_mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    out_cols = binarizing_mapper.fit_transform(frame).columns
    assert 3 == len(out_cols)
    for position, expected in enumerate(['target_a', 'target_b', 'target_c']):
        assert expected == out_cols[position]
def test_local_input_df_date_encoder():
    """
    A per-feature input_df=True option lets a dataframe-only transformer
    (DateEncoder) run even though the mapper default is input_df=False.
    """
    frame = pd.DataFrame({'dates': pd.date_range('2015-10-30', '2015-11-02')})
    date_mapper = DataFrameMapper(
        [('dates', DateEncoder(), {'input_df': True})], input_df=False)
    result = date_mapper.fit_transform(frame)
    expected = np.array([
        [2015, 10, 30],
        [2015, 10, 31],
        [2015, 11, 1],
        [2015, 11, 2],
    ])
    assert_array_equal(result, expected)
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers whose fit() requires a y argument (e.g. SelectKBest)
    work through fit, fit_transform and transform.
    """
    frame = complex_dataframe
    selector_mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])
    # plain fit must accept and forward the y argument
    selector_mapper.fit(frame[['feat1', 'feat2']], frame['target'])
    # fit_transform keeps only the best feature
    fitted = selector_mapper.fit_transform(frame[['feat1', 'feat2']], frame['target'])
    assert_array_equal(fitted, frame[['feat1']].values)
    # transform reuses the fitted selection
    assert_array_equal(selector_mapper.transform(frame[['feat1', 'feat2']]),
                       frame[['feat1']].values)
def preprocess_train(train):
    """
    Build the training matrix from the raw frame.

    Returns (train_X, train_y, mapper, imputer): the encoded features
    with NaNs mean-imputed, the Survived target, and the fitted
    mapper/imputer for applying the same transform at prediction time.
    """
    train_y = train.Survived.values
    feature_spec = [
        ('Pclass', preprocessing.LabelBinarizer()),
        ('Sex', preprocessing.LabelBinarizer()),
        ('Age', None),
        ('SibSp', preprocessing.Binarizer()),
        ('Parch', preprocessing.Binarizer()),
        ('Embarked', preprocessing.LabelBinarizer()),
        ('Fare', None),
    ]
    mapper = DataFrameMapper(feature_spec)
    train_X = mapper.fit_transform(train)
    imputer = preprocessing.Imputer(strategy='mean')
    train_X = imputer.fit_transform(train_X)
    return train_X, train_y, mapper, imputer
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    With input_df=True, later transformers in a chain receive pandas
    objects whenever the earlier transformers emit pandas objects.
    """
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    chained_mapper = DataFrameMapper(
        [('a', [MockXTransformer(), MockTClassifier()])], input_df=True)
    result = chained_mapper.fit_transform(simple_dataframe)
    fit_args, _ = MockTClassifier().fit.call_args
    assert isinstance(fit_args[0], pd.Series)
    assert_array_equal(result, np.array([1, 2, 3]).reshape(-1, 1))
def test_list_transformers():
    """
    A list of transformers is applied sequentially to the selected
    column: here impute first, then standardize.
    """
    frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]})
    sequential_mapper = DataFrameMapper([
        (["a"], [Imputer(), StandardScaler()]),
        (["b"], StandardScaler()),
    ])
    matrix = sequential_mapper.fit_transform(frame)
    # imputation removed every NaN
    assert pd.isnull(matrix).sum() == 0
    # standardization left every column with mean 0 and std 1
    assert (abs(matrix.mean(axis=0) - 0) <= 1e-6).all()
    assert (abs(matrix.std(axis=0) - 1) <= 1e-6).all()
def testOptimizeBestModel():
    # Smoke test: hyper-parameter search on a synthetic binary problem.
    X, y = make_classification(n_samples=100, n_features=20, n_informative=2)
    # Xall = features plus a 'target' column, as optimizeBestModel expects.
    Xall = pd.concat(
        [pd.DataFrame(X), pd.DataFrame(y, columns=['target'])], axis=1)
    conti_ftr = list(range(20))  # treat all 20 columns as continuous
    datamapper = DataFrameMapper([(conti_ftr, [
        ContinuousDomain(invalid_value_treatment='as_is',
                         missing_value_treatment='as_mean'),
        Imputer()
    ])], df_out=True)
    # NOTE(review): X_ is never used afterwards, so this call only pre-fits
    # the mapper; also X is an ndarray, not a DataFrame -- confirm the
    # pinned sklearn-pandas version accepts that.
    X_ = datamapper.fit_transform(X)
    lgb = BinaryClassifier("LightGBM")
    # Gaussian-process search, 10 evaluations; the mapper is applied inside.
    bestskopt, trace = lgb.optimizeBestModel(Xall, datamapper=datamapper,
                                             target='target',
                                             search_alg="GP", n_calls=10)
def readDataset(self):
    """
    Load the train/test CSVs, min-max scale the continuous terrain
    columns, pass the binary indicator columns through unchanged, and
    store the results on self (X_train, y_train, X_test, test_index).
    """
    train_df = pd.read_csv(self.trainFile)
    test_df = pd.read_csv(self.testFile)
    # keep the test ids so a submission file can be built later
    self.test_index = test_df.Id
    train_df = train_df.astype(float)
    test_df = test_df.astype(float)
    mapper = DataFrameMapper([
        # continuous measurements: squash each into [0, 1]
        ([
            'Elevation', 'Aspect', 'Slope',
            'Horizontal_Distance_To_Hydrology',
            'Vertical_Distance_To_Hydrology',
            'Horizontal_Distance_To_Roadways', 'Hillshade_9am',
            'Hillshade_Noon', 'Hillshade_3pm',
            'Horizontal_Distance_To_Fire_Points'
        ], MinMaxScaler()),
        # one-hot indicator columns: already 0/1, leave untouched
        ([
            'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
            'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
            'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7',
            'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',
            'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',
            'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',
            'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',
            'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
            'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',
            'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',
            'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',
            'Soil_Type40'
        ], None)
    ])
    self.X_train = mapper.fit_transform(train_df)
    self.y_train = train_df.Cover_Type.values
    # reuse the train-fitted scaler on the test set (no refitting)
    self.X_test = mapper.transform(test_df)
def data_simple_imputer(data_train, numeric_feature, category_feature, numeric_strategy='mean', category_strategy='most_frequent', data_test=None):
    '''
    Simple missing-value imputation via DataFrameMapper, with separate
    strategies for numeric and categorical variables.

    data_train: training frame to transform
    numeric_feature: numeric columns to process
    category_feature: categorical columns to process
    numeric_strategy: imputation strategy for numeric columns (default: mean)
    category_strategy: imputation strategy for categorical columns
        (default: most frequent value)
    data_test: optional test frame; only transformed when provided

    return: X_train_imputed (imputed training data), miss_transfer (the
        fitted DataFrameMapper), and -- only when data_test is given --
        X_test_imputed (imputed test data)
    '''
    print('开始缺失值填充'.center(50, '='))
    # report the feature counts taken from the two lists
    print('类别特征数', len(category_feature))
    print('数值特征数', len(numeric_feature))
    # numeric and categorical columns each get their own imputation strategy
    miss_transfer = DataFrameMapper([
        (numeric_feature, [SimpleImputer(strategy=numeric_strategy)]),
        (category_feature, [SimpleImputer(strategy=category_strategy)])
    ])
    # fit on the training data, then restore the column labels lost in
    # the numpy round-trip
    X_train_imputed = miss_transfer.fit_transform(data_train[numeric_feature + category_feature])
    X_train_imputed = pd.DataFrame(X_train_imputed, columns=numeric_feature + category_feature)
    print('train_mapper完成:', X_train_imputed.shape)
    # when test data is given, apply the train-fitted transform to it too
    if data_test is not None:
        X_test_imputed = miss_transfer.transform(data_test[numeric_feature + category_feature])
        X_test_imputed = pd.DataFrame(X_test_imputed, columns=numeric_feature + category_feature)
        return X_train_imputed, miss_transfer, X_test_imputed
    return X_train_imputed, miss_transfer
def prepare_pseudobs_simu(df_train, y_train, df_test,name):
    """
    Prepare the data for training: one output row corresponds to one
    subject at one particular time point.

    # Arguments
        df_train: the entire dataset (input + survival times + event status)
        y_train: the pseudo-values computed according to the method chosen
            (columns include 's' time points and 'pseudost' pseudo-values).
        df_test: the entire dataset (input + survival times + event status)
        name: method identifier; "pseudo_discrete" selects the merge-based
            training layout, anything else the replication-based one.
    # Returns
        x_train_all: input variables + time variable, one line per subject
            per time point, standard-scaled as float32.
        y_train_all: pseudo-values computed according to the method chosen.
        x_test_all: same layout as x_train_all for the test set, scaled
            with the train-fitted scaler.
        y_test_all: survival time and event status of the test set.
        n_picktime: number of time points at which pseudo-observations
            are computed.
    """
    y_test_all = df_test[['yy','status']]
    # number of distinct evaluation time points in the pseudo-value frame
    n_picktime = int(y_train[['s']].apply(pd.Series.nunique))
    x_test = df_test.drop(['yy','status'], axis = 1)
    # replicate every test subject once per time point, then append the
    # matching time column
    x_test_all = pd.concat([x_test]*n_picktime)
    time_test = pd.DataFrame(np.repeat(np.unique(y_train[['s']]),len(x_test)))
    x_test_all.reset_index(inplace=True, drop=True)
    x_test_all = pd.concat([x_test_all, time_test], axis = 1)
    if name!= "pseudo_discrete":
        # replication layout: y_train already holds one row per subject
        # per time point, aligned with the replicated inputs
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train_all = pd.concat([x_train]*n_picktime)
        x_train_all.reset_index(inplace=True, drop=True)
        x_train_all = pd.concat([x_train_all, y_train[['s']]], axis = 1)
        y_train_all = y_train[['pseudost']]
    else:
        # merge layout: join pseudo-values onto subjects via a synthetic id
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train['id'] = np.arange(len(x_train)) + 1
        x_train = x_train.merge(y_train, left_on='id', right_on='id')
        x_train_all = x_train.drop(['id','pseudost'], axis = 1)
        y_train_all = x_train['pseudost']
    # Data normalization: rename test columns to match train, then
    # standard-scale every column with a train-fitted mapper
    col_list = list(x_train_all.columns)
    x_test_all.columns = col_list
    cols_standardize = [e for e in col_list]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize,
        df_out=True)
    x_train_all = x_mapper.fit_transform(x_train_all).astype('float32')
    x_test_all = x_mapper.transform(x_test_all).astype('float32')
    return(x_train_all, y_train_all, x_test_all, y_test_all, n_picktime)
def test_input_df_true_first_transformer(simple_dataframe, monkeypatch):
    """
    With input_df=True, the first transformer's fit and transform receive
    a pd.Series rather than an np.array.
    """
    monkeypatch.setattr(MockXTransformer, 'fit', Mock())
    monkeypatch.setattr(MockXTransformer, 'transform',
                        Mock(return_value=np.array([1, 2, 3])))
    series_mapper = DataFrameMapper([('a', MockXTransformer())], input_df=True)
    result = series_mapper.fit_transform(simple_dataframe)
    for patched in (MockXTransformer().fit, MockXTransformer().transform):
        called_with, _ = patched.call_args
        assert isinstance(called_with[0], pd.Series)
    assert_array_equal(result, np.array([1, 2, 3]).reshape(-1, 1))
def init_xgb_mapper(df, xgb_pickle=RealMasterServiceConfig.XGB_MAPPER):
    """
    Fit the XGBoost feature mapper on `df` and pickle it to `xgb_pickle`.

    Categorical listing attributes are one-hot binarized, numeric ones
    standard-scaled, and the already-encoded room-area/month columns pass
    through unchanged. Only the fitted mapper is persisted; the
    transformed data itself is discarded. Returns None.
    """
    mapper = DataFrameMapper([
        # categorical columns -> one-hot encoding
        ('A_c', sklearn.preprocessing.LabelBinarizer()),
        ('Bsmt1_out', sklearn.preprocessing.LabelBinarizer()),
        ('Community', sklearn.preprocessing.LabelBinarizer()),
        ('Gar_type', sklearn.preprocessing.LabelBinarizer()),
        ('Heating', sklearn.preprocessing.LabelBinarizer()),
        ('Pool', sklearn.preprocessing.LabelBinarizer()),
        ('Style', sklearn.preprocessing.LabelBinarizer()),
        ('Type_own1_out', sklearn.preprocessing.LabelBinarizer()),
        ('Den_fr', sklearn.preprocessing.LabelBinarizer()),
        # numeric columns -> zero mean, unit variance
        (['Dom'], sklearn.preprocessing.StandardScaler()),
        (['Taxes'], sklearn.preprocessing.StandardScaler()),
        (['Area_code'], sklearn.preprocessing.StandardScaler()),
        (['Depth'], sklearn.preprocessing.StandardScaler()),
        (['Front_ft'], sklearn.preprocessing.StandardScaler()),
        (['Bath_tot'], sklearn.preprocessing.StandardScaler()),
        (['Br'], sklearn.preprocessing.StandardScaler()),
        (['Br_plus'], sklearn.preprocessing.StandardScaler()),
        (['Park_spcs'], sklearn.preprocessing.StandardScaler()),
        (['Kit_plus'], sklearn.preprocessing.StandardScaler()),
        (['Rms'], sklearn.preprocessing.StandardScaler()),
        (['Rooms_plus'], sklearn.preprocessing.StandardScaler()),
        (['Garage'], sklearn.preprocessing.StandardScaler()),
        (['lat'], sklearn.preprocessing.StandardScaler()),
        (['lng'], sklearn.preprocessing.StandardScaler()),
        (['Lp_dol'], sklearn.preprocessing.StandardScaler()),
        # column groups that are already numeric/encoded pass through
        (constants.DISCRETE_ROOM_AREA, None),
        (constants.MONTH, None),
    ], input_df=True)
    # fit on a copy of df; the rounded result only forces the fit and is
    # deleted below
    data_temp = np.round(mapper.fit_transform(df.copy()).astype(np.double), 3)
    # replace any previously pickled mapper
    if check_local_file_exist(xgb_pickle):
        os.remove(xgb_pickle)
    with open(xgb_pickle, "wb") as f:
        pickle.dump(mapper, f)
    print("Fitting: ", type(mapper))
    del data_temp
    gc.collect()
    return
def init_mapper(df, mapper_path):
    """
    Fit a min-max-scaling DataFrameMapper on `df`, pickle it to
    `mapper_path` (replacing any previous file), and return the scaled
    data rounded to three decimals.
    """
    scaled_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng',
                      'month', 'day', 'weekday', 'time']
    mapper = DataFrameMapper(
        [([name], sklearn.preprocessing.MinMaxScaler()) for name in scaled_columns],
        df_out=True)
    data_mapper = np.round(mapper.fit_transform(df.copy()).astype(np.double), 3)
    # overwrite any previously pickled mapper
    if os.path.isfile(mapper_path):
        os.remove(mapper_path)
    with open(mapper_path, "wb") as f:
        pickle.dump(mapper, f)
    print("Fitting: ", type(mapper))
    return data_mapper
def test_preserve_df_index_rows_dropped():
    """
    When a transformer changes the row count, df_out=True falls back to a
    fresh numeric index instead of reusing the original one.
    """
    class DropLastRowTransformer(object):
        def fit(self, X):
            return self

        def transform(self, X):
            return X[:-1]

    frame = pd.DataFrame({'target': [1, 2, 3]}, index=['a', 'b', 'c'])
    dropping_mapper = DataFrameMapper([('target', DropLastRowTransformer())],
                                      df_out=True)
    result = dropping_mapper.fit_transform(frame)
    assert_array_equal(result.index, np.array([0, 1]))
def preprocess_num(dataset_input, num_cols):
    """
    Standard-scale the numeric columns of `dataset_input`, passing 'Date'
    and every other column through unchanged.

    :param dataset_input: dataframe to transform; its first non-numeric
        column is excluded from the pass-through list (presumably 'Date',
        which is mapped explicitly -- confirm against callers)
    :param num_cols: names of the numeric columns to scale
    :return: (transformedData, mapper, means_dict, std_dict) -- the
        transformed dataframe, the fitted DataFrameMapper, and the
        pre-scaling mean/std of each numeric column
    """
    numbers = num_cols
    categories = [col for col in list(dataset_input.columns) if col not in numbers][1:]
    # Save the pre-scaling statistics of the fit so the transform can be
    # audited or inverted later.
    # BUG FIX: the `.ix` indexer was deprecated in pandas 0.20 and removed
    # in 1.0; plain label-based column access is the equivalent here.
    means_dict = {col: dataset_input[col].mean() for col in numbers}
    std_dict = {col: dataset_input[col].std(ddof=0) for col in numbers}
    mapper = DataFrameMapper(
        [('Date', None)]
        + [(category, None) for category in categories]
        + [(number, preprocessing.StandardScaler()) for number in numbers],
        df_out=True)
    transformedData = mapper.fit_transform(dataset_input)
    return transformedData, mapper, means_dict, std_dict
def transform_data(self, df, runtime_label): df_features, df_labels = df, df.pop(runtime_label) # Define which features are going to be transformed to a range of 0 to 1 (continuous) nfeats = gen_features( columns=[[i] for i in list(df_features.select_dtypes(include=[float]))], classes=[sklearn.preprocessing.MinMaxScaler] ) # Define which features are going to be binarized (categorical) sfeats = gen_features( columns=list(df.select_dtypes(include=[object])), classes=[sklearn.preprocessing.LabelBinarizer] ) # Do the transformations defined above mapper = DataFrameMapper(nfeats+sfeats,df_out=True) df_features = mapper.fit_transform(df_features) return df_features, df_labels
def scale(self, features_of_type='numerical', return_series=False):
    """
    Standard-scale the features selected by `features_of_type`, writing
    the result back in place.

    :param features_of_type: Subset selection primitive (must be a known
        meta tag).
    :param return_series: When True, return the scaled columns instead of
        self.
    :return: the scaled columns when return_series is True, otherwise
        self (for chaining).
    """
    assert features_of_type in self.meta_tags
    selected = self.select(features_of_type)
    scaling_mapper = DataFrameMapper([(selected.columns, StandardScaler())])
    scaled_values = scaling_mapper.fit_transform(selected.copy())
    self.features[self.names(features_of_type)] = pd.DataFrame(
        scaled_values, index=selected.index, columns=selected.columns)
    self.metainfo()
    if return_series is True:
        return self.features[self.names(features_of_type)]
    return self
def gen_feature_specific_df(df, pivotfeature, feature, sex=None):
    """
    Binarize `feature` while keeping `pivotfeature` and 'sex' as-is,
    strip the feature-name prefix from the generated columns, and
    optionally keep only rows of one sex.
    """
    from sklearn.preprocessing import LabelBinarizer
    from sklearn_pandas import DataFrameMapper
    feature_spec = [(pivotfeature, None), (feature, LabelBinarizer()), ('sex', None)]
    result = DataFrameMapper(feature_spec, df_out=True).fit_transform(df.copy())
    prefix = feature + "_"
    result = result.rename(
        columns={col: col.replace(prefix, "") for col in result.columns})
    if sex is not None:
        result = result[result['sex'] == sex]
    return result
def fix_skewness(self, features_of_type='numerical', return_series=False):
    """
    Fit the selected features toward a normal distribution with a
    (non-standardizing) Yeo-Johnson power transform, writing the result
    back in place.

    :param features_of_type: Subset selection primitive (must be a known
        meta tag).
    :param return_series: When True, return the transformed columns.
    :return: the transformed columns when return_series is True,
        otherwise None.
    """
    assert features_of_type in self.meta_tags
    selected = self.select(features_of_type)
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=False)
    skew_mapper = DataFrameMapper([(selected.columns, yeo_johnson)])
    normed_values = skew_mapper.fit_transform(selected.copy())
    self.features[self.names(features_of_type)] = pd.DataFrame(
        normed_values, index=selected.index, columns=selected.columns)
    self.update()
    if return_series is True:
        return self.features[self.names(features_of_type)]
def preprocess_train_data(x, tr):
    """
    Impute, binarize, scale an input dataframe. Save the transformation.

    :param pandas.DataFrame x: dataframe to preprocess
    :param tuple tr: the transformation rule/code for preprocessing,
        mapping the keys 'continuous_vars', 'categorical_vars' and
        'binary_vars' to lists of column names
    :return: (xt, mapper, column_names) -- the preprocessed dataframe,
        the fitted DataFrameMapper, and the generated column names
    """
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer
    from sklearn_pandas import DataFrameMapper
    map_instructions = list()
    # continuous: catch all-NaN columns, mean-impute, then standardize
    if 'continuous_vars' in tr:
        map_instructions.extend([([v], [CatchAllNAN(), Imputer(strategy='mean'), StandardScaler()]) for v in tr['continuous_vars']])
    # categorical: dummy-encode
    if 'categorical_vars' in tr:
        # map_instructions.extend([([v], [MapToStr(), LabelBinarizer()]) for v in tr['categorical_vars']])
        map_instructions.extend([([v], ToDummiesWrapper()) for v in tr['categorical_vars']])
    # binary: most-frequent impute, then squash into [0, 1]
    if 'binary_vars' in tr:
        map_instructions.extend([([v], [CatchAllNAN(), Imputer(strategy='most_frequent'), MinMaxScaler()]) for v in tr['binary_vars']])
    mapper = DataFrameMapper(map_instructions)
    xt = mapper.fit_transform(x)
    # Rebuild readable column names: transformers exposing classes_ fan a
    # single input column out into one output column per class.
    column_names = list()
    for feature in mapper.features:
        has_classes_flag = getattr(feature[1], "classes_", None)
        original_feature_name = feature[0][0]
        if has_classes_flag is None:
            # BUG FIX: extend() on a str appends it character by character;
            # append() keeps the column name intact.
            column_names.append(original_feature_name)
        else:
            class_names = feature[1].classes_
            column_names.extend([original_feature_name+'_'+str(sub) for sub in class_names])
    xt = pd.DataFrame(xt, columns=column_names)
    return xt, mapper, column_names
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True, nFold=10):
    """
    Run stratified n-fold cross validation of every classifier in `clfs`
    over the feature dicts in `df` and collect the per-fold results.

    :param df: dataframe with a 'features' dict column and an
        'expected_class' column
    :param clfs: iterable of classifiers to evaluate
    :param preprocess_scaling: forwarded to preprocess()
    :param nFold: number of stratified folds
    :return: (scores, classification_results) accumulated over all folds
        and classifiers
    """
    vectorizer = DataFrameMapper([
        ('features', sklearn.feature_extraction.DictVectorizer())
    ])
    feature_matrix = vectorizer.fit_transform(df)
    labels = df.expected_class
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFold)
    scores = []
    classification_results = []
    for fold_idx, (train_idx, test_idx) in enumerate(folds):
        x_tr, x_te = feature_matrix[train_idx], feature_matrix[test_idx]
        y_tr, y_te = labels[train_idx], labels[test_idx]
        print("Len train{}, Len test{}".format(y_tr.size, y_te.size))
        split = Cross_validation_split(x_tr, x_te, y_tr, y_te)
        split = preprocess(split,
                           preprocess_scaling=preprocess_scaling,
                           preprocess_correlation=False)
        for clf in clfs:
            score, classification = generate_score(clf, split, fold=fold_idx)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
def _scale_data(self, data, dummy_cols):
    """
    Append '_scaled' copies of the numerical columns to `data`, then
    standard-scale those copies together with the dummy columns in place.

    Parameters
    ----------
    data : DataFrame
        DataFrame with the data.
    dummy_cols : array
        Names of the dummy columns created from the categorical columns.

    Returns
    -------
    data : DataFrame
        Input frame extended by the '_scaled' numerical copies, with both
        the dummy columns and the copies standard-scaled.
    scaled_cols : array
        Names of the scaled numerical columns.
    scaler_mapper : obj
        Fitted StandardScaler wrapped in a DataFrameMapper, so that its
        output is a dataframe instead of a numpy array.
    """
    scaled_cols = []
    for original_name in self.num_cols:
        # Duplicate each numerical column under a '<name>_scaled' alias so
        # the original values survive the scaling below.
        scaled_name = str(original_name + '_scaled')
        scaled_cols.append(scaled_name)
        data[scaled_name] = data[original_name]
    target = data[dummy_cols + scaled_cols]
    scaler_mapper = DataFrameMapper([(target.columns, StandardScaler())])
    scaled_values = scaler_mapper.fit_transform(target.copy())
    data[dummy_cols + scaled_cols] = pd.DataFrame(scaled_values,
                                                  index=target.index,
                                                  columns=target.columns)
    return data, scaled_cols, scaler_mapper
def cluster(data_frame, file_wr):
    """
    KMeans-cluster the rows of `data_frame` (timestamp column excluded,
    NaNs mean-imputed) and write one record per cluster to `file_wr`:
    the frame's first timestamp, the cluster centre's two leading
    coordinates, the 80th-percentile radius and the cluster index.
    """
    global num_clusters
    output = ""
    imputers = [
        preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=1)
    ]
    feature_cols = list(data_frame.columns.values)
    timestamp = str(data_frame['timestamp'].iloc[0])
    feature_cols.remove("timestamp")
    mapper = DataFrameMapper([(feature_cols, imputers)])
    geo_data = mapper.fit_transform(data_frame)
    predictor = KMeansCluster()
    predictor.fit(geo_data)
    # Group the points by their assigned cluster label.
    cluster_points = {
        i: geo_data[np.where(predictor.model.labels_ == i)]
        for i in range(num_clusters)
    }
    radii = []
    for i in range(num_clusters):
        output += "['" + timestamp + "','"
        centre = predictor.model.cluster_centers_[i]
        distances = [global_distance(centre, point) for point in cluster_points[i]]
        # Radius covering 80% of the cluster's points.
        radii.append(get_percentile_threshold(0.8, distances))
        output += str(centre[0]) + "','" + str(centre[1]) + "',"
        output += str(radii[i]) + ","
        output += str(i) + "],"
    file_wr.write(output)
def test_scikit_pandas(self):
    """Converting a DataFrameMapper with only a custom shape calculator
    (no registered parser) should raise a RuntimeError mentioning the
    missing parser."""
    from sklearn_pandas import DataFrameMapper
    df = pandas.DataFrame({
        'feat1': [1, 2, 3, 4, 5, 6],
        'feat2': [1.0, 2.0, 3.0, 2.0, 3.0, 4.0]
    })
    mapper = DataFrameMapper([(['feat1', 'feat2'], StandardScaler()),
                              (['feat1', 'feat2'], MinMaxScaler())])
    df2 = mapper.fit_transform(df)
    try:
        model_onnx = convert_sklearn(
            mapper, 'predictable_tsne',
            [('input', FloatTensorType([1, df.shape[1]]))],
            custom_shape_calculators={
                DataFrameMapper: dataframe_mapper_shape_calculator
            })
    except RuntimeError as e:
        assert "DataFrameMapper has no associated parser." in str(e)
    # NOTE(review): if convert_sklearn ever succeeds, no assertion runs
    # and the test passes vacuously — consider failing explicitly in the
    # success path. Confirm against the converter's intended behavior.
def DataPreprocessing():
    """
    Load the experiment spreadsheet and build the model matrix.

    Reads 'expdata.xlsx', tokenizes the email subject line, computes a
    per-subject word count and the open rate (opened / delivered), then
    maps the word count plus a TF-IDF vectorization of the subject into
    a feature matrix.

    :return: tuple (X, y, A) — feature matrix, open-rate target series
        and the augmented source dataframe.
    """
    A = pd.read_excel('expdata.xlsx', encoding='latin-1')
    # Stemmed tokens of the subject line, kept on the frame for inspection.
    A['Tokens'] = lp.stemmingTokenize(A.EmailSubject)
    # Word count of each subject line.
    A['wordcount'] = list(map(lp.countwords, A.EmailSubject))
    # Target: fraction of delivered emails that were opened.
    A['target_open_rate'] = A['Email_Opened'] / A['EmailDelivered']
    # (Removed dead code that built an unused vocabulary set.)
    # TODO: restrict the TF-IDF vocabulary to only the top-n frequent words.
    mapper = DataFrameMapper([
        ('wordcount', None),
        ('EmailSubject', TfidfVectorizer(sublinear_tf=True,
                                         strip_accents='unicode',
                                         analyzer='word',
                                         token_pattern=r'\w{1,}',
                                         stop_words='english',
                                         ngram_range=(1, 1),
                                         max_features=2000)),
    ])
    X = mapper.fit_transform(A.copy())
    y = A['target_open_rate']
    return (X, y, A)
def test_whole(self):
    """End-to-end check of the Categorical datatype: transform the
    mushroom 'cap-shape' column with its default pipeline, wire the
    generated input/output nubs into a Keras Model and compile it.
    The test passes if construction and compilation raise nothing."""
    # Create datatype
    datatype = Categorical()
    # Load observations
    observations = lib.load_mushroom()
    # Transform observations; df_out=True keeps a DataFrame so the nub
    # generators can inspect column names.
    mapper = DataFrameMapper(
        [(['cap-shape'], datatype.default_transformation_pipeline)],
        df_out=True)
    transformed_df = mapper.fit_transform(observations)
    # Create network: input nub feeds directly into the output nub.
    input_layer, input_nub = datatype.input_nub_generator(
        'cap-shape', transformed_df)
    output_nub = datatype.output_nub_generator('cap-shape', transformed_df)
    x = input_nub
    x = output_nub(x)
    model = Model(input_layer, x)
    model.compile(optimizer='adam', loss=datatype.output_suggested_loss())
def _create_apply_transformers(df):
    """
    Ordinal-encode every object-dtype column of `df` (unknown and missing
    values become NaN) while passing all remaining columns through
    unchanged.

    :param df: input dataframe
    :return: tuple (transformed dataframe, fitted DataFrameMapper)
    """
    from sklearn_pandas import DataFrameMapper, gen_features
    import category_encoders as ce
    object_columns = df.select_dtypes("object").columns.to_list()
    encoder_spec = [{
        "class": ce.OrdinalEncoder,
        "handle_unknown": "return_nan",
        "handle_missing": "return_nan",
    }]
    feature_def = gen_features(columns=object_columns, classes=encoder_spec)
    # default=None passes non-object columns through; df_out keeps the
    # DataFrame shape of the output.
    mapper = DataFrameMapper(feature_def, default=None, df_out=True)
    return mapper.fit_transform(df), mapper
def generate_multi_term(data):
    """
    Input: raw dataset (DataFrame).
    Output: DataFrame containing the pairwise interaction terms of all
    variables, with columns named like 'colA*colB'.
    """
    mapper = DataFrameMapper([(list(data.columns), [
        preprocessing.PolynomialFeatures(degree=2,
                                         interaction_only=True,
                                         include_bias=False)
    ])])
    # Generate the interaction terms.
    data_tr = mapper.fit_transform(data.copy())
    # Build readable column names: transformed_names_ holds feature
    # indices (e.g. 'x0 x1'); map each index back to its source column
    # name and join with '*'.
    names_tr = []
    for name in mapper.transformed_names_:
        names = map(int, re.findall(r'\d+', name))
        name_tr = ''
        for i in names:
            name_tr = name_tr + list(data.columns)[i] + '*'
        names_tr.append(name_tr[:-1])  # drop the trailing '*'
    return pd.DataFrame(data_tr, columns=names_tr)
def run(country):
    """
    Fetch news and sentiment for the cities of `country`, vectorize the
    accumulated DATASET and print MODEL's predictions.

    NOTE(review): the mapper is fit and MODEL.predict is run on the full
    DATASET, while `df` (rows whose src starts with CURR_DATE) is only
    printed. The commented-out X_live code suggests prediction may have
    been meant for today's rows only — confirm intent.
    """
    global MODEL
    global CURR_DATE
    # initialize
    init(country)
    cities = get_cities()
    # print(cities)
    # presumably these update DATASET as a side effect; their return
    # values are unused below — TODO confirm
    news = get_cleaned_news(cities)
    sentiments = get_sentiment(cities)
    # df = pd.DataFrame({'news': news, 'sentiment_score': sentiments})
    # src_last = DATASET['src'][-1*len(cities):]
    # print(src_last)
    # df_last = DATASET[-1*len(cities):]
    mapper = DataFrameMapper([
        ('news', TfidfVectorizer()),
        ('sentiment_score', None),
        # ('src', None)
    ])
    # today_records = DATASET.index[DATASET['src'].startswith('{}'.format(CURR_DATE))].tolist()
    # print('{}'.format(CURR_DATE))
    # Today's rows — printed for inspection only.
    df = DATASET[DATASET.src.str.startswith('{}'.format(CURR_DATE))]
    print(df)
    # print(DATASET.describe(include='all'))
    X = mapper.fit_transform(DATASET)
    # X_live = X[X.src.str.startswith('{}'.format(CURR_DATE))]
    # X_live.drop(['src'], axis=1, inplace=True)
    print(MODEL.predict(X))
def make_scaling_mapper(dumpf, scaling_mapper):
    """
    Fit a per-column StandardScaler mapper on the session features found
    in the JSON dump `dumpf` and pickle the fitted mapper to the path
    `scaling_mapper`.

    :param dumpf: path to a JSON dump readable by pandas.read_json
    :param scaling_mapper: destination path for the pickled mapper
    """
    frame = pd.read_json(dumpf)
    feature_names = [
        'goodfaith_scores_mean',
        'goodfaith_scores_var',
        'goodfaith_scores_max',
        'goodfaith_scores_min',
        'goodfaith_scores_reg_slope',
        'goodfaith_scores_reg_intercept',
        'goodfaith_scores_count',
        'goodfaith_scores_count_log',
        'goodfaith_timestamps_total_seconds',
        'goodfaith_timestamps_variance',
        'goodfaith_timestamps_min',
        'goodfaith_timestamps_max',
        'self_reverts',
        'edit_wars',
        'pages_unique_count',
        'pages_namespace_count',
        'pages_nonmain_count',
        'pages_talk_count',
        'singleton_session',
        'first_edit_ores_goodfaith',
        'first_edit_ores_damaging',
        'any_edit_ores_goodfaith',
        'any_edit_ores_damaging',
    ]
    # One independent StandardScaler per column, in the same order as the
    # original hand-written list (deduplicates 23 copy-pasted entries).
    mapper = DataFrameMapper(
        [([name], sklearn.preprocessing.StandardScaler()) for name in feature_names])
    mapper.fit_transform(frame.copy())
    # BUG FIX: the output file handle was previously never closed;
    # the with-block guarantees it is.
    with open(scaling_mapper, 'wb') as fh:
        pickle.dump(mapper, fh)
def full():
    """
    Fit an SVC on the standard-scaled 'Customers' feature to predict
    'Sales' from the Rossmann training CSV, printing the train/test
    shapes, a data sample and the held-out score.
    """
    frame = pd.read_csv("data/train.csv", dtype={'StateHoliday': str})
    feature_mapper = DataFrameMapper([
        # (['Store'], preprocessing.LabelBinarizer()),
        (['Customers'], preprocessing.StandardScaler())
    ])
    targets = np.asarray(frame['Sales'])
    features = feature_mapper.fit_transform(frame.copy())
    # Keep only the first 10k rows to reduce computation time.
    targets = targets[:10000]
    features = features[:10000]
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        features, targets, test_size=0.1, random_state=0)
    print("train size: ", X_train.shape)
    print("test size: ", X_test.shape)
    print("train data: ", X_train[:2])
    print("target data: ", y_train[:2])
    # clf = sklearn.linear_model.SGDClassifier(alpha=0.001, n_iter=20).fit(X_train, y_train)
    clf = sklearn.svm.SVC(C=1).fit(X_train, y_train)
    print(clf.score(X_test, y_test))