def preprocessing(data, target):
    global statistics_row
    print('Start preprocessing')
    data = data.drop('imports', axis='columns')
    target = target.drop('imports', axis='columns')
    labels = data['BUGFIX_count'] > 0
    test_labels = target['BUGFIX_count'] > 0
    test_loc = target['SM_file_lloc']

    print('Start Random Forest')
    predictions = random_forest(
        data.iloc[:, :data.columns.get_loc('BUGFIX_count')], labels,
        target.iloc[:, :target.columns.get_loc('BUGFIX_count')])

    lb_infinite = False
    try:
        # if no defects are predicted, a division by zero raises an exception
        lb = lower_bound(predictions, test_loc, test_labels)
    except Exception as e:
        print(e)
        # lower bound infinite
        statistics_row.append(-1)
        lb_infinite = True
        print('RF-b:', 'lb infinite')

    if not lb_infinite:
        try:
            ub = upper_bound(predictions, test_loc, test_labels)
            # scaling of the difference depends on whether it is positive or negative
            if (ub - lb) < 0:
                statistics_row.append(-(abs(ub - lb) / (abs(ub - lb) + 1000)))
            else:
                statistics_row.append((ub - lb) / ((ub - lb) + 1000))
            print(lb, '< C <', ub)
        except Exception as e:
            print(e)
            # upper bound infinite
            statistics_row.append(1)
            print('RF-b:', 'ub infinite')

    # selecting features with random forest
    features = select_features_rf(
        data.iloc[:, :data.columns.get_loc('BUGFIX_count')], labels)

    # applying the selected features to the data
    test_data = target[features]
    data = data[features]
    columns = data.columns

    try:
        # defining the function (sigmoid) used for scaling the data
        scaler = FunctionTransformer(
            func=lambda x: (np.e ** x) / ((np.e ** x) + 1), validate=True)
    except RuntimeWarning as e:
        print(e)

    # applying the sigmoid function to the data
    data = pd.DataFrame(scaler.fit_transform(data), columns=columns).fillna(0)
    test_data = pd.DataFrame(scaler.fit_transform(test_data),
                             columns=columns).fillna(0)
    labels = pd.DataFrame(labels).reset_index(drop=True)
    data = pd.concat([data, labels], axis=1)
    data, labels = knn_data_selection(data, test_data)
    return data, labels, test_data, test_labels, test_loc
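A side note on the sigmoid lambda above: `(np.e ** x) / ((np.e ** x) + 1)` is the logistic function, but this form overflows for large inputs. A minimal standalone sketch, assuming scipy is available, of the numerically stable equivalent `scipy.special.expit`:

import numpy as np
from scipy.special import expit  # expit(x) = 1 / (1 + exp(-x)), the logistic sigmoid
from sklearn.preprocessing import FunctionTransformer

# A stabler equivalent of lambda x: np.e ** x / (np.e ** x + 1)
sigmoid_scaler = FunctionTransformer(func=expit, validate=True)
print(sigmoid_scaler.fit_transform(np.array([[-1000.0], [0.0], [1000.0]])))
# [[0. ], [0.5], [1. ]] -- the naive form overflows to nan for x = 1000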
def test_function_transformer_feature_names_out_is_None():
    transformer = FunctionTransformer()
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    with pytest.raises(AttributeError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_feature_names_out_string(feature_names_out):
    transformer = FunctionTransformer(feature_names_out=feature_names_out)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = """must either be "one-to-one" or a callable"""
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()
def test_function_transformer_future_warning(validate, expected_warning):
    # FIXME: to be removed in 0.22
    X = np.random.randn(100, 10)
    transformer = FunctionTransformer(validate=validate)
    with pytest.warns(expected_warning) as results:
        transformer.fit_transform(X)
    if expected_warning is None:
        assert len(results) == 0
def test_function_transformer_get_feature_names_out_without_validation():
    transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False)
    X = np.random.rand(100, 2)
    transformer.fit_transform(X)

    msg = "When 'feature_names_out' is 'one-to-one', either"
    with pytest.raises(ValueError, match=msg):
        transformer.get_feature_names_out()

    names = transformer.get_feature_names_out(("a", "b"))
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b"))
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    if isinstance(X, dict):
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    transformer = FunctionTransformer(
        feature_names_out=feature_names_out, validate=True
    )
    transformer.fit_transform(X)
    names = transformer.get_feature_names_out(input_features)
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected)
def logarithmic_regression(input_data, cement, water, coarse_aggr, fine_aggr, days):
    variables = input_data.iloc[:, :-1]
    results = input_data.iloc[:, -1]
    n = results.shape[0]
    # reshaping the values so that variables and results have the same shape
    results = results.values.reshape(n, 1)

    # transforming the x data with a logarithmic function
    log_regression = FunctionTransformer(np.log, validate=True)
    log_variables = log_regression.fit_transform(variables)

    # making a linear model and fitting the logarithmic data to it
    regression = linear_model.LinearRegression()
    model = regression.fit(log_variables, results)

    input_values = [cement, water, coarse_aggr, fine_aggr, days]
    # transforming the input data for prediction with the same logarithmic function
    input_values = log_regression.transform([input_values])

    # predicting the outcome based on the input values
    predicted_strength = regression.predict(input_values)
    predicted_strength = round(predicted_strength[0, 0], 2)

    return "Logarithmic prediction: " + str(predicted_strength)
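A minimal usage sketch of `logarithmic_regression` on synthetic data; the five-feature column layout and the log-linear weights are illustrative assumptions, not the original concrete-strength dataset:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# columns: cement, water, coarse_aggr, fine_aggr, days; strength in the last column
X = rng.uniform(1.0, 10.0, size=(50, 5))
strength = np.log(X) @ np.array([3.0, -1.0, 0.5, 0.2, 1.5])  # log-linear target
input_data = pd.DataFrame(np.column_stack([X, strength]))
print(logarithmic_regression(input_data, 5.0, 2.0, 4.0, 3.0, 28))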
def transform(self, X):
    X_transformed = self.kbd_fitted.transform(X).astype(int)
    pre = FunctionTransformer(_unpack_bits,
                              validate=False,
                              kw_args={'nbits': self.max_bits_per_feature})
    return pre.fit_transform(X_transformed)
def gen_statistics():
    # Read weather data (used to find the list of airports)
    weather_df = pd.read_csv('../data/airport_per_date/weather.csv')
    weather_df['DateOfDeparture'] = pd.to_datetime(weather_df['Date'])
    weather_df.drop(['Date', 'Events', 'Max Gust SpeedKm/h', 'Precipitationmm'],
                    axis=1, inplace=True)

    # Initialise the list of airports and the date range for the period of
    # interest, and encode dates for an easier merge
    airport_list = weather_df['AirPort'].unique()
    airport = pd.DataFrame(airport_list, columns=['AirPort'])
    date_list = pd.date_range(start='01/01/2011', end='05/03/2013')
    date_airports = pd.DataFrame(list(product(date_list, airport_list)),
                                 columns=['DateOfDeparture', 'AirPort'])

    # Merge weather information
    merge_transform = MergeTransformer(X_ext=weather_df, how='left',
                                       on=['DateOfDeparture', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)
    date_encoder = FunctionTransformer(_encode_dates)
    date_airports = date_encoder.fit_transform(date_airports)

    # Merge LoadFactor and Passenger statistics per airport and date
    airport_statistics = pd.read_csv(
        '../data/airport_per_date/airports_statistics.csv', sep=',')
    merge_transform = MergeTransformer(X_ext=airport_statistics, how='left',
                                       on=['year', 'month', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)

    websearches = pd.read_csv('../data/airport_per_date/websearches.csv',
                              sep=';', index_col='DateOfDeparture')
    websearches = websearches.stack()
    websearches = pd.DataFrame(websearches).reset_index()
    websearches.rename({'level_1': 'AirPort', 0: 'search_intensity'},
                       axis=1, inplace=True)
    websearches['DateOfDeparture'] = pd.to_datetime(
        websearches['DateOfDeparture'], format='%d/%m/%Y')
    merge_transform = MergeTransformer(X_ext=websearches, how='left',
                                       on=['DateOfDeparture', 'AirPort'])
    date_airports = merge_transform.fit_transform(date_airports)

    date_airports.drop(['year', 'month', 'day', 'weekday', 'week', 'n_days', 'day_nb'],
                       axis=1, inplace=True)
    return date_airports
def labelize(column, t):
    if t == "string":
        raise ValueError("String valued labels are not supported")
    elif t == "categorical":
        label_map = {}
        for i, k in enumerate(set(column)):
            label_map[k] = i
        vectorizer = FunctionTransformer(lambda x: label_map[x[0, 0]])
        return vectorizer.fit_transform(column), vectorizer
    else:
        vectorizer = FunctionTransformer(tryParse)
        return vectorizer.fit_transform(column), vectorizer
def rescale_cont_vars(df, log_transform=True, dep_var=None):
    cont_vars, _ = cont_cat_split(df, dep_var=dep_var)
    if log_transform:
        log_transformer = FunctionTransformer(func=np.log1p,
                                              inverse_func=np.expm1,
                                              validate=False)
        df[cont_vars] = log_transformer.fit_transform(df[cont_vars])
    scaler = MinMaxScaler()
    df[cont_vars] = scaler.fit_transform(df[cont_vars])
    return df
def test_wrapper_func_transformer(test_func):
    """Testing if WrapperFunctionTransformer still has the functionality of an
    underlying FunctionTransformer."""
    test_arr = np.array([1, 1, 1, 2, 3, 4, 5]).reshape(-1, 1)
    tr = FunctionTransformer(func=test_func)
    wrap_tr = WrapperFunctionTransformer("test", clone(tr))

    expected_arr = tr.fit_transform(test_arr)
    actual_arr = wrap_tr.fit_transform(test_arr)

    assert np.array_equal(actual_arr, expected_arr)
    assert str(wrap_tr) != str(tr)
def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    for X in X_list:
        accept_sparse = sparse.issparse(X)
        trans = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        warning_message = (
            "The provided functions are not strictly"
            " inverse of each other. If you are sure you"
            " want to proceed regardless, set"
            " 'check_inverse=False'."
        )
        with pytest.warns(UserWarning, match=warning_message):
            trans.fit(X)

        trans = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = trans.fit_transform(X)
        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check the inverse when one of func or inverse_func
    # is not provided
    trans = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
    trans = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
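For context, a standalone sketch of the behaviour this test exercises: with `check_inverse=True` (the default), `fit` round-trips a subsample of the data through `func` and `inverse_func` and emits a `UserWarning` when the pair is not a strict inverse:

import warnings
import numpy as np
from sklearn.preprocessing import FunctionTransformer

trans = FunctionTransformer(func=np.sqrt, inverse_func=np.around,
                            check_inverse=True, validate=True)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    trans.fit(np.array([[1.0, 4.0], [9.0, 16.0]]))  # around(sqrt(4)) = 2 != 4
print(caught[0].message)  # "The provided functions are not strictly inverse..."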
def function_transformer_example(sample_df):
    # Obtain the text data: get_text_data
    get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

    # Obtain the numeric data: get_numeric_data
    get_numeric_data = FunctionTransformer(
        lambda x: x[['numeric', 'with_missing']], validate=False)

    # Fit and transform the text data: just_text_data
    just_text_data = get_text_data.fit_transform(sample_df)

    # Fit and transform the numeric data: just_numeric_data
    just_numeric_data = get_numeric_data.fit_transform(sample_df)

    # Print head to check results
    print('Text Data')
    print(just_text_data.head())
    print('\nNumeric Data')
    print(just_numeric_data.head())

    X_train, X_test, y_train, y_test = train_test_split(
        sample_df[['numeric', 'with_missing', 'text']],
        pd.get_dummies(sample_df['label']),
        random_state=22)

    # Create a FeatureUnion with nested pipelines: process_and_join_features
    # (note: Imputer was replaced by sklearn.impute.SimpleImputer in
    # scikit-learn 0.22 and later)
    process_and_join_features = FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer())
            ]))
        ])

    # Instantiate the nested pipeline: pl
    pl = Pipeline([('union', process_and_join_features),
                   ('clf', OneVsRestClassifier(LogisticRegression()))])
def test_function_transformer_feature_names_out_uses_estimator():
    def add_n_random_features(X, n):
        return np.concatenate([X, np.random.rand(len(X), n)], axis=1)

    def feature_names_out(transformer, input_features):
        n = transformer.kw_args["n"]
        return list(input_features) + [f"rnd{i}" for i in range(n)]

    transformer = FunctionTransformer(
        func=add_n_random_features,
        feature_names_out=feature_names_out,
        kw_args=dict(n=3),
        validate=True,
    )
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    transformer.fit_transform(df)
    names = transformer.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2"))
def log_scaler(x, convert_input_data=True, return_log_scaler=False):
    """
    :param x: unscaled data (numpy array)
    :param convert_input_data: if True, convert the input data to a
        2-dimensional NumPy array or sparse matrix
    :param return_log_scaler: boolean value which enables returning (or not)
        the FunctionTransformer instance
    :return: scaled data (numpy array), FunctionTransformer instance (optional)
    """
    log_transformer = FunctionTransformer(func=np.log1p,
                                          inverse_func=np.expm1,
                                          validate=convert_input_data)
    x_scaled = log_transformer.fit_transform(x)
    if return_log_scaler:
        return x_scaled, log_transformer
    return x_scaled
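A minimal usage sketch of the helper above; the input values are illustrative:

import numpy as np

x = np.array([[0.0], [9.0], [99.0]])
x_scaled, scaler = log_scaler(x, return_log_scaler=True)
print(x_scaled.ravel())                            # [0.    2.302...  4.605...]
print(scaler.inverse_transform(x_scaled).ravel())  # expm1 recovers [0. 9. 99.]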
def plot_published_games_over_years(df, lb, ub, exponential_regression=True):
    """
    Plot the overall number of published games over the years from lb to ub
    df: dataframe
    lb: yearpublished lower bound
    ub: yearpublished upper bound
    exponential_regression: a flag for whether to plot an exponential regression line
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(lb, int) and lb > 0
    assert isinstance(ub, int) and ub > 0
    assert isinstance(exponential_regression, bool)

    # Filter the dataframe on the yearpublished lower and upper bounds
    filtered_df = df.loc[(df["yearpublished"] >= lb) & (df["yearpublished"] <= ub)]

    # Configure the pyplot settings
    fig = plt.figure(figsize=(15, 10))
    sns.set(style="ticks")

    # Draw an exponential regression line
    if exponential_regression:
        transformer = FunctionTransformer(np.log, validate=True)
        counts = filtered_df.groupby("yearpublished").count()["id"]
        x = np.arange(len(counts))[:, None]
        y = counts.values[:, None]  # Series[:, None] is unsupported on recent pandas

        # Fit a linear model to the log-transformed counts (an exponential fit)
        y_trans = transformer.fit_transform(y)
        regressor = LinearRegression()
        results = regressor.fit(x, y_trans)
        model = results.predict
        y_fit = model(x)
        plt.plot(x + lb, np.exp(y_fit), "--", color="brown", linewidth=2)

    # Plot the histogram of published games
    p = sns.histplot(filtered_df["yearpublished"], discrete=True, stat="count",
                     color="orange", edgecolor="white")
    p.set_xlabel("Year", fontsize=25, weight="bold")
    p.set_ylabel("Number of games", fontsize=25, weight="bold")
    p.tick_params(labelsize=20)
    p.set_xticks(p.get_xticks()[1:-2])
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    plt.show()
def scale_data(df, p, train=True, save=True):
    if p.log_scale:
        df.loc[df["last_pend_time"] == 0, "last_pend_time"] = 1
        if train:
            log_scaler = FunctionTransformer(np.log2)
            df.loc[:, ["last_pend_time"]] = log_scaler.fit_transform(
                df[["last_pend_time"]])
            if save:
                joblib.dump(log_scaler, "log_scaler.save")
        else:
            log_scaler = joblib.load("log_scaler.save")
            df.loc[:, ["last_pend_time"]] = log_scaler.transform(
                df[["last_pend_time"]])

    scale_cols = ["last_pend_time"]
    if p.use_using_cores:
        scale_cols.append("using_cores")
    if p.use_spending_run_time:
        scale_cols.append("spending_run_time")
    if p.use_pending_jobs:
        scale_cols.append("pending_jobs")
    if p.use_last_pend_time_submit:
        scale_cols.append("last_pend_time_submit")
    if p.use_submit_time:
        scale_cols.append("sin_submit_time")
        scale_cols.append("cos_submit_time")
    if p.use_day_of_week:
        scale_cols.append("sin_day_of_week")
        scale_cols.append("cos_day_of_week")

    if train:
        min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        df.loc[:, scale_cols] = min_max_scaler.fit_transform(df[scale_cols])
        if save:
            joblib.dump(min_max_scaler, "min_max_scaler.save")
    else:
        min_max_scaler = joblib.load("min_max_scaler.save")
        df.loc[:, scale_cols] = min_max_scaler.transform(df[scale_cols])

    if p.standard_scale:
        if train:
            standard_scaler = StandardScaler()
            df.loc[:, scale_cols] = standard_scaler.fit_transform(df[scale_cols])
            if save:
                joblib.dump(standard_scaler, "standard_scaler.save")
        else:
            standard_scaler = joblib.load("standard_scaler.save")
            df.loc[:, scale_cols] = standard_scaler.transform(df[scale_cols])

    return df
def log1p(self, **params) -> pd.DataFrame:
    """
    log1p transform
    :param params:
    :return:
    """
    print('Feature columns to log-transform:')
    print(self.columns)
    transformer = FunctionTransformer(np.log1p)
    self.df[self.columns] = transformer.fit_transform(
        self.df[self.columns].values)
    return self.df
def main(params, inputs, outputs):
    # Read the input data
    x = pd.read_pickle(inputs.x)

    # Define a function that drops the first column
    def all_but_first_column(X):
        return X[:, 1:]

    # Train and transform with a FunctionTransformer
    ft = FunctionTransformer(all_but_first_column)
    x_new = ft.fit_transform(x)

    # Write the result
    x_new.to_pickle(outputs.x_new)
def impute():
    csv_data = '''A,B,C,D\n1.0,2.0,3.0,4.0\n5.0,6.0,,8.0\n10.0,11.0,12.0,'''
    df = pd.read_csv(StringIO(csv_data))
    print(df)

    # impute with the column mean
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
    imr = imr.fit(df.values)
    imputed_data = imr.transform(df.values)
    print(imputed_data)

    # impute with the row mean: transpose, impute column-wise, transpose back
    ftr_imr = FunctionTransformer(lambda X: imr.fit_transform(X.T).T,
                                  validate=False)
    imputed_data = ftr_imr.fit_transform(df.values)
    print(imputed_data)
def df_to_exponential_fit(df, colX, colY, wgt=None):
    # .values converts the column into a numpy array; -1 means the number of
    # rows is inferred, with 1 column
    X = df[colX].values.reshape(-1, 1)
    Y = df[colY].values.reshape(-1, 1)

    # Fit a line to log(Y), which amounts to an exponential fit of Y
    transformer = FunctionTransformer(np.log, validate=True)
    y_trans = transformer.fit_transform(Y)

    linear_regressor = LinearRegression()  # create the model object
    linear_regressor.fit(X, y_trans, sample_weight=wgt)  # perform linear regression
    Y_pred = linear_regressor.predict(X)  # make predictions (in log space)
    coef = float(linear_regressor.coef_)
    return Y_pred, coef
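A quick sanity check of `df_to_exponential_fit` on synthetic data (the generating constants are illustrative): for y = 2 * exp(0.5 * x), the slope fitted in log space should recover the exponent 0.5, and predictions come back in log space:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.linspace(1, 10, 50)})
df["y"] = 2.0 * np.exp(0.5 * df["x"])  # log(y) = log(2) + 0.5 * x, exactly linear
Y_pred, coef = df_to_exponential_fit(df, "x", "y")
print(round(coef, 3))              # ~0.5, the exponent of the generating model
print(np.exp(Y_pred[:3]).ravel())  # exp() maps predictions back to the original scale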
def test_function_transformer_validate_inverse():
    """Test that function transformer does not reset estimator in
    `inverse_transform`."""

    def add_constant_feature(X):
        X_one = np.ones((X.shape[0], 1))
        return np.concatenate((X, X_one), axis=1)

    def inverse_add_constant(X):
        return X[:, :-1]

    X = np.array([[1, 2], [3, 4], [3, 4]])
    trans = FunctionTransformer(
        func=add_constant_feature,
        inverse_func=inverse_add_constant,
        validate=True,
    )
    X_trans = trans.fit_transform(X)
    assert trans.n_features_in_ == X.shape[1]
    trans.inverse_transform(X_trans)
    assert trans.n_features_in_ == X.shape[1]
def clean_df(X):
    date_encoder = FunctionTransformer(_encode_dates)
    X = date_encoder.fit_transform(X)
    X.rename({
        'year': 'year_departure',
        'day': 'day_departure',
        'n_days': 'n_days_departure'
    }, axis=1, inplace=True)
    columns = [
        'DateOfDeparture', 'DateBooked', 'state_dep', 'state_arr', 'week',
        'month', 'weekday', 'holidays_dep', 'holidays_arr', 'Departure',
        'Arrival'
    ]
    X.drop(columns, axis=1, inplace=True)
    return X
def test_function_transformer_frame():
    pd = pytest.importorskip("pandas")
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer()
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, "loc")
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

# Build a toy text dataset from the iris data
iris = load_iris()
text = 'hi, my name is vishesh'
df_text = pd.DataFrame(iris.data, columns=iris.feature_names)
df_text['label'] = iris.target
label_names = ['hello', 'man', 'namely']
df_text['text'] = df_text['label'].apply(
    lambda x: 'hi my name is {}'.format(label_names[x]))

# Column selectors built from plain functions
func1 = FunctionTransformer(lambda x: x[x.columns[0]], validate=False)
func2 = FunctionTransformer(lambda x: x[x.columns[2]], validate=False)
union = FeatureUnion([('func1', make_pipeline(func1, Imputer())),
                      ('func2', make_pipeline(func2, Imputer()))])
func1.fit_transform(df_text)
func2.fit_transform(df_text)
pd.DataFrame(union.fit_transform(df_text))

trainX, testX, trainY, testY = train_test_split(
    df_text, pd.get_dummies(df_text.label), test_size=.3)

TOKEN = '\\S+(?=\\s+)'
countvec = CountVectorizer(ngram_range=(1, 3))
# non_negative was removed in scikit-learn 0.21; newer versions use
# alternate_sign=False instead
countvec_HASH = HashingVectorizer(token_pattern=TOKEN, ngram_range=(1, 3),
                                  norm=None, non_negative=True)
# A hash function takes a token as input and outputs a hash value; we can limit
# the number of these values, so each hash value may have multiple tokens
# assigned to it. Interestingly, this has little effect on model accuracy.
# Some problems are memory-bound and not easily parallelizable, and hashing
# enforces a fixed-length computation instead of using a mutable datatype
# (like a dictionary).
text_data = combine_text_columns(X_train)
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
hashed_text = hashing_vec.fit_transform(text_data)

# Pipeline for numeric and categorical variables -------------------------------
# Any step in a pipeline must be an object that implements the fit and
# transform methods. FunctionTransformer creates an object with these methods
# out of any Python function that you pass to it.
get_text_data = FunctionTransformer(lambda x: x['text_col'], validate=False)
get_numeric_data = FunctionTransformer(
    lambda x: x[['numeric_col_1', 'numeric_col_2']], validate=False)

just_text_data = get_text_data.fit_transform(df)
just_numeric_data = get_numeric_data.fit_transform(df)

# FeatureUnion joins the results of multiple pipelines together
process_and_join_features = FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', Imputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ]
)
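To make the fixed-length point concrete, a small sketch (corpus and `n_features` chosen purely for illustration): the output width is set by `n_features`, not by the vocabulary, so unseen tokens at transform time need no refit:

from sklearn.feature_extraction.text import HashingVectorizer

vec = HashingVectorizer(n_features=16)  # fixed output width, no vocabulary stored
X_small = vec.transform(["red fish", "blue fish"])
X_large = vec.transform(["a corpus with many previously unseen words"])
print(X_small.shape, X_large.shape)  # (2, 16) (1, 16) -- same width either way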
def predict_missing_price(preprocessed_data, one_hot=False):
    # NaN != NaN, so this flags the rows whose price is missing
    test_index = preprocessed_data.price != preprocessed_data.price
    feature_columns = [
        i for i in preprocessed_data.columns if i not in ['class_id', 'price']
    ]
    y_column = ['price']
    testX = preprocessed_data.loc[test_index, feature_columns].values
    trainX = preprocessed_data.loc[(1 - test_index).astype(bool),
                                   feature_columns].values
    trainY = preprocessed_data.loc[(1 - test_index).astype(bool),
                                   y_column].values
    # plt.hist(trainY)
    # plt.show()

    # The sales data is closer to a normal distribution after a log1p
    # transform, which works better than sqrt here
    # trs = FunctionTransformer(func=np.sqrt, inverse_func=np.square)
    trs = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
    scaler = MinMaxScaler()
    trainX = scaler.fit_transform(trainX)
    trainY = trs.fit_transform(np.reshape(trainY, (-1, 1)))
    # plt.hist(trainY)
    # plt.show()
    print(trainX.shape, trainY.shape)

    clf = xgb.XGBRegressor(seed=12)
    if one_hot:
        # one-hot params (square scaling)
        grid = [{
            'booster': ['gbtree'],
            'learning_rate': [0.1],
            # 'min_child_weight': [],
            'max_depth': [2],
            'gamma': [1],
            'subsample': [0.3],
            'colsample_bytree': [0.3],
            'reg_alpha': [1.0],
            'reg_lambda': [0.85],
            'scale_pos_weight': [1]
        }]
    else:
        # params without one-hot (sqrt scaling):
        # grid = [{
        #     'booster': ['gbtree'],
        #     'learning_rate': [0.1],
        #     # 'min_child_weight': [],
        #     'max_depth': [2],
        #     'gamma': [0.7],
        #     'subsample': [0.1],
        #     'colsample_bytree': [0.3],
        #     'reg_alpha': [0.5],
        #     'reg_lambda': [0.3],
        #     'scale_pos_weight': [1]
        # }]
        # params without one-hot (log1p scaling)
        grid = [{
            'booster': ['gbtree'],
            'learning_rate': [0.25],
            # 'min_child_weight': [],
            'max_depth': [2],
            'gamma': [0.09],
            'subsample': [0.1],
            'colsample_bytree': [0.95],
            'reg_alpha': [0.5],
            'reg_lambda': [0.25],
            'scale_pos_weight': [1]
        }]
    gridCV = GridSearchCV(estimator=clf,
                          param_grid=grid,
                          scoring=make_scorer(_scorer, greater_is_better=False),
                          iid=False,
                          n_jobs=-1,
                          cv=6,
                          verbose=1)
    gridCV.fit(trainX, trainY)
    print("best params:", gridCV.best_params_)
    print('best score:', gridCV.best_score_)

    testX = scaler.transform(testX)
    predY = np.reshape(gridCV.predict(testX), (-1, 1))
    preprocessed_data.loc[test_index, y_column] = trs.inverse_transform(predY)
    return preprocessed_data
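A rough illustration of the log1p-versus-sqrt remark in the code above, on synthetic right-skewed data (the distribution and the skewness values are indicative only):

import numpy as np
from scipy.stats import skew

rng = np.random.default_rng(42)
sales = rng.lognormal(mean=3.0, sigma=1.0, size=10000)  # right-skewed, like sales counts
print(round(skew(sales), 2))            # strongly skewed, e.g. ~6
print(round(skew(np.sqrt(sales)), 2))   # less skewed, but still clearly asymmetric
print(round(skew(np.log1p(sales)), 2))  # closest to 0, i.e. near-normal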
'''
You are working with numeric data that needs imputation, and text data that
needs to be converted into a bag-of-words. You'll create functions that
separate the text from the numeric variables and see how the .fit() and
.transform() methods work.

INSTRUCTIONS
100XP
- Compute the selector get_text_data by using a lambda function and
  FunctionTransformer() to obtain all 'text' columns.
- Compute the selector get_numeric_data by using a lambda function and
  FunctionTransformer() to obtain all the numeric columns (including missing
  data). These are 'numeric' and 'with_missing'.
- Fit and transform get_text_data using the .fit_transform() method with
  sample_df as the argument.
- Fit and transform get_numeric_data using the same approach as above.
'''
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']],
                                       validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# Print head to check results
print('Text Data')
print(just_text_data.head())
print('\nNumeric Data')
print(just_numeric_data.head())
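Worth noting about the selectors above: a FunctionTransformer without validation or inverse checking is stateless, so `.fit()` is effectively a no-op and `.transform()` just applies the wrapped function. A minimal sketch with a stand-in `sample_df`:

import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# Illustrative stand-in for the exercise's sample_df
sample_df = pd.DataFrame({'numeric': [1.0, 2.0],
                          'with_missing': [3.0, None],
                          'text': ['foo bar', 'baz']})

selector = FunctionTransformer(lambda x: x[['numeric', 'with_missing']],
                               validate=False)
# .fit() returns self and learns nothing here; .transform() calls the lambda
assert selector.fit(sample_df) is selector
print(selector.transform(sample_df).equals(selector.fit_transform(sample_df)))  # True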
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion  # FeatureUnion is useful to combine pipelines!

###################################################

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']],
                                       validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# # Print head to check results
# print('Text Data')
# print(just_text_data.head())
# print('\nNumeric Data')
# print(just_numeric_data.head())

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing', 'text']],
    pd.get_dummies(sample_df['label']),
    random_state=22)
class MSD(IndexableDataset):
    """Assuming the input datastream is an example of a user-item interaction
    triplet. In this class, a mel-spectrogram and a tag vector (BOW) are
    fetched based on the triplet's item index.
    """
    provides_sources = ('raw',)  # note: ('raw') is a plain string, not a tuple

    def __init__(self, target, which_set, config, *args, **kwargs):
        """ """
        self.source = 'raw'
        self.axis_labels = None

        self.sr = config.hyper_parameters.sample_rate
        self.length = config.hyper_parameters.patch_length
        self.slice_dur = int(self.length * self.sr)
        self.sub_batch_sz = config.hyper_parameters.sub_batch_size
        self.n_fft = config.hyper_parameters.n_fft
        self.hop_length = config.hyper_parameters.hop_size
        self.output_norm = config.data_server.output_norm

        self.target = target
        self.which_set = which_set
        self.n_jobs = config.data_server.n_jobs
        self.config = config

        # load dataset into instance
        self._load()

    def _load(self):
        if hasattr(self.config.paths.meta_data.splits, self.target):
            split_fn = eval('self.config.paths.meta_data.splits.{}'.format(
                self.target))
            split_fn = os.path.join(self.config.paths.meta_data.root, split_fn)
            self.internal_idx = joblib.load(split_fn)[self.which_set]
        else:
            raise IOError('[ERROR] cannot load split file!')

        if hasattr(self.config.paths.meta_data.targets, self.target):
            target_fn = eval('self.config.paths.meta_data.targets.{}'.format(
                self.target))
            target_fn = os.path.join(self.config.paths.meta_data.root, target_fn)
            target = joblib.load(target_fn)
            target_ref = {v: k for k, v in enumerate(target['tids'])}
            self.Y = target['item_factors']

            # output standardization; an identity FunctionTransformer keeps
            # the interface uniform when no normalization is requested
            if self.output_norm:
                self.out_sclr = StandardScaler()
            else:
                self.out_sclr = FunctionTransformer(func=lambda x: x)
            self.Y = self.out_sclr.fit_transform(self.Y)
        else:
            self.Y = None

        path_to_pathmap = self.config.paths.path_map
        if (path_to_pathmap is not None) and os.path.exists(path_to_pathmap):
            self._path_map = pkl.load(open(path_to_pathmap))

            # filter out error entries (no data)
            incl = filter(lambda t: t in self._path_map, self.internal_idx)
            self.Y = self.Y[map(lambda x: target_ref[x], incl)]
            self.internal_idx = map(lambda x: x, incl)

        if self.Y.shape[0] != len(self.internal_idx):
            raise ValueError('lengths of index and targets are not consistent!')

    @property
    def num_examples(self):
        """ """
        return len(self.internal_idx)

    def _multi_load(self, fns):
        """ """
        return pmap(partial(load_audio, sr=self.sr), fns, n_jobs=self.n_jobs)

    def _convert_index(self, request):
        """ """
        return map(
            lambda x: os.path.join(self.config.paths.audio.root,
                                   self._path_map[self.internal_idx[x]]),
            request)
    def get_data(self, state=None, request=None):
        if state is not None:
            raise ValueError

        # (batch, 2, sr * length)
        try:
            batch_sz = len(request)
            if self.target != 'self':
                # convert index
                request_fn = self._convert_index(request)

                # list of (2, 128, len)
                signal = self._multi_load(request_fn)
                signal, mask = zero_pad_signals(signal)

                # fetch target
                target = map(lambda ix: self.Y[ix], request)

                data = filter(lambda y: y[1].sum() > self.slice_dur,
                              zip(signal, mask, target))
                X = map(lambda x: x[0], data)
                M = map(lambda x: x[1], data)
                Y = map(lambda x: x[2], data)

                # prepare sub batch
                X, Y = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                           X, M, Y)
            else:
                # get index list
                triplet = sample_matcher_idx(request, self.internal_idx)

                # make hash for batch elements
                uniq_idx = list(
                    set(
                        list(
                            chain.from_iterable(
                                map(lambda x: (x[0], x[1]), triplet)))))
                uniq_hash = {v: k for k, v in enumerate(uniq_idx)}

                # convert index into path
                uniq_paths = self._convert_index(uniq_idx)

                # list of (2, 128, len)
                signal = self._multi_load(uniq_paths)
                signal, mask = zero_pad_signals(signal)

                # list of (128, n_frames)
                data = filter(lambda x: x[1].sum() > self.slice_dur,
                              zip(signal, mask, uniq_idx))
                survivors = set(map(lambda x: x[2], data))
                data = {d[2]: (d[0], d[1]) for d in data}

                # assign the data batch back into the original order
                Xl, Xr, Ml, Mr, Y = [], [], [], [], []
                for d in triplet:
                    if (d[0] not in survivors) or (d[1] not in survivors):
                        continue
                    else:
                        Xl.append(data[d[0]][0])
                        Xr.append(data[d[1]][0])
                        Ml.append(data[d[0]][1])
                        Mr.append(data[d[1]][1])
                        Y.append(d[2])

                # prepare sub batch
                Xl, Y = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                            Xl, Ml, Y)
                Xr, _ = prepare_sub_batches(self.sub_batch_sz, self.slice_dur,
                                            Xr, Mr)
                X = np.swapaxes(np.array([Xl, Xr]), 0, 1)
                y = np.eye(2)
                Y = y[Y.ravel().astype(int).tolist()]
                print(X.shape, Y.shape)

        except Exception as e:
            traceback.print_exc()
            # raise Exception
            return -1, -1, request
        else:
            return X, Y, request
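The identity transformer in `_load` above is a reusable pattern: a `FunctionTransformer` wrapping an identity function is a pass-through with the same fit/transform interface as a real scaler, so callers never need to branch. A minimal sketch (the helper name is hypothetical):

import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler

def make_output_scaler(output_norm):
    # identical interface either way, so downstream code stays uniform
    return StandardScaler() if output_norm else FunctionTransformer(func=lambda x: x)

Y = np.array([[1.0], [2.0], [3.0]])
for flag in (True, False):
    sclr = make_output_scaler(flag)
    print(sclr.fit_transform(Y).ravel())
# [-1.224...  0.  1.224...] with normalization, [1. 2. 3.] without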
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')