def test_pandas_minmax_scaling():
    s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
    s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
    df = pd.DataFrame(s1, columns=['s1'])
    df['s2'] = s2
    df_out1 = minmax_scaling(df, ['s1', 's2'], min_val=0, max_val=1)
    df_out2 = minmax_scaling(df, ['s1', 's2'], min_val=50, max_val=100)
    ary_out1 = np.array([[0.0, 1.0], [0.2, 0.8], [0.4, 0.6],
                         [0.6, 0.4], [0.8, 0.2], [1.0, 0.0]])
    ary_out2 = np.array([[50.0, 100.0], [60.0, 90.0], [70.0, 80.0],
                         [80.0, 70.0], [90.0, 60.0], [100.0, 50.0]])
    np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03)
    assert (df_out2.values == ary_out2).all()
def test_minmax_scaling_arrayerror():
    ary = [[1, 2], [3, 4]]
    try:
        minmax_scaling(ary, [1, 's2'])
    except AttributeError:
        pass
    else:
        raise AssertionError
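# The try/except/else pattern above works; if the suite already depends on
# pytest, the same expectation can be written more directly with
# pytest.raises. A minimal sketch, assuming pytest is available:
import pytest
from mlxtend.preprocessing import minmax_scaling

def test_minmax_scaling_arrayerror_pytest():
    # a plain Python list lacks the DataFrame/ndarray interface,
    # so minmax_scaling is expected to raise AttributeError
    ary = [[1, 2], [3, 4]]
    with pytest.raises(AttributeError):
        minmax_scaling(ary, [1, 's2'])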
def test_numpy_minmax_scaling():
    ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
    df_out1 = minmax_scaling(ary, [0, 1], min_val=0, max_val=1)
    df_out2 = minmax_scaling(ary, [0, 1], min_val=50, max_val=100)
    ary_out1 = np.array([[0.0, 1.0], [0.2, 0.8], [0.4, 0.6],
                         [0.6, 0.4], [0.8, 0.2], [1.0, 0.0]])
    ary_out2 = np.array([[50.0, 100.0], [60.0, 90.0], [70.0, 80.0],
                         [80.0, 70.0], [90.0, 60.0], [100.0, 50.0]])
    np.testing.assert_allclose(df_out1, ary_out1, rtol=1e-03)
    assert (df_out2 == ary_out2).all()
def ranking3(df1, df2):
    df3 = df1 + (df1 - 0.5) * df2
    game_list = df3.columns.values.tolist()
    df3 = minmax_scaling(df3, columns=game_list)
    df3 = df3.apply(change)  # `change` is a user-defined transform from elsewhere in the project
    df3 = df3.apply(np.ceil)
    return df3
def ranking2(df):
    game_list = df.columns.values.tolist()
    df = df.apply(np.log10)
    df = minmax_scaling(df, columns=game_list)
    df = df.apply(change)  # `change` is a user-defined transform from elsewhere in the project
    df = df.apply(np.ceil)
    return df
def read_bike_sharing_testing_data():
    dataset = pd.read_csv('../data/bikeSharing.shuf.test.csv')
    # we drop dteday because we can't predict anything from the raw date
    # (id was previously dropped here as well, for the same reason)
    # dataset = dataset.drop(['dteday', 'id'], axis=1)
    dataset = dataset.drop(['dteday'], axis=1)
    # X = dataset.drop(['cnt'], axis=1)  # axis=1 -> columns, axis=0 -> index
    # Y = dataset['cnt']
    df_scaled = minmax_scaling(dataset, columns=list(dataset.columns))
    df_scaled.to_csv('read_bike_sharing_test__scaled.csv')
    return df_scaled
def simple_scale(dataframe, Col_start=0, Col_stop=None):
    # scale the selected columns of a DataFrame to [0, 1]; requires Python 3
    from mlxtend.preprocessing import minmax_scaling
    # cast to float: assigning scaled values into an integer array
    # would silently truncate them (astype also makes a copy)
    dataset = dataframe.values.astype(float)
    Col_stop_ = len(dataframe.columns)
    if Col_stop is not None:
        Col_stop_ = Col_stop
    for nCol in range(Col_start, Col_stop_):
        dataset[:, nCol] = minmax_scaling(dataset, columns=[nCol])[:, 0]
    return dataset
def simple_scale_dataset(dataset, Col_start=0, Col_stop=None):
    # scale the selected columns of a float array to [0, 1] in place;
    # requires Python 3
    from mlxtend.preprocessing import minmax_scaling
    Col_stop_ = dataset.shape[1]
    if Col_stop is not None:
        Col_stop_ = Col_stop
    for nCol in range(Col_start, Col_stop_):
        dataset[:, nCol] = minmax_scaling(dataset, columns=[nCol])[:, 0]
    print('simple scale data to [0,1]')
    print('Mean:' + str(dataset.mean(axis=0)))
    print('Std:' + str(dataset.std(axis=0)))
    return dataset
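# Both helpers above call minmax_scaling once per column; the function also
# accepts several column indices at once, which removes the Python-level loop.
# A minimal sketch under that assumption (the helper name is illustrative):
import numpy as np
from mlxtend.preprocessing import minmax_scaling

def simple_scale_all(dataset, Col_start=0, Col_stop=None):
    # scale columns [Col_start, Col_stop) to [0, 1] in a single call
    Col_stop_ = dataset.shape[1] if Col_stop is None else Col_stop
    cols = list(range(Col_start, Col_stop_))
    out = dataset.astype(float)  # astype copies, so the input stays untouched
    out[:, cols] = minmax_scaling(dataset, columns=cols)
    return out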
# scaling: you're changing the range of your data
# normalization: you're changing the shape of the distribution of your data

### Scaling
# You want to scale data when you're using methods based on measures of how
# far apart data points are, like support vector machines (SVM) or k-nearest
# neighbors (KNN). With these algorithms, a change of "1" in any numeric
# feature is given the same importance.

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from mlxtend.preprocessing import minmax_scaling

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.histplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

### Normalization
# normalize the exponential data with boxcox; stats.boxcox returns a
# (transformed_data, fitted_lambda) tuple, so keep only the first element
normalized_data = stats.boxcox(original_data)[0]

# plot both together to compare
fig, ax = plt.subplots(1, 2)
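# Min-max scaling maps each value to (x - min) / (max - min); a quick sketch
# checking minmax_scaling against that formula computed by hand:
import numpy as np
from mlxtend.preprocessing import minmax_scaling

x = np.random.exponential(size=1000)
manual = (x - x.min()) / (x.max() - x.min())      # hand-rolled formula
library = minmax_scaling(x, columns=[0]).ravel()  # mlxtend equivalent
np.testing.assert_allclose(manual, library)       # elementwise agreement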
if feature_type == 'spec':
    features_train = spec_features_train
    features_val = spec_features_val
elif feature_type == 'mfcc':
    features_train = mfcc_features_train
    features_val = mfcc_features_val

train_classes = os.listdir(features_train)
for p_label in train_classes:
    files = os.listdir(os.path.join(features_train, p_label))
    for f in files:
        feature = np.load(os.path.join(features_train, p_label, f))
        feature = feature.reshape(-1)
        # minmax_scaling returns a new array; capture it, or the call is a no-op
        feature = minmax_scaling(feature, columns=[0]).reshape(-1)
        feature = np.nan_to_num(feature)
        X_train.append(feature)
        Y_train.append(p_label)
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

val_classes = os.listdir(features_val)
for p_label in val_classes:
    files = os.listdir(os.path.join(features_val, p_label))
    for f in files:
        feature = np.load(os.path.join(features_val, p_label, f)).reshape(-1)
        feature = minmax_scaling(feature, columns=[0]).reshape(-1)
        feature = np.nan_to_num(feature)
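# As the fix above assumes, minmax_scaling returns a new array rather than
# mutating its argument, so a bare call has no effect. A tiny illustration:
import numpy as np
from mlxtend.preprocessing import minmax_scaling

feature = np.array([2.0, 4.0, 6.0])
minmax_scaling(feature, columns=[0])   # return value discarded: feature unchanged
print(feature)                         # [2. 4. 6.]
feature = minmax_scaling(feature, columns=[0]).reshape(-1)  # capture the result
print(feature)                         # [0.  0.5 1. ]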
import seaborn as sns
import matplotlib.pyplot as plt

# Read in data
kickstarters_2017 = pd.read_csv("data/ks-projects-201801.csv")

# Set seed
np.random.seed(101)

##########
# Made-up example on scaling
##########
# Pull 1000 data points from an exponential distribution
original_data = np.random.exponential(size=1000)
scaled_data = minmax_scaling(original_data, columns=[0])  # min-max scaling

# Plot for comparison
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title('Original Data')
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title('Scaled Data')
# plt.show()

##########
# Made-up example on normalizing
##########
# Box-Cox transformation to normalize; [0] is the data, [1] the fitted lambda
normalized_data = stats.boxcox(original_data)[0]

fig, ax = plt.subplots(1, 2)
def __init__(self, data):
    # np.float was removed from NumPy; use the builtin float instead
    self.original_data = np.array(data, dtype=float)
    self.scaled_data = minmax_scaling(data, columns=[0])
    # stats.boxcox returns (transformed_data, lambda); keep the data only
    self.normalized_data = stats.boxcox(data)[0]
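# Background for the stats.boxcox(...)[0] indexing used above: when no fixed
# lmbda is passed, SciPy estimates the exponent and returns a (data, lambda)
# tuple rather than a bare array. A quick sketch:
import numpy as np
from scipy import stats

data = np.random.exponential(size=1000)
transformed, lmbda = stats.boxcox(data)  # lambda fitted by maximum likelihood
print(transformed.shape, lmbda)          # (1000,) plus the fitted exponent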
st.sidebar.title('Select Patient:')
a = st.sidebar.number_input(label='Enter a value up to 10:',
                            min_value=0, value=0, step=1)
name = df.iloc[a, 0]
st.sidebar.write(name)

df = pd.DataFrame(df.drop(columns=['Name']))
df = minmax_scaling(df, columns=[
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'
])

model = tf.keras.models.load_model('diabetes.h5')
submit = st.button('Predict')
if submit:
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    prediction = (model.predict(df) > 0.5).astype(int)
    prediction = np.around(prediction[a])
    if prediction == 0:
        st.success('Congratulations! The patient is not diabetic.')
    else:
        st.write('Sorry! The patient is diabetic.')
def scale_data(self):
    col_names = list(self.train.columns)
    predictor_names = [x for x in col_names if "Outcome" not in x]
    self.train[predictor_names] = minmax_scaling(self.train,
                                                 columns=predictor_names)
import pandas as pd
import numpy as np

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
s2 = pd.Series([10, 9, 8, 7, 6, 4], index=range(6))

# Scaling a pandas DataFrame
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
print(df)

from mlxtend.preprocessing import minmax_scaling
print(minmax_scaling(df, columns=['s1', 's2']))

# Scaling a numpy array
X = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
print(minmax_scaling(X, columns=[0, 1]))
plt.plot(np.arange(1000), sum, 'ro')

################
nooutliers_linear = np.arange(1000)
sum_linear = np.copy(nooutliers_linear)
for out in outliers:
    cord = int(1000 * np.random.random_sample((1, )))
    sum_linear[cord] = out

plt.figure(figsize=(20, 10))
plt.title("linear data with outliers before scaling")
plt.plot(np.arange(1000), sum_linear, 'ro')

# problem: with outliers present, pinning the minimum at 0 means the bulk
# of the data never gets close to the maximum after scaling
result_minmax = minmax_scaling(sum, columns=[0], min_val=0, max_val=1)
result_minmax_linear = minmax_scaling(sum_linear, columns=[0],
                                      min_val=0, max_val=1)

#######
# visualize the MinMax scaler
plt.figure(figsize=(20, 10))
plt.title("constant data after min max scaling")
plt.plot(np.arange(1000), result_minmax, 'ro')

plt.figure(figsize=(20, 10))
plt.title("linear data after min max scaling")
plt.plot(np.arange(1000), result_minmax_linear, 'ro')
# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

# set seed for reproducibility
np.random.seed(0)

# In scaling, you're changing the range of your data, while in
# normalization, you're changing the shape of its distribution.

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

# normalize the exponential data with boxcox ([0] drops the fitted lambda)
normalized_data = stats.boxcox(original_data)[0]

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
# strip the text prefix from the target column and shift the class labels to start at 0
train_df['target'] = train_df['target'].str.slice(start=6).astype(int) - 1
train_y = train_df['target']

# define x
train_x = train_df.drop(columns=['id', 'target'])
test_x = test_df.drop(columns=['id'])

# Split train and val
from sklearn.model_selection import train_test_split
train_df_x, val_df_x, train_df_y, val_df_y = train_test_split(
    train_x, train_y, test_size=0.5, random_state=123)

# Scaling and split
from mlxtend.preprocessing import minmax_scaling
train_x_scaled = minmax_scaling(train_x, columns=train_x.columns)
train_df_x_scaled, val_df_x_scaled, train_df_y_scaled, val_df_y_scaled = train_test_split(
    train_x_scaled, train_y, test_size=0.5, random_state=123)
test_x_scaled = minmax_scaling(test_x, columns=test_x.columns)

# Features
train_x_feature = train_x.copy()
train_y_feature = train_y.copy()
for colname in train_x_feature.select_dtypes('object'):
    train_x_feature[colname], _ = train_x_feature[colname].factorize()

# the mutual-information method requires discrete features to be integer-coded
discrete_features = train_x_feature.dtypes == int

from sklearn.feature_selection import mutual_info_classif
# Creating the DataFrame first
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
df

# In[72]:

# Use mlxtend's minmax_scaling
from mlxtend.preprocessing import minmax_scaling
minmax_scaling(df, columns=['s1', 's2'])

# In[73]:

# Now let's build a pipeline for preprocessing the numerical attributes:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import matplotlib.pyplot as plt
print("Setup Complete")

kickstarters_2017 = pd.read_csv(
    "../input/kickstarter-projects/ks-projects-201801.csv")
np.random.seed(0)

## 1) Practice scaling
## We just scaled the "usd_goal_real" column. What about the "goal" column?
## Begin by running the code cell below to create a DataFrame
## original_goal_data containing the "goal" column.
original_goal_data = pd.DataFrame(kickstarters_2017.goal)

## Answer:
scaled_goal_data = minmax_scaling(original_goal_data, columns=['goal'])

## 2) Practice normalization
## Now you'll practice normalization. We begin by normalizing the amount of
## money pledged to each campaign.
index_of_positive_pledges = kickstarters_2017.usd_pledged_real > 0
positive_pledges = kickstarters_2017.usd_pledged_real.loc[
    index_of_positive_pledges]
normalized_pledges = pd.Series(stats.boxcox(positive_pledges)[0],
                               name='usd_pledged_real',
                               index=positive_pledges.index)

fig, ax = plt.subplots(1, 2, figsize=(15, 3))
sns.distplot(positive_pledges, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(normalized_pledges, ax=ax[1])
ax[1].set_title("Normalized data")
print('Original data\nPreview:\n', positive_pledges.head())
kickstarters_2017 = pd.read_csv(
    "D:\\work\\python\\DataClean\\ks-projects-201801.csv")

# set seed for reproducibility
np.random.seed(0)

# START: Scaling - we change the range of the data without changing the
# shape of its distribution. This means transforming the data so that it
# fits within a specific scale, like 0-100 or 0-1.
# http://scikit-learn.org/stable/modules/preprocessing.html
# https://stats.stackexchange.com/questions/41704/how-and-why-do-normalization-and-feature-scaling-work/254815#254815

# select the usd_goal_real column
usd_goal = kickstarters_2017.usd_goal_real
print(usd_goal.head())

# scale the goals from 0 to 1
scaled_data = minmax_scaling(usd_goal, columns=[0])
print(scaled_data)

# plot the original & scaled data together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(kickstarters_2017.usd_goal_real, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")
plt.show(block=False)
# END: Scaling

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
print(df)
# Select numeric columns
numeric_cols = [
    column for column in X.columns if X[column].dtype in ['int64', 'float64']
]

# Keep selected columns only
cols = low_cardinality_cols + numeric_cols
X = X[cols]

# One-hot encode the data (to shorten the code, we use pandas)
X = pd.get_dummies(X)
X.head()

# Scale the data from -1 to 1 (minmax_scaling returns a new frame,
# so the result must be assigned back)
X = minmax_scaling(X, columns=X.columns, min_val=-1, max_val=1)
X.head()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=0)

# Define the model
model = XGBClassifier(random_state=0,
                      disable_default_eval_metric=True,
                      use_label_encoder=False)

# Fit the model
import matplotlib.pyplot as plt

kickstarters_2017 = pd.read_csv(
    "/home/pliu/Downloads/python_data_cleaning/ks-projects-201801.csv")
np.random.seed(0)

#######################################################################
########### Scaling data ##############################################
#######################################################################

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size=1000)
# print(original_data[1], original_data[2])

# min-max scale the data between 0 and 1; this does not change the shape
# of the data's distribution
scaled_data = minmax_scaling(original_data, columns=[0])
# print(scaled_data[1], original_data[2])

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

#######################################################################
########### Normalize data ############################################
#######################################################################

# normalize the exponential data with boxcox
# MinMaxScaler
## X_norm = (X - X_min) / (X_max - X_min)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
wine_minmax_scaled = scaler.fit_transform(wine_subset)
np.var(wine_minmax_scaled)

# Using the mlxtend version of min-max scaling
from mlxtend.preprocessing import minmax_scaling
minmax_scaling(wine, columns=['ash', 'alcalinity', 'magnesium'])

########### Repeat KNN *with* scaling #################
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn.score(X_test, y_test)
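# Since both implement X_norm = (X - X_min) / (X_max - X_min), scikit-learn's
# MinMaxScaler and mlxtend's minmax_scaling should agree column for column.
# A minimal sketch on toy data:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from mlxtend.preprocessing import minmax_scaling

X_toy = np.array([[1.0, 10.0], [2.0, 9.0], [3.0, 8.0]])
sk_scaled = MinMaxScaler().fit_transform(X_toy)     # scikit-learn
mlx_scaled = minmax_scaling(X_toy, columns=[0, 1])  # mlxtend
np.testing.assert_allclose(sk_scaled, mlx_scaled)   # identical up to float error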
df6 = pd.DataFrame(INTJ, columns=['INTJ'])
df7 = pd.DataFrame(INTP, columns=['INTP'])
df8 = pd.DataFrame(INFP, columns=['INFP'])
df9 = pd.DataFrame(ENFP, columns=['ENFP'])
df10 = pd.DataFrame(ESFP, columns=['ESFP'])
df11 = pd.DataFrame(ENTJ, columns=['ENTJ'])
df12 = pd.DataFrame(ENTP, columns=['ENTP'])
df13 = pd.DataFrame(ENFJ, columns=['ENFJ'])
df14 = pd.DataFrame(ESFJ, columns=['ESFJ'])
df15 = pd.DataFrame(ESTJ, columns=['ESTJ'])
df16 = pd.DataFrame(ESTP, columns=['ESTP'])

print(minmax_scaling(df1, columns=['ISTP']))
print(minmax_scaling(df2, columns=['INFJ']))
print(minmax_scaling(df3, columns=['ISTJ']))
print(minmax_scaling(df4, columns=['ISFP']))
print(minmax_scaling(df5, columns=['ISFJ']))
print(minmax_scaling(df6, columns=['INTJ']))
print(minmax_scaling(df7, columns=['INTP']))
print(minmax_scaling(df8, columns=['INFP']))
print(minmax_scaling(df9, columns=['ENFP']))
print(minmax_scaling(df10, columns=['ESFP']))
print(minmax_scaling(df11, columns=['ENTJ']))
print(minmax_scaling(df12, columns=['ENTP']))
print(minmax_scaling(df13, columns=['ENFJ']))
print(minmax_scaling(df14, columns=['ESFJ']))
print(minmax_scaling(df15, columns=['ESTJ']))
from mlxtend.preprocessing import minmax_scaling
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Import dataset
df = pd.read_csv(
    'C:/Users/dell/Downloads/phishing-website-dataset/dataset.csv')
df.head()

# In[3]:

# select the IP-address indicator column
goal = df.having_IPhaving_IP_Address
scaled_data = minmax_scaling(goal, columns=[0])

# plot the original & scaled data together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(df.having_IPhaving_IP_Address, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

# In[4]:

a = len(df[df.Result == 0])
b = len(df[df.Result == -1])
c = len(df[df.Result == 1])
print(a, "times 0 repeated in Result")
print(b, "times -1 repeated in Result")