Code Example #1
import numpy as np
import pandas as pd
from mlxtend.preprocessing import minmax_scaling


def test_pandas_minmax_scaling():
    s1 = pd.Series([1, 2, 3, 4, 5, 6], index=(range(6)))
    s2 = pd.Series([10, 9, 8, 7, 6, 5], index=(range(6)))
    df = pd.DataFrame(s1, columns=['s1'])
    df['s2'] = s2

    df_out1 = minmax_scaling(df, ['s1', 's2'], min_val=0, max_val=1)
    df_out2 = minmax_scaling(df, ['s1', 's2'], min_val=50, max_val=100)

    ary_out1 = np.array([[0.0, 1.0],
                        [0.2, 0.8],
                        [0.4, 0.6],
                        [0.6, 0.4],
                        [0.8, 0.2],
                        [1.0, 0.0]])

    ary_out2 = np.array([[50.0, 100.0],
                        [60.0, 90.0],
                        [70.0, 80.0],
                        [80.0, 70.0],
                        [90.0, 60.0],
                        [100.0, 50.0]])

    np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03)
    assert((df_out2.values == ary_out2).all())
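For reference, both expected arrays follow the standard min-max formula. A minimal NumPy sketch of the computation (an illustration of the formula, not mlxtend's actual implementation):

import numpy as np

def minmax_scale_columns(x, min_val=0.0, max_val=1.0):
    # map each column to [min_val, max_val]: first to [0, 1], then stretch and shift
    x = np.asarray(x, dtype=float)
    lo, hi = x.min(axis=0), x.max(axis=0)
    unit = (x - lo) / (hi - lo)  # each column now spans [0, 1]
    return unit * (max_val - min_val) + min_val

# e.g. minmax_scale_columns([[1, 10], [6, 5]], 50, 100) -> [[50., 100.], [100., 50.]]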
Code Example #3
from mlxtend.preprocessing import minmax_scaling


def test_minmax_scaling_arrayerror():
    try:
        ary = [[1, 2], [3, 4]]
        minmax_scaling(ary, [1, 's2'])
    except AttributeError:
        pass
    else:
        raise AssertionError
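The same negative test reads more idiomatically with pytest's context manager; a sketch assuming pytest is available:

import pytest
from mlxtend.preprocessing import minmax_scaling

def test_minmax_scaling_arrayerror_pytest():
    # a plain nested list lacks the ndarray interface, so minmax_scaling should fail
    with pytest.raises(AttributeError):
        minmax_scaling([[1, 2], [3, 4]], [1, 's2'])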
Code Example #6
def ranking3(df1, df2):
    # blend the two frames, rescale every game column to [0, 1],
    # then apply the externally defined `change` transform and round up
    df3 = df1 + (df1 - 0.5) * df2
    game_list = df3.columns.values.tolist()
    df3 = minmax_scaling(df3, columns=game_list)
    df3 = df3.apply(change)  # `change` is defined elsewhere in the source file
    df3 = df3.apply(np.ceil)
    return df3
Code Example #7
def ranking2(df):
    # log-transform, rescale every game column to [0, 1],
    # then apply the externally defined `change` transform and round up
    game_list = df.columns.values.tolist()
    df = df.apply(np.log10)
    df = minmax_scaling(df, columns=game_list)
    df = df.apply(change)  # `change` is defined elsewhere in the source file
    df = df.apply(np.ceil)
    return df
Code Example #8
def read_bike_sharing_testing_data():
    dataset = pd.read_csv('../data/bikeSharing.shuf.test.csv')
    # we drop dteday and id because we can't predict anything from the date or the ID
    #dataset = dataset.drop(['dteday', 'id'], axis=1)
    dataset = dataset.drop(['dteday'], axis=1)
    #X = dataset.drop(['cnt'], axis=1) # axis = 1 -> columns, whereas axis=0 index
    # Y = dataset['cnt']
    df_scaled = minmax_scaling(dataset, columns=list(dataset.columns))
    df_scaled.to_csv('read_bike_sharing_test__scaled.csv')
    return df_scaled
Code Example #9
import numpy as np
from mlxtend.preprocessing import minmax_scaling


def test_numpy_minmax_scaling():
    ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])

    df_out1 = minmax_scaling(ary, [0, 1], min_val=0, max_val=1)
    df_out2 = minmax_scaling(ary, [0, 1], min_val=50, max_val=100)

    ary_out1 = np.array([[0.0, 1.0],
                        [0.2, 0.8],
                        [0.4, 0.6],
                        [0.6, 0.4],
                        [0.8, 0.2],
                        [1.0, 0.0]])

    ary_out2 = np.array([[50.0, 100.0],
                        [60.0, 90.0],
                        [70.0, 80.0],
                        [80.0, 70.0],
                        [90.0, 60.0],
                        [100.0, 50.0]])

    np.testing.assert_allclose(df_out1, ary_out1, rtol=1e-03)
    assert((df_out2 == ary_out2).all())
Code Example #10
def simple_scale(dataframe, Col_start=0, Col_stop=None):
    # scale the selected columns to [0, 1]
    # works only in Python 3
    # returns the scaled values as a NumPy array
    from mlxtend.preprocessing import minmax_scaling
    # cast to float so the scaled values aren't truncated when assigned back
    dataset = dataframe.values.astype(float)

    Col_stop_ = len(dataframe.columns)
    if Col_stop is not None:
        Col_stop_ = Col_stop

    for nCol in range(Col_start, Col_stop_):
        dataset[:, nCol] = minmax_scaling(dataset, columns=[nCol])[:, 0]

    return dataset
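Note that with Col_start=0 and Col_stop=None the loop scales every column independently, so it is equivalent to a single call over all columns; a minimal sketch:

from mlxtend.preprocessing import minmax_scaling
# one call over all columns instead of a per-column loop
scaled = minmax_scaling(dataframe, columns=list(dataframe.columns))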
Code Example #11
def simple_scale_dataset(dataset, Col_start=0, Col_stop=None):
    # scale the selected columns of a float NumPy array to [0, 1] in place
    # works only in Python 3
    # returns the (modified) dataset
    from mlxtend.preprocessing import minmax_scaling

    Col_stop_ = dataset.shape[1]
    if Col_stop is not None:
        Col_stop_ = Col_stop

    for nCol in range(Col_start, Col_stop_):
        dataset[:, nCol] = minmax_scaling(dataset, columns=[nCol])[:, 0]

    print('simple scale data to [0, 1]')
    print('Mean:' + str(dataset.mean(axis=0)))
    print('Std:' + str(dataset.std(axis=0)))

    return dataset
Code Example #12
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from mlxtend.preprocessing import minmax_scaling

# scaling
# you're changing the range of your data

# normalization
# you're changing the shape of the distribution of your data

### Scaling
# You want to scale data when you're using methods based on measures of how far apart data points are,
# like support vector machines (SVM) or k-nearest neighbors (KNN).
# With these algorithms, a change of "1" in any numeric feature is given the same importance.

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.histplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

### Normalization

# normalize the exponential data with boxcox
normalized_data = stats.boxcox(original_data)

# plot both together to compare
fig, ax = plt.subplots(1, 2)
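The snippet is cut off at the subplot call; a plausible completion, mirroring the scaling comparison above (stats.boxcox returns a (values, lambda) tuple, hence the [0]):

sns.histplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(normalized_data[0], ax=ax[1])
ax[1].set_title("Normalized data")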
Code Example #13
if feature_type == 'spec':
    features_train = spec_features_train
    features_val = spec_features_val
elif feature_type == 'mfcc':
    features_train = mfcc_features_train
    features_val = mfcc_features_val

train_classes = os.listdir(features_train)

for p_label in train_classes:
    files = os.listdir(os.path.join(features_train, p_label))
    for f in files:
        feature = np.load(os.path.join(features_train, p_label, f))
        feature = feature.reshape(-1)
        feature = minmax_scaling(feature, columns=[0])  # keep the scaled result (the call is not in-place)
        feature = np.nan_to_num(feature)
        X_train.append(feature)
        Y_train.append(p_label)

X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

val_classes = os.listdir(features_val)

for p_label in val_classes:
    files = os.listdir(os.path.join(features_val, p_label))
    for f in files:
        feature = np.load(os.path.join(features_val, p_label, f)).reshape(-1)
        feature = minmax_scaling(feature, columns=[0])  # keep the scaled result (the call is not in-place)
        feature = np.nan_to_num(feature)
Code Example #14
File: day_02.py  Project: JonKPowers/kaggle_projects
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from mlxtend.preprocessing import minmax_scaling

# Read in data
kickstarters_2017 = pd.read_csv("data/ks-projects-201801.csv")

# Set seed
np.random.seed(101)

##########
# Made-up example on scaling
##########
# Pull 1000 data points from exponential distribution
original_data = np.random.exponential(size=1000)
scaled_data = minmax_scaling(original_data, columns=[0])  # min-max scaling
# Plot for comparison
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title('Original Data')
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title('Scaled Data')
# plt.show()

##########
# Made-up example on normalizing
##########
normalized_data = stats.boxcox(original_data)  # Box-Cox transformation to normalize

fig, ax = plt.subplots(1, 2)
Code Example #15
    def __init__(self, data):
        self.original_data = np.array(data, dtype=float)  # np.float was removed in NumPy 1.24
        self.scaled_data = minmax_scaling(data, columns=[0])
        self.normalized_data = stats.boxcox(data)
Code Example #16
st.sidebar.title('Select Patient:')

a = st.sidebar.number_input(label='Enter a value up to 10:',
                            min_value=0,
                            value=0,
                            step=1)

name = df.iloc[a, 0]
st.sidebar.write(name)

df = pd.DataFrame(df.drop(columns=['Name']))

df = minmax_scaling(df,
                    columns=[
                        'Pregnancies', 'Glucose', 'BloodPressure',
                        'SkinThickness', 'Insulin', 'BMI',
                        'DiabetesPedigreeFunction', 'Age'
                    ])

model = tf.keras.models.load_model('diabetes.h5')

submit = st.button('Predict')

if submit:
    prediction = model.predict_classes(df)
    prediction = np.around(prediction[a])
    if prediction == 0:
        st.success('Congratulations! The patient is not diabetic.')
    else:
        st.write('Sorry! The patient is diabetic.')
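Note: Sequential.predict_classes was removed in TensorFlow 2.6, so the call above only runs on older versions. On current TensorFlow, an equivalent for a binary model with a single sigmoid output (an assumption about this model's head) is:

# equivalent of predict_classes, assuming a single sigmoid output unit
prediction = (model.predict(df) > 0.5).astype(int)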
Code Example #17
    def scale_data(self):

        col_names = list(self.train.columns)
        predictor_names = [x for x in col_names if "Outcome" not in x]
        self.train[predictor_names] = minmax_scaling(self.train,
                                                     columns=predictor_names)
Code Example #18
File: minmax.py  Project: clover9gu/simplemining
import pandas as pd
import numpy as np

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=(range(6)))
s2 = pd.Series([10, 9, 8, 7, 6, 4], index=(range(6)))

# Scaling a pandas dataframe
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
print(df)

from mlxtend.preprocessing import minmax_scaling
print(minmax_scaling(df, columns=['s1', 's2']))

# Scaling a numpy array
X = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
print(minmax_scaling(X, columns=[0, 1]))

Code Example #19
plt.plot(np.arange(1000), sum, 'ro')  # `sum` is an array defined earlier in the (truncated) file; it shadows the builtin

################

nooutliers_linear = np.arange(1000)
sum_linear = np.copy(nooutliers_linear)
for out in outliers:
    cord = int(1000 * np.random.random_sample((1, )))
    sum_linear[cord] = out

plt.figure(figsize=(20, 10))
plt.title("linear data with outliers before scaling")
plt.plot(np.arange(1000), sum_linear, 'ro')

result_minmax = minmax_scaling(
    sum, columns=[0], min_val=0,
    max_val=1)  # noted problem: with min_val=0 the scaled result doesn't reach the max
result_minmax_linear = minmax_scaling(
    sum_linear, columns=[0], min_val=0,
    max_val=1)  # noted problem: with min_val=0 the scaled result doesn't reach the max

#######

# visualize scaler MinMax
plt.figure(figsize=(20, 10))
plt.title("constant data after min max scaling")
plt.plot(np.arange(1000), result_minmax, 'ro')

plt.figure(figsize=(20, 10))
plt.title("linear data after min max scaling")
plt.plot(np.arange(1000), result_minmax_linear, 'ro')
Code Example #20
import numpy as np
from scipy import stats
from mlxtend.preprocessing import minmax_scaling

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

# set seed for reproducibility
np.random.seed(0)

#in scaling, you're changing the range of your data, while
#in normalization, you're changing the shape of the distribution of your data.

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

# normalize the exponential data with boxcox
normalized_data = stats.boxcox(original_data)

# plot both together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
Code Example #21
# strip the 'Class_' prefix from the 'target' column and shift the labels to start at 0
train_df['target'] = train_df['target'].str.slice(start=6).astype(int) - 1
train_y= train_df['target']

# define x
train_x = train_df.drop(columns = ['id', 'target'])
test_x = test_df.drop(columns = ['id'])

# Split train and val
from sklearn.model_selection import train_test_split
train_df_x, val_df_x, train_df_y, val_df_y = train_test_split(train_x, train_y, test_size=0.5, random_state=123)

# Scaling and Split
from mlxtend.preprocessing import minmax_scaling
train_x_scaled = minmax_scaling(train_x, columns=train_x.columns)
train_df_x_scaled, val_df_s_scaled, train_df_y_scaled, val_df_y_scaled = train_test_split(train_x_scaled, train_y, test_size = 0.5, random_state=123)

test_x_scaled = minmax_scaling(test_x, columns=test_x.columns)

# Features

train_x_feature = train_x.copy()
train_y_feature = train_y.copy()
for colname in train_x_feature.select_dtypes('object'):
    train_x_feature[colname], _ = train_x_feature[colname].factorize()

# mutual_info_classif treats integer-typed columns as discrete features
discrete_features = train_x_feature.dtypes == int

from sklearn.feature_selection import mutual_info_classif
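The snippet breaks off after the import; a hypothetical continuation using the variables prepared above (the call and names are illustrative, not from the original source):

# hypothetical continuation: score each feature against the target
mi_scores = mutual_info_classif(train_x_feature, train_y_feature,
                                discrete_features=discrete_features,
                                random_state=123)
mi_scores = pd.Series(mi_scores, index=train_x_feature.columns)
print(mi_scores.sort_values(ascending=False))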
Code Example #22
# Creating DataFrame first

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=(range(6)))
s2 = pd.Series([10, 9, 8, 7, 6, 5], index=(range(6)))
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
df 


# In[72]:


# Use mlxtend's minmax_scaling (not scikit-learn's)

from mlxtend.preprocessing import minmax_scaling
minmax_scaling(df, columns=['s1', 's2'])


# In[73]:


# Now let's build a pipeline for preprocessing the numerical attributes:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# CombinedAttributesAdder is a custom transformer defined elsewhere in the source notebook

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
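A hypothetical use of the pipeline (num_df is a placeholder for the numeric-only DataFrame the notebook prepares elsewhere):

# fit the preprocessing steps and transform in one pass (num_df is a placeholder)
prepared = num_pipeline.fit_transform(num_df)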
Code Example #23
import numpy as np
import pandas as pd
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import matplotlib.pyplot as plt
print("Setup Complete")

kickstarters_2017 = pd.read_csv(
    "../input/kickstarter-projects/ks-projects-201801.csv")
np.random.seed(0)

## 1) Practice scaling
## We just scaled the "usd_goal_real" column. What about the "goal" column?
## Begin by running the code cell below to create a DataFrame original_goal_data containing the "goal" column.
original_goal_data = pd.DataFrame(kickstarters_2017.goal)

## Answer:
scaled_goal_data = minmax_scaling(original_goal_data, columns=['goal'])

## 2) Practice normalization
## Now you'll practice normalization. We begin by normalizing the amount of money pledged to each campaign.
index_of_positive_pledges = kickstarters_2017.usd_pledged_real > 0
positive_pledges = kickstarters_2017.usd_pledged_real.loc[
    index_of_positive_pledges]
normalized_pledges = pd.Series(stats.boxcox(positive_pledges)[0],
                               name='usd_pledged_real',
                               index=positive_pledges.index)
fig, ax = plt.subplots(1, 2, figsize=(15, 3))
sns.distplot(positive_pledges, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(normalized_pledges, ax=ax[1])
ax[1].set_title("Normalized data")
print('Original data\nPreview:\n', positive_pledges.head())
Code Example #24
kickstarters_2017 = pd.read_csv(
    "D:\\work\\python\\DataClean\\ks-projects-201801.csv")

# set seed for reproducibility
np.random.seed(0)

# START: Scaling - we change the range of the data without changing the shape of its distribution.
# This means you're transforming your data so that it fits within a specific scale, like 0-100 or 0-1.
# http://scikit-learn.org/stable/modules/preprocessing.html
# or https://stats.stackexchange.com/questions/41704/how-and-why-do-normalization-and-feature-scaling-work/254815#254815

# select the usd_goal_real column
usd_goal = kickstarters_2017.usd_goal_real
print(usd_goal.head())
# scale the goals from 0 to 1
scaled_data = minmax_scaling(usd_goal, columns=[0])
print(scaled_data)
# plot the original & scaled data together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(kickstarters_2017.usd_goal_real, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")
plt.show(block=False)
# END : Scaling

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=(range(6)))
s2 = pd.Series([10, 9, 8, 7, 6, 5], index=(range(6)))
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
print(df)
Code Example #25
# Select numeric columns
numeric_cols = [
    column for column in X.columns if X[column].dtype in ['int64', 'float64']
]

# Keep selected columns only
cols = low_cardinality_cols + numeric_cols
X = X[cols]

# One-hot encode the data (to shorten the code, we use pandas)
X = pd.get_dummies(X)
X.head()

# Scale the data from -1 to 1 (assign the result back; minmax_scaling returns a copy)
X = minmax_scaling(X, columns=X.columns, min_val=-1, max_val=1)
X.head()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=0)

# Define the model
model = XGBClassifier(random_state=0,
                      disable_default_eval_metric=True,
                      use_label_encoder=False)

# Fit the model
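The snippet ends at the comment; the obvious next step, added here as a hedged sketch rather than the original code, is:

# plausible continuation (not in the original snippet)
model.fit(X_train, y_train)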
Code Example #26
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from mlxtend.preprocessing import minmax_scaling

kickstarters_2017=pd.read_csv("/home/pliu/Downloads/python_data_cleaning/ks-projects-201801.csv")

np.random.seed(0)


#######################################################################
########### Scaling data ##############################################
#######################################################################

# generate 1000 data points randomly drawn from an exponential distribution
original_data = np.random.exponential(size = 1000)
#print(original_data[1],original_data[2])
# min-max scale the data between 0 and 1; it does not change the shape of the distribution
scaled_data = minmax_scaling(original_data, columns = [0])
#print(scaled_data[1],original_data[2])
# plot both together to compare
fig, ax=plt.subplots(1,2)
sns.distplot(original_data, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")



#######################################################################
########### Normalize data ############################################
#######################################################################

# normalize the exponential data with boxcox
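The snippet is truncated here; based on the identical walkthroughs in Code Examples #12 and #20, the next line would be:

normalized_data = stats.boxcox(original_data)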
Code Example #27
# MinMaxScaler
## X_norm = (X - X_min) / (X_max - X_min)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

wine_minmax_scaled = scaler.fit_transform(wine_subset)

np.var(wine_minmax_scaled)

# Using mlxtend version of MinMaxScaler

from mlxtend.preprocessing import minmax_scaling

minmax_scaling(wine, columns=['ash', 'alcalinity', 'magnesium'])

########### Repeat KNN *with* scaling #################

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

knn.score(X_test, y_test)
Code Example #28
df6 = pd.DataFrame(INTJ, columns=['INTJ'])
df7 = pd.DataFrame(INTP, columns=['INTP'])
df8 = pd.DataFrame(INFP, columns=['INFP'])

df9 = pd.DataFrame(ENFP, columns=['ENFP'])
df10 = pd.DataFrame(ESFP, columns=['ESFP'])
df11 = pd.DataFrame(ENTJ, columns=['ENTJ'])
df12 = pd.DataFrame(ENTP, columns=['ENTP'])
df13 = pd.DataFrame(ENFJ, columns=['ENFJ'])
df14 = pd.DataFrame(ESFJ, columns=['ESFJ'])
df15 = pd.DataFrame(ESTJ, columns=['ESTJ'])
df16 = pd.DataFrame(ESTP, columns=['ESTP'])



print(minmax_scaling(df1, columns=['ISTP']))
print(minmax_scaling(df2, columns=['INFJ']))
print(minmax_scaling(df3, columns=['ISTJ']))
print(minmax_scaling(df4, columns=['ISFP']))
print(minmax_scaling(df5, columns=['ISFJ']))
print(minmax_scaling(df6, columns=['INTJ']))
print(minmax_scaling(df7, columns=['INTP']))
print(minmax_scaling(df8, columns=['INFP']))

print(minmax_scaling(df9, columns=['ENFP']))
print(minmax_scaling(df10, columns=['ESFP']))
print(minmax_scaling(df11, columns=['ENTJ']))
print(minmax_scaling(df12, columns=['ENTP']))
print(minmax_scaling(df13, columns=['ENFJ']))
print(minmax_scaling(df14, columns=['ESFJ']))
print(minmax_scaling(df15, columns=['ESTJ']))
print(minmax_scaling(df16, columns=['ESTP']))
Code Example #29
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.preprocessing import minmax_scaling
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

#Import Dataset

df = pd.read_csv(
    'C:/Users/dell/Downloads/phishing-website-dataset/dataset.csv')
df.head()

# In[3]:

# select the IP_address column
goal = df.having_IPhaving_IP_Address
scaled_data = minmax_scaling(goal, columns=[0])

# plot the original & scaled data together to compare
fig, ax = plt.subplots(1, 2)
sns.distplot(df.having_IPhaving_IP_Address, ax=ax[0])
ax[0].set_title("Original Data")
sns.distplot(scaled_data, ax=ax[1])
ax[1].set_title("Scaled data")

# In[4]:

a = len(df[df.Result == 0])
b = len(df[df.Result == -1])
c = len(df[df.Result == 1])
print(a, "times 0 repeated in Result")
print(b, "times -1 repeated in Result")
print(c, "times 1 repeated in Result")