# assumes x, y_no_noise and rnd (a numpy RandomState) come from a wave-style toy dataset
X = x.reshape(-1, 1)
y = (y_no_noise + rnd.normal(size=len(x))) / 2
plt.plot(X, y, 'o', c='r')
plt.show()

line = np.linspace(-5, 5, 1000, endpoint=False).reshape(-1, 1)
mlpr = MLPRegressor().fit(X, y)
knr = KNeighborsRegressor().fit(X, y)
plt.plot(line, mlpr.predict(line), label='MLP')
plt.plot(line, knr.predict(line), label='KNN')
plt.plot(X, y, 'o', c='r')
plt.legend(loc='best')
plt.show()

bins = np.linspace(-5, 5, 11)
target_bin = np.digitize(X, bins=bins)
print(bins)

onehot = OneHotEncoder(sparse=False)  # 'sparse' was renamed 'sparse_output' in scikit-learn 1.2
onehot.fit(target_bin)
X_in_bin = onehot.transform(target_bin)
new_line = onehot.transform(np.digitize(line, bins=bins))

new_mlpr = MLPRegressor().fit(X_in_bin, y)
new_knr = KNeighborsRegressor().fit(X_in_bin, y)
plt.plot(line, new_mlpr.predict(new_line), label='New MLP')
plt.plot(line, new_knr.predict(new_line), label='New KNN')
plt.plot(X, y, 'o', c='r')
plt.legend(loc='best')
plt.show()
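The digitize-then-encode steps above can be collapsed into one transformer; a minimal sketch with KBinsDiscretizer (scikit-learn 0.20+), reusing X and line from above:

from sklearn.preprocessing import KBinsDiscretizer

# 10 equal-width bins, emitted directly as dense one-hot columns
kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
X_binned = kb.fit_transform(X)
line_binned = kb.transform(line)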
Example #2
LabelEncoder_X = LabelEncoder()  # refit on each column below, so only the last column's mapping survives
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
X[:, 12] = LabelEncoder_X.fit_transform(X[:, 12])

# Processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [5])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)  # np.float was removed in NumPy 1.24
X = X[:, 1:]

# Processing the second categorical column (same imports as above)

ct = ColumnTransformer([('encoder', OneHotEncoder(), [8])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]
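Both categorical columns could also be encoded in one pass over the original label-encoded matrix (called X_orig here, a hypothetical name for X before the first ColumnTransformer ran); drop='first' (scikit-learn 0.21+) replicates the manual X[:, 1:] slicing:

ct_both = ColumnTransformer([('encoder', OneHotEncoder(drop='first'), [5, 8])],
                            remainder='passthrough')
X_alt = np.array(ct_both.fit_transform(X_orig), dtype=float)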

# Splitting the data into train and test sets
from sklearn.model_selection import train_test_split
Example #3
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(X[:, 7:8])
X[:, 7:8] = imputer.transform(X[:, 7:8])

# correlation pairplot of the numeric variables (currently not working)
#sns.set(style="ticks", color_codes=True)
#g = sns.pairplot(dataset_pred,hue="isFraud")

# label-encode the categorical column
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
#labelencoder_X_2 = LabelEncoder()
#X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# create dummy variables for the 'type' column, which has more than 2 categories;
# categorical_features was removed in scikit-learn 0.22, so use ColumnTransformer
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer([('type', OneHotEncoder(sparse=False), [1])],
                                  remainder='passthrough')
X = onehotencoder.fit_transform(X)

# splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
# check that fraud and non-fraud cases are spread evenly across the train and test sets
np.count_nonzero(y_test == 1)
np.count_nonzero(y_test == 0)
np.count_nonzero(y_train == 1)
np.count_nonzero(y_train == 0)

# feature scaling
Example #4
# split into attributes and labels
target = 'age_group'
X = df.drop(['srcid', target], axis=1)
X = X.drop(['p_state', 'sDevType', 'sOSName', 'education', 'gender'], axis=1)
# X = X.drop(['Q6b','Q6c','Q6d','Q7b','Q7c','Q7d','Q10b','Q10c','Q10d'], axis=1)
y = df[target]
''' srcid,Q1,Q2,Q4,Q6a,Q6b,Q6c,Q6d,Q7a,Q7b,Q7c,Q7d,Q7e,Q10a,Q10b,Q10c,Q10d,
    p_state,sDevType,sOSName,gender,age_group,education'''

# # Convert features to Ordinal values
# ordinalencoder_X = OrdinalEncoder()
# X = ordinalencoder_X.fit_transform(X)
# X = X.astype(int)

# Convert features to OneHotEncoding values
one_hot_encoder_X = OneHotEncoder()
X = one_hot_encoder_X.fit_transform(X).toarray()
X = X.astype(int)
print(X)

# Convert target to Ordinal values
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

nb_list = [MultinomialNB, ComplementNB, GaussianNB, BernoulliNB]
result_list = [[] for x in range(len(nb_list))]

for j in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    for i in range(len(nb_list)):
Example #5
        E = df["death"]
        T = df["futime"]
        X = df >> drop(X.death, X.futime, X.chapter) \
                >> mutate(mgus=X.mgus.astype(float), age=X.age.astype(float))
        X = X[T > 0]
        E = E[T > 0]
        T = T[T > 0]
        #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
        Y = Y_join(T, E)
        X_num = X.select_dtypes(include=["float"])
        X_cat = X.select_dtypes(exclude=["float"])
        imputer = SimpleImputer(strategy="median")
        X_num = imputer.fit_transform(X_num.values)
        imputer = SimpleImputer(strategy="most_frequent")
        X_cat = imputer.fit_transform(X_cat.values)
        encoder = OneHotEncoder(sparse=False)
        X_cat = encoder.fit_transform(X_cat)
        X = np.c_[X_num, X_cat]

    elif args.dataset == "support":
        df = pd.read_csv("./data/surv/support2.csv")
        df = df.rename(columns={"d.time": "dtime"})
        T = df["dtime"]
        E = df["death"]
        #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
        Y = Y_join(T, E)
        df >>= drop(X.dtime, X.death, X.hospdead, X.prg2m, X.prg6m, X.dnr,
                     X.dnrday, X.aps, X.sps, X.surv2m, X.surv6m, X.totmcst)
        X_num = df.select_dtypes(include=["float", "int"])
        X_cat = df.select_dtypes(exclude=["float", "int"])
        imputer = SimpleImputer(strategy="median")
Example #6
from keras.models import load_model
from keras import optimizers
from keras_gradient_noise import add_gradient_noise
noisy = add_gradient_noise(optimizers.RMSprop)
from sklearn.preprocessing import OneHotEncoder
from config import window_size, feature_len
import numpy as np

m = load_model("model", custom_objects={"NoisyRMSprop": noisy})
number_of_notes = 50
rand = np.random.randint(0, feature_len, size=[window_size])
# n_values was removed in scikit-learn 0.22; pin the category set explicitly instead
ohe = OneHotEncoder(categories=[list(range(feature_len))], sparse=False)

music = []
music.extend(list(rand))
for i in range(number_of_notes):
    a = np.array(music[i:i + window_size]).reshape([-1, 1])
    rand = ohe.fit_transform(a)
    pred = m.predict(rand.reshape([1, window_size, feature_len]))
    music.append(np.argmax(pred))

music = music[window_size:]
with open("classes.txt", "r") as f:
    classes = f.readlines()

# one-hot decode
# then label-decode
# build a stream from the matching notes and chords
# write the stream to a MIDI file
# save it
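A minimal sketch of the first two TODO steps, assuming classes.txt holds one label per line in the LabelEncoder's original order (an assumption):

class_names = [c.strip() for c in classes]
decoded = [class_names[i] for i in music]  # map predicted indices back to note/chord labels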
labels = []
labelEncoder_previsores = LabelEncoder()

# the attribute in column 1 is categorical (likewise for the columns below)
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# There is an inefficiency in this solution: the transformed variables are nominal,
# so we cannot say, for example, that one race ranks above another.

# categorical_features was removed in scikit-learn 0.22; ColumnTransformer replaces it
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer(
    [('ohe', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = onehotencoder.fit_transform(previsores)

labelEncoder_classe = LabelEncoder()

classe = labelEncoder_classe.fit_transform(classe)

standardScaler = StandardScaler()
previsores = standardScaler.fit_transform(previsores)

########################### CREATING THE TEST SET ###############################

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)
    'SubscriberIndex', 'SubgroupIndex'
]

#separate categorical and numeric features
Mcat = np.array(Jcodes_w_L[cat_features].tolist())
Mnum = np.array(Jcodes_w_L[numeric_features].tolist())

L = np.array(Jcodes_w_L[label].tolist())

#Setup One Hot Encoding
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621
#https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

ohe = OneHotEncoder(sparse=False)  # dense output is easier to read
Mcat = ohe.fit_transform(Mcat)
ohe.inverse_transform(Mcat)  # sanity check: recover the original categories
# note: get_feature_names was removed in scikit-learn 1.2; use get_feature_names_out there
ohe_features = ohe.get_feature_names(cat_features).tolist()

M = np.concatenate((Mcat, Mnum), axis=1)

#Concatenate the columns
#M = np.concatenate((Mcat_subset, Mnum_subset), axis=1)

L = Jcodes_w_L[label].astype(int)
n_folds = 5

#EDIT: pack the arrays together into "data"
data = (M, L, n_folds)
Example #9
import pandas as pd

dataset = pd.read_csv('Churn_Modelling.csv')

#Dividing Dataset
X = dataset.iloc[:, 3:-1].values
Y = dataset.iloc[:, -1].values

#encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lb_X = LabelEncoder()
X[:, 1] = lb_X.fit_transform(X[:, 1])
X[:, 2] = lb_X.fit_transform(X[:, 2])

# categorical_features was removed in scikit-learn 0.22; use ColumnTransformer instead
from sklearn.compose import ColumnTransformer
oneh = ColumnTransformer([('geography', OneHotEncoder(sparse=False), [1])],
                         remainder='passthrough')
X = oneh.fit_transform(X)

lb_Y = LabelEncoder()
Y = lb_Y.fit_transform(Y)

#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

#scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # transform only: fit the scaler on the training data alone
Example #10
def pcafilter():

    # Setting up inputs

    parser = argparse.ArgumentParser()
    parser.add_argument('tilefile', metavar='TILEFILE', help='File containing files of tile var matrix')
    parser.add_argument('tilepath', metavar='TILEPATH', help='File containing information about tile locations')

    args = parser.parse_args()
    rcParams.update({'figure.autolayout': True})


    if not os.path.exists('Images'):
        os.makedirs('Images')

    tiledata_file = args.tilefile
    tilepath_file = args.tilepath

    print("Reading in Data...")

    tiledata = np.load(tiledata_file)
    pathdata = np.load(tilepath_file)

    tile_path = np.trunc(pathdata/(16**5))
    idx1 = tile_path >= 863
    idx2 = tile_path <= 810
    idx3 = idx2

    idxOP = np.arange(pathdata.shape[0])
    idxOP = idxOP[idx3]

    pathdata = pathdata[idx3]

    print(tiledata.shape)
    tiledata = tiledata[:,idx3]
    print(tiledata.shape)

    tiledata = tiledata + 2

    nnz = np.count_nonzero(tiledata,axis=0)
    fracnnz = np.divide(nnz.astype(float),tiledata.shape[0])

    # Only keeping data that has less than 1% missing data

    idxKeep = fracnnz >= 0.99

    tiledata = tiledata[:,idxKeep]

    print("Encoding in 1-hot...")
    print("Determing new path and varval vectors...")

    print(tiledata.shape)

    def count_unique(col):
        # number of distinct values in the column (np.unique's shape, a 1-tuple)
        return np.unique(col).shape

    invals = np.apply_along_axis(count_unique, 0, tiledata)
    invals = invals[0]

    varvals = np.full(50*tiledata.shape[1], np.nan)  # assumes at most 50 distinct values per column
    nx=0

    varlist = []

    for j in range(0,tiledata.shape[1]):
        u = np.unique(tiledata[:,j])
        varvals[nx:nx+u.size] = u
        nx = nx + u.size
        varlist.append(u)

    varvals = varvals[~np.isnan(varvals)]

    print(varvals.shape)

    enc = OneHotEncoder(sparse=True, dtype=np.uint16)

    Xtrain = enc.fit_transform(tiledata)

    print(Xtrain.shape)

    to_keep = varvals > 1
    idkTK = np.nonzero(to_keep)
    idkTK = idkTK[0]

    Xtrain = Xtrain[:,idkTK]

    scipy.sparse.save_npz('XtrainPCA.npz', Xtrain)
'''
  pclass     sex  age  sibsp  parch   fare embarked
0       2  Female   17      0      0  12.00        C
1       3  Female   37      0      0   9.59        S
2       3    Male   18      1      1  20.21        S
3       3    Male   30      0      0   7.90        S
4       3    Male   25      0      0   7.65        S
'''

print(x_train_df.columns)
'''
Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')
'''

transformer = make_column_transformer(
    (OneHotEncoder(), ['pclass', 'sex', 'embarked']), remainder='passthrough')
transformer.fit(x_train_df)
x_train = transformer.transform(x_train_df)
x_test = transformer.transform(x_test_df)
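To see which columns the fitted transformer produced, scikit-learn 1.0+ exposes get_feature_names_out on ColumnTransformer; a sketch:

print(transformer.get_feature_names_out())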

y_train = y_train_df.values
y_test = y_test_df.values

print(x_train.shape)
print(y_train.shape)
import matplotlib.pyplot as plt

"""**Importing Dataset**"""

dataset = pd.read_csv("50_Startups.csv")
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

print (x)

"""**Encoding Categorical Data**"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

print (x)

"""**Seperate Test Set and Training Set**"""

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

"""**Training the Multiple Linear Regression Model**"""

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
Example #13
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 20 18:04:03 2018

@author: admin
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("affairs.csv")
features = data.iloc[:, :-1].values
labels = data.iloc[:, -1].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# categorical_features was removed in scikit-learn 0.22; ColumnTransformer replaces it
oh = ColumnTransformer([('ohe', OneHotEncoder(sparse=False), [6, 7])],
                       remainder='passthrough')
features = oh.fit_transform(features)
features = features[:, 1:]

from sklearn.model_selection import train_test_split
f_train, f_test, l_train, l_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.25,
                                                    random_state=0)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(f_train, l_train)
l_pred = classifier.predict(f_test)
Example #14
def clean_data(data):

    # Copy data
    X = data.to_pandas_dataframe()
    X.set_index('Id',inplace=True)
    print(X.head())
    print()

    # Remove rows with missing target, separate target from predictors
    X.dropna(axis=0, subset=['SalePrice'], inplace=True)
    y = X.SalePrice 

    # Remove target and 'Utilities' 
    X.drop(['SalePrice', 'Utilities'], axis=1, inplace=True)

    print(X.shape)

    # Select object columns
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Select numeric columns
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]

    # Imputation lists

    # imputation to null values of these numerical columns need to be 'constant'
    constant_num_cols = ['GarageYrBlt', 'MasVnrArea']
    #constant_num_cols = ['MasVnrArea']
    print("constant_num_cols")
    print(constant_num_cols)
    print()

    # imputation to null values of these numerical columns need to be 'mean'
    mean_num_cols = list(set(numerical_cols).difference(set(constant_num_cols)))
    print("mean_num_cols")
    print(mean_num_cols)
    print()

    # imputation to null values of these categorical columns need to be 'constant'
    constant_categorical_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond','BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
    print("constant_categorical_cols")
    print(constant_categorical_cols)
    print()

    # imputation to null values of these categorical columns need to be 'most_frequent'
    mf_categorical_cols = list(set(categorical_cols).difference(set(constant_categorical_cols)))
    print("mf_categorical_cols")
    print(mf_categorical_cols)
    print()

    my_cols = constant_num_cols + mean_num_cols + constant_categorical_cols + mf_categorical_cols
    print("my_cols")
    print(my_cols)
    print()

    # Define transformers
    # Preprocessing for numerical data

    numerical_transformer_m = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),('scaler', StandardScaler())])

    numerical_transformer_c = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)),('scaler', StandardScaler())])

    # Preprocessing for categorical data for most frequent
    categorical_transformer_mf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse = False))])

    # Preprocessing for categorical data for constant
    categorical_transformer_c = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='NA')), ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse = False))])


    # Bundle preprocessing for numerical and categorical data
    #preprocessor = ColumnTransformer(transformers=[
    #    ('num_mean', numerical_transformer_m, mean_num_cols),
    #    ('num_constant', numerical_transformer_c, constant_num_cols),
    #    ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
    #    ('cat_c', categorical_transformer_c, constant_categorical_cols)])
    preprocessor = ColumnTransformer(transformers=[
        ('num_mean', numerical_transformer_m, mean_num_cols),
        ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
        ('cat_c', categorical_transformer_c, constant_categorical_cols)])

    X = preprocessor.fit_transform(X)
    
    
    return X, y
Example #15
df = df_subscribers.pivot(
    index='ticket_id', columns='item_name', values='item_count').fillna(0)
df_subscribers.reset_index(inplace=True)
df_subscribers.drop(columns='index', inplace=True)

# --- add back date and location
df = df.merge(df_subscribers[['ticket_id', 'location', 'order_timestamp']
                             ].drop_duplicates(), how='left', on='ticket_id')

# --- extract hour of day from datetime
df['hour'] = df['order_timestamp'].apply(get_hour)
# df['hour'] = df['order_timestamp'].apply(lambda x: x.hour)

# --- convert categorical store variables to dummies
# use sklearn.preprocessing.OneHotEncoder() to create a class object called encoded_data
encoded_data = OneHotEncoder(handle_unknown='ignore')
# call the method used to fit data for a OneHotEncorder object.
# Note: you will have to reshape data from a column of the data frame.
# useful functions may be DataFrame methods .to_list(), .reshape(), and .shape()
encoded_data.fit(X=np.array(df['location'].tolist()).reshape(df.shape[0], 1))
# fixed split with regex (raw string) to avoid IndexError; note that get_feature_names
# was removed in scikit-learn 1.2 in favour of get_feature_names_out
col_map_store_binary = dict(zip(list(encoded_data.get_feature_names()), [
    'store_' + re.split(r'x\d_', x)[1] for x in encoded_data.get_feature_names()]))

# fix transform data
df_store_binary = pd.DataFrame(
    encoded_data.fit_transform(df[['location']]).toarray())
# df_store_binary = pd.DataFrame(encoded_data.transform(
#    X=np.array(df['location'].tolist()).reshape(df.shape[0], 1)))
df_store_binary.columns = encoded_data.get_feature_names()
df_store_binary.rename(columns=col_map_store_binary, inplace=True)
    #  'credit_history': {'critical': 0,
    #   'delayed': 2,
    #   'fully repaid': 3,
    #   'fully repaid this bank': 4,
    #   'repaid': 1}}
    for col in cols:
        df[col] = df[col].map(map[col])
    return df

## 5.2 Ordinal Encoding
OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)
## 5.3 One-Hot Encoding
'''
Produces high-dimensional features; pair it with feature selection to bring the dimensionality back down.
'''
OneHotEncoder(n_values=None, categorical_features=None, categories=None, drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error')  # one-hot: n classes become n features, exactly one of which is 1 and the rest 0
# `sparse`: defaults to True (sparse-matrix output); usually converted with `.toarray()`, equivalent to sparse=False
## 5.4 Binary Encoding
'''
Represents each category as a binary number, e.g. 3 becomes 011;
needs fewer dimensions than one-hot encoding.
'''
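No code accompanies the binary-encoding note above; a sketch using the third-party category_encoders package (an assumption, installed with pip install category_encoders):

import pandas as pd
import category_encoders as ce

demo = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})
# each category's ordinal index is spread across a few binary 0/1 columns
print(ce.BinaryEncoder(cols=['color']).fit_transform(demo))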
## 5.6 Other encodings: Helmert Contrast, Sum Contrast, Polynomial Contrast, Backward Difference Contrast, etc.


LabelEncoder().fit_transform(data[feature].astype(str))  # np.str was removed in NumPy 1.24; the closing paren was missing
# one-hot encode the categorical features
data = pd.get_dummies(data, columns=['model', 'brand', 'bodyType', 'fuelType',
                                     'gearbox', 'notRepairedDamage', 'power_bin'])

Example #17
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# print(X);
# print(y);

#Handle Missing Data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# print(X);

#Encoding Categorical Data
#Encoding Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])],
                       remainder="passthrough")
X = np.array(ct.fit_transform(X))
# print(X);
#Encoding Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = np.array(le.fit_transform(y))
# print(y);

#Splitting Dataset into Training Set & Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
smoker = pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Sklearn label encoder results for smoker:")
print(le_smoker_mapping)
print(smoker[:10])

#option3: sklearn one hot encoding: maps each category to 0 (cold) or 1 (hot)

#one hot encoder = ohe

#create ndarray for one hot encoding (sklearn)
region = data.iloc[:, 5:6].values  #ndarray

## ohe for region
ohe = OneHotEncoder()

region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['northeast', 'northwest', 'southeast', 'southwest']
print("Sklearn one hot encoder results for region:")
print(region[:10])
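Hard-coding the four region names is fragile; the fitted encoder already knows them. A sketch using its categories_ attribute:

# take the learned category names instead of typing them by hand
region.columns = ohe.categories_[0]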

############################################01_05_DividingtheDataintoTestandTrain##############################################

#putting the data together:

##take the numerical data from the original data
X_num = data[['age', 'bmi', 'children']].copy()

##take the encoded data and add to numerical data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import OLS

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Encoding categorical data

labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
# encode only the categorical column (index 3); fitting the encoder on the whole
# matrix would one-hot every column, including the numeric ones
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer([('state', OneHotEncoder(sparse=False), [3])],
                                  remainder='passthrough')
X = onehotencoder.fit_transform(X)

# Avoiding the Dummy Variable Trap
X = X[:, 1:]
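Newer scikit-learn can drop one dummy per feature at encoding time, making the manual slice above unnecessary; a sketch, assuming scikit-learn 0.21+ and the same State column:

# drop='first' emits n-1 dummies per feature, avoiding the trap directly
state_dummies = OneHotEncoder(drop='first').fit_transform(dataset.iloc[:, [3]]).toarray()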

# Splitting the dataset into the Training set and Test set

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fitting Multiple Linear Regression to the Training set

regressor = LinearRegression()
allData = pd.read_csv('data_tenis.csv')
temperature = allData.iloc[:, 1:2].values  #numeric column, kept as-is
humidity = allData.iloc[:, 2:3].values  #to be predicted
outlook = allData.iloc[:, 0:1].values  #to be encoded
windy = allData.iloc[:, 3:4].values  #to be encoded
play = allData.iloc[:, 4:5].values  #to be encoded

#encoding the categorical columns (categoric -> numeric) and creating data frames
""" alternatively, encode every column with LabelEncoder and keep just the part you need:
from sklearn.preprocessing import LabelEncoder
allDataLabelEncoded = allData.apply(LabelEncoder().fit_transform)
labelEncoded = allDataLabelEncoded.iloc[:,-2:]
"""

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories="auto")

outlook = ohe.fit_transform(outlook).toarray()
outlook = pd.DataFrame(data=outlook,
                       index=range(14),
                       columns=['overcast', 'rainy', 'sunny'])

windy = ohe.fit_transform(windy).toarray()
windy = pd.DataFrame(data=windy[:, 1:], index=range(14), columns=['windy'])

play = ohe.fit_transform(play).toarray()
play = pd.DataFrame(data=play[:, 1:], index=range(14), columns=['play'])

temperature = pd.DataFrame(data=temperature,
                           index=range(14),
                           columns=['temperature'])
heatmap = (HeatMap().add_xaxis(xaxis_data=corr.index.to_list()).add_yaxis(
    '', yaxis_data=corr.index.to_list(), value=data))
heatmap.render()
"""
可以看到,“访问深度”和“平均停留时间”相关性比较高,相关性高说明两个变量在建
立模型的时候,作用是一样或者效果是一样的,可以考虑组合或者删除其一。
"""

# %%
# Data preprocessing
# 1. Missing-value handling (already done above, omitted here)

# 2. One-hot encoding
cols = df.columns[-5:].to_list()

model_ohe = OneHotEncoder(sparse=False)  # create the OneHotEncoder object
ohe_matrix = model_ohe.fit_transform(df[cols])

# 3. Standardize the numeric data:
cols = df.columns[1:-5].to_list()
model_scaler = MinMaxScaler()
scaler_matrix = model_scaler.fit_transform(df[cols])

# 4. Concatenate the categorical and numeric data
X = np.hstack((scaler_matrix, ohe_matrix))

# %%
# Build the model
# Select the best KMeans clustering model via the mean silhouette coefficient
score_list = []  # holds the mean silhouette score for each k
silhouette_int = -1  # initial threshold for the mean silhouette score
Example #22
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 11 11:05:48 2018

@author: ASUS
"""

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
data = [
    'cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot'
]
values = array(data)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
# invert first example
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)
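Since scikit-learn 0.20 the encoder accepts strings directly, so the LabelEncoder detour is optional; a sketch ('sparse' became 'sparse_output' in 1.2):

direct = OneHotEncoder(sparse=False).fit_transform(values.reshape(-1, 1))
print(direct)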
#import the dataset
dataset = pd.read_csv('Data.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

#handling missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#encoding categorical data
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

#splitting dataset into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

#feature scaling
scale_x = StandardScaler()
X_train = scale_x.fit_transform(X_train)
X_test = scale_x.transform(X_test)
#transformation of the target class by replacing attack names with "attack"
information_base['normal.'] = information_base['normal.'].replace(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'imap.', 'ipsweep.', 'land.', 'loadmodule.', 'multihop.', 'neptune.', 'nmap.', 'perl.', 'phf.', 'pod.', 'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.', 'warezclient.', 'warezmaster.'], 'attack')

#preprocessing of data, transformation of categorical values
x = information_base.iloc[:, :-1].values
y = information_base.iloc[:, 41].values

LEncoderX1 = LabelEncoder()
LEncoderX2 = LabelEncoder()
LEncoderX3 = LabelEncoder()
x[:, 1] = LEncoderX1.fit_transform(x[:, 1])
x[:, 2] = LEncoderX2.fit_transform(x[:, 2])
x[:, 3] = LEncoderX3.fit_transform(x[:, 3])

# categorical_features was removed in scikit-learn 0.22; the three chained encoders
# can be replaced by one ColumnTransformer over the original categorical columns
# (protocol_type, service, flag) - note the output column order differs
from sklearn.compose import ColumnTransformer
OHEncoder = ColumnTransformer([('ohe', OneHotEncoder(sparse=False), [1, 2, 3])],
                              remainder='passthrough')
x = OHEncoder.fit_transform(x)

LEncoderY = LabelEncoder()
y = LEncoderY.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

#scaling of data
scalerX = StandardScaler()
x_train = scalerX.fit_transform(x_train)
x_test = scalerX.transform(x_test)
Example #25
from sklearn.metrics import explained_variance_score, mean_absolute_error, \
    r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from ajna_commons.flask.conf import (DATABASE, MONGODB_URI)
from ajna_commons.conf import ENCODE
from padma.models.peso.peso import PesoModel
from padma.models.bbox.bbox import NaiveModel
from padma.models.conteiner20e40.bbox import SSDMobileModel

pesomodel = PesoModel()
bboxmodel = NaiveModel()
bboxmodel = SSDMobileModel()
encoder = OneHotEncoder()
encoder.fit([[i] for i in range(20)])

BASE_PATH = os.path.dirname(__file__)
HIST_FILE = os.path.join(BASE_PATH, 'histograms.npy')
LABEL_FILE = os.path.join(BASE_PATH, 'labels.npy')
CSV_FILE = os.path.join(BASE_PATH, 'pesovolexport.csv')
IMGOUT_PATH = os.path.join(BASE_PATH, 'images')


def make_histograms():
    histograms = []
    labels = []
    print('Connecting to MongoDB...')
    db = MongoClient(host=MONGODB_URI)[DATABASE]
    fs = GridFS(db)
# (truncated above) the training set is split in half: one half trains the tree
# ensembles, the other half trains the logistic regressions on their leaf encodings
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
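The snippet breaks off here; in the scikit-learn example this pattern comes from, the GBDT + LR pipeline is then scored the same way as the RF + LR one above, a sketch:

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)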
	ret.append(m)
	return ret

start = time.time()

inp = reader(filename1)
x_train = inp[0]
y_train = inp[1]
training_examples = inp[2]

outp = reader(filename2)
x_test = outp[0]
y_test = outp[1]
testing_examples = outp[2]

enc = OneHotEncoder()
x_com = enc.fit_transform(x_train)

x_train = x_com.toarray()

# transform only (not fit_transform), so train and test share one column layout;
# add handle_unknown='ignore' to the encoder if the test set may hold unseen values
x_com1 = enc.transform(x_test)

x_test = x_com1.toarray()

end1 = time.time()
# print(x_train)


def forward(X,WEIGHTS,BIAS):
	temp = X
new_encoded = dict_one_hot_encoder.transform(new_dict)
print(new_encoded)

X_str = np.array([['tech', 'professional'],
                  ['fashion', 'student'],
                  ['fashion', 'professional'],
                  ['sports', 'student'],
                  ['tech', 'student'],
                  ['tech', 'retired'],
                  ['sports', 'professional']])

label_encoder = LabelEncoder()
X_int = label_encoder.fit_transform(X_str.ravel()).reshape(*X_str.shape)
print(X_int)

one_hot_encoder = OneHotEncoder()
X_encoded = one_hot_encoder.fit_transform(X_int).toarray()
print(X_encoded)

# not seen in training data

new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'},
            {'interest': 'tech', 'occupation': 'unseen_occupation'}]

new_encoded = dict_one_hot_encoder.transform(new_dict)
print(new_encoded)

# new category not encountered before
new_str = np.array([['unknown_interest', 'retired'],
                    ['tech', 'unseen_occupation'],
                    ['unknown_interest', 'unseen_occupation']])
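The string-capable OneHotEncoder can zero out categories it never saw instead of raising; a sketch, assuming scikit-learn 0.20+:

ohe_unseen = OneHotEncoder(handle_unknown='ignore')
ohe_unseen.fit(X_str)
print(ohe_unseen.transform(new_str).toarray())  # unseen values encode as all-zero blocks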
@author: sidneaux
Multiple Linear Regression
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#importing a dataset
df = pd.read_csv('50_startups.csv')
X = df.iloc[:, :-1].values  # was df.iloc[:, -1], which keeps only the last column
y = df.iloc[:, 4].values  # 'dataset' was undefined; the frame is named df here

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
# categorical_features was removed in scikit-learn 0.22; use ColumnTransformer instead
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer([('state', OneHotEncoder(sparse=False), [3])],
                                  remainder='passthrough')
X = onehotencoder.fit_transform(X)

# Avoiding the dummy variable trap
X = X[:, 1:]

#using label encoder (y has two categories)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
#Splitting into training and test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Example #30
# Import the libraries
import numpy as np
import matplotlib.pyplot as mplt
import pandas as pd

dtst = pd.read_csv('EU_I_PIB.csv')
X = dtst.iloc[:, -4:].values
y = dtst.iloc[:, -5].values

# Handle the dummy variable

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEnc_X = LabelEncoder()
X[:, 0] = labelEnc_X.fit_transform(X[:, 0])
# categorical_features was removed in scikit-learn 0.22; use ColumnTransformer instead
OnehotEnc_X = ColumnTransformer([('ohe', OneHotEncoder(sparse=False), [0])],
                                remainder='passthrough')
X = OnehotEnc_X.fit_transform(X)

# split the sample
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2,random_state = 0)

# Build our multiple linear regression model (the stray leading spaces here were an IndentationError)
from sklearn.linear_model import LinearRegression
regresseur = LinearRegression()
regresseur.fit(X_train, y_train)

# Make new predictions
y_prediction = regresseur.predict(X_test)
