コード例 #1
0
# Ask the user for the CSV location and load it into a DataFrame.
dataset_path = input(
    "Enter the path of you CSV file")  # path of the CSV file to load

dataset = pd.read_csv(dataset_path)  # read the data from the CSV file

# Features: every column except the last one.
# (Change the iloc pattern according to the need and your dataset.)
X = dataset.iloc[:, :-1].values
# Value to be predicted (dependent variable): the column at index 3.
# (Change the iloc pattern according to the need and your dataset.)
Y = dataset.iloc[:, 3].values

# >>> Handling missing data

# For scikit-learn 0.20+ (`imp` is presumably SimpleImputer imported under
# that alias -- confirm against the file's imports).
# Replaces NaN entries in columns 1 and 2 with the mean of their column.
# The former `verbose=0` argument is intentionally not passed: it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises a
# TypeError; 0 was its default, so the result is unchanged.
# (Change the column slice according to the need and your dataset.)
X[:, 1:3] = imp(missing_values=np.nan, strategy='mean').fit_transform(
    X[:, 1:3])

# For scikit-learn versions older than 0.20, use the legacy Imputer instead
# (change the column slice according to the need and your dataset):
#
# imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)  # initialise the Imputer object (this calls its __init__)
# imputer = imputer.fit(X[:, 1:3])            # fit the imputer on columns 1 to 2
# X[:, 1:3] = imputer.transform(X[:, 1:3])    # replace the NaN positions in the original data

# >>> Encoding categorical data

label_encoder_X = LabelEncoder()  #creates an instance of LabelEncoder
X[:, 0] = label_encoder_X.fit_transform(
    X[:, 0]
Here the first slice (before the comma) selects the rows, i.e. starting row:ending row+1,
and the second slice (after the comma) selects the columns, i.e. starting column:ending column+1.
Lastly, .values returns the selected data as a NumPy array.
x=independent variable'''
# x = independent variables: every column except the last one.
x = datasets.iloc[:, :-1].values

# y = dependent variable: the column at index 3.
y = datasets.iloc[:, 3].values

# In the real world a lot of data is missing from different columns of a
# record. Deleting the whole row is tempting, but the row may still hold
# crucial information, so instead the mean of the affected column is
# computed and written into the missing positions.
# This is how MISSING DATA is handled here.
from sklearn.impute import SimpleImputer as imp

# sklearn.impute holds transformers for missing-value imputation;
# SimpleImputer is the imputation transformer for completing missing values.
# (Use Ctrl+I in the IDE to read about the remaining parameters.)
imputer = imp(missing_values=np.nan, strategy='mean', copy=True)
# Fit on columns 1-2 and fill their NaN entries in a single step.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Machine-learning models work on numerical data, so the columns that hold
# text must be converted. The comments here refer to a dataset with two such
# columns, Country and Purchased. These are categorical variables, i.e. their
# values fall into categories: Purchased can be 1. Yes 2. No, and Country can
# be 1. Germany 2. Spain 3. France.
# The goal now is to encode the categorical variables in numerical form.
from sklearn.preprocessing import LabelEncoder

# The feature column gets its own encoder so that its category mapping is
# not overwritten when the target is encoded below.
label_x = LabelEncoder()
# fit_transform returns the integer-encoded column 0, which replaces x[:, 0].
x[:, 0] = label_x.fit_transform(x[:, 0])

# `label` ends up fitted on y, exactly as before this change.
label = LabelEncoder()
y = label.fit_transform(y)
'''now the problem is that it will assign 0, 1 and 2 to the different countries, but this will create a preferential order
コード例 #3
0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# >>> Importing dataset

dataset = pd.read_csv('data.csv')  # read the data from the CSV file

# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Value to be predicted (dependent variable): the column at index 3.
Y = dataset.iloc[:, 3].values

# >>> Handling missing data

# For scikit-learn 0.20+ (`imp` is presumably SimpleImputer imported under
# that alias -- confirm against the file's imports).
# Replaces NaN entries in columns 1 and 2 with the mean of their column.
# The former `verbose=0` argument is intentionally not passed: it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises a
# TypeError; 0 was its default, so the result is unchanged.
X[:, 1:3] = imp(missing_values=np.nan, strategy='mean').fit_transform(
    X[:, 1:3])

# For scikit-learn versions older than 0.20, use the legacy Imputer instead:
#
# imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)  # initialise the Imputer object (this calls its __init__)
# imputer = imputer.fit(X[:, 1:3])            # fit the imputer on columns 1 to 2
# X[:, 1:3] = imputer.transform(X[:, 1:3])    # replace the NaN positions in the original data

# >>> Encoding categorical data

label_encoder_X = LabelEncoder()  # create an instance of LabelEncoder
# Fit the encoder on the first column of X and store the integer codes
# back into that column.
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])
one_hot_encoder = OneHotEncoder(
コード例 #4
0
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer

# scikit-learn helpers used by the preprocessing steps below.
from sklearn.impute import SimpleImputer as imp
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset: every column but the last is a feature, the column at
# index 3 is the target.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Missing values: replace NaN entries in columns 1 and 2 by the column mean.
imputer = imp(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# One-hot encode column 0 (labelled "Country") and pass the remaining
# columns through untouched.
enc = ColumnTransformer(
    transformers=[("Country", OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
)
X = enc.fit_transform(X)

# Split into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature scaling: create the scaler.
scaler = StandardScaler()