# Ask the user where the CSV file lives, then load it into a DataFrame.
dataset_path = input( "Enter the path of you CSV file")
dataset = pd.read_csv(dataset_path)

# Feature matrix: every column except the last one.
# Adjust the iloc patterns below to match your own dataset.
X = dataset.iloc[:, :-1].values
# Value to be predicted (dependent variable): column index 3.
Y = dataset.iloc[:, 3].values

# >>> Handling missing data
# Fill NaN entries in columns 1 and 2 using the 'mean' strategy
# (scikit-learn 0.20+).  `imp` is presumably SimpleImputer, imported above
# this fragment -- TODO confirm.
# The former `verbose=0` argument is dropped: it was the default value, and
# the keyword was removed from SimpleImputer in scikit-learn 1.3, where
# passing it raises TypeError.
X[:, 1:3] = imp(missing_values=np.nan, strategy='mean').fit_transform(X[:, 1:3])

# Legacy equivalent for scikit-learn < 0.20, kept for reference only:
#     imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)
#     imputer = imputer.fit(X[:, 1:3])          # fit on column indices 1 to 2
#     X[:, 1:3] = imputer.transform(X[:, 1:3])  # fill the NaN positions

# >>> Encoding categorical data
# Replace the labels in column 0 with integer codes.
label_encoder_X = LabelEncoder()
# NOTE(review): the source line is cut off right after `X[:, 0]`; only the
# closing parenthesis of this call has been restored.
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])
# NOTE(review): this fragment starts in the middle of an explanatory
# comment; only the visible part is reproduced here.
# In iloc[rows, columns] the first slice selects the rows and the second
# selects the columns, each written as start:stop with the stop index
# excluded; .values takes all of the selected values.
# x = independent variables
x = datasets.iloc[:, :-1].values
# y = dependent variable
y = datasets.iloc[:, 3].values

# In the real world a lot of data is missing from different columns of an
# otherwise complete record.  Deleting the whole row could throw away crucial
# information, so instead the mean of the affected column is placed in the
# missing cells.  This is how MISSING DATA is dealt with here.
from sklearn.impute import SimpleImputer as imp

# SimpleImputer is the imputation transformer for completing missing values.
imputer = imp(missing_values=np.nan, strategy='mean', copy=True)
# Fit on columns 1 and 2 and write the completed values back in one step;
# `imputer` stays bound to the fitted instance.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Machine learning works on numerical data, so character data has to be
# converted.  In this dataset two columns hold character data, Country and
# Purchased.  They are categorical variables, i.e. their values fall into
# categories: Purchased can be Yes or No, Country can be Germany, Spain or
# France.  The goal now is to encode these categorical variables numerically.
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
# fit_transform returns the encoded column 0, which replaces the original.
x[:, 0] = label.fit_transform(x[:, 0])
# NOTE(review): the same encoder instance is refit on y here, so from this
# point on `label` holds the classes of y, not those of column 0.
y = label.fit_transform(y)

# The problem now is that the countries have been assigned 0, 1 and 2, which
# creates a preferential order.
# NOTE(review): the source is cut off in the middle of this comment.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# >>> Importing dataset
dataset = pd.read_csv('data.csv')
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Value to be predicted (dependent variable): column index 3.
Y = dataset.iloc[:, 3].values

# >>> Handling missing data
# Fill NaN entries in columns 1 and 2 using the 'mean' strategy
# (scikit-learn 0.20+).  `imp` is presumably SimpleImputer, imported above
# this fragment -- TODO confirm.
# The former `verbose=0` argument is dropped: it was the default value, and
# the keyword was removed from SimpleImputer in scikit-learn 1.3, where
# passing it raises TypeError.
X[:, 1:3] = imp(missing_values=np.nan, strategy='mean').fit_transform(X[:, 1:3])

# Legacy equivalent for scikit-learn < 0.20, kept for reference only:
#     imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)
#     imputer = imputer.fit(X[:, 1:3])          # fit on column indices 1 to 2
#     X[:, 1:3] = imputer.transform(X[:, 1:3])  # fill the NaN positions

# >>> Encoding categorical data
# Replace the labels in column 0 with integer codes.
label_encoder_X = LabelEncoder()
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])

# NOTE(review): the source is cut off in the middle of the next statement.
# Its arguments are not visible, so it is kept verbatim as a comment instead
# of being completed by guesswork:
# one_hot_encoder = OneHotEncoder(
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer as imp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Import the dataset: X takes every column except the last one, y takes
# column index 3.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Missing values: fill NaN entries in columns 1 and 2 using the 'mean'
# strategy.  fit_transform fits and applies in one step; `imputer` stays
# bound to the fitted instance.
imputer = imp(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# One-hot encode column 0 (registered under the name "Country");
# remainder='passthrough' keeps the other columns instead of dropping them.
enc = ColumnTransformer(
    [("Country", OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
)
X = enc.fit_transform(X)

# Split the dataset into a training set and a test set (test_size=0.2);
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Feature scaling: the scaler is only created here; nothing in the visible
# source fits or applies it yet.
scaler = StandardScaler()