class data_getter:
    """Load the training data from a CSV source while logging each step."""

    def __init__(self, file_object):
        self.logger = App_Logger()
        self.log_file = file_object

    def data_load(self, file):
        """Load the data from ``file`` into a pandas DataFrame.

        Method Name: data_load
        Description: Reads the CSV with ``pd.read_csv``; the marker ``?`` is
                     treated as a missing value.
        Output: Returns a DataFrame, which is our data for training.
        On Failure: Logs the failure and re-raises the original exception.
        """
        self.logger.log(self.log_file, "Entering into DATA  GETTER METHOD")
        try:
            self.logger.log(
                self.log_file,
                "Now we are starting data gathering from the file source")
            data = pd.read_csv(file, na_values='?')
            self.logger.log(
                self.log_file,
                "Now we have gathered the data frome the source and converted it into a pandas dataframe"
            )
            return data
        except Exception:
            self.logger.log(self.log_file,
                            "oops!!Data gathering not succesful")
            # Bare raise keeps the original traceback intact.
            raise
class preprocess:
    """Gather the raw automobile CSV, fix column types and impute missing values.

    Each method appends its progress messages to a preprocessing log file that
    it opens for the duration of the call.
    """

    def __init__(self, file):
        self.logger = App_Logger()
        self.file = file

    def gather(self):
        """Read ``self.file`` as a header-less CSV and name its 26 columns.

        ``?`` entries are parsed as missing values.

        Output: Returns the DataFrame with the column names assigned.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open(
                r'C:\Users\poorvi\Desktop\auto_project\Training_logs\training_preprocessing_logs.txt',
                "a+") as log_file:
            try:
                self.logger.log(log_file, "DATA is being gathered ")

                auto_data = pd.read_csv(self.file, header=None, na_values="?")
                self.logger.log(log_file, "DATA gathering completed ")

                auto_data.columns = [
                    "symboling", "normalized-losses", "make", "fuel-type",
                    "aspiration", "num-of-doors", "body-style", 'drive-wheels',
                    'engine-location', 'wheel-base', 'length', 'width',
                    'height', 'curb-weight', 'engine-type', 'num-of-cylinder',
                    'engine-size', 'fuel-system', 'bore', 'stroke',
                    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
                    'highway-mpg', 'price'
                ]
                self.logger.log(log_file, "columns for data set has been set")
                return auto_data
            except Exception:
                self.logger.log(log_file, "Files gathering is not succesful")
                raise

    def set_types(self, auto_data):
        """Cast the numeric columns that may have been parsed as text to float.

        Output: Returns the same DataFrame with the six columns cast to float.
        On Failure: Logs the failure and re-raises the original exception.
        """
        float_columns = ("normalized-losses", "bore", "stroke", "horsepower",
                         "peak-rpm", "price")
        # The log is opened exactly once; this is the file the messages of
        # this method have always been written to.
        with open("./Training_logs/preprocessing_logs.txt", "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we  will set the types of data into required")

                for column in float_columns:
                    auto_data[column] = auto_data[column].astype("float")

                self.logger.log(log_file,
                                "DATA Types has been set for each feature")
                return auto_data
            except Exception:
                self.logger.log(log_file,
                                "setting data types was not completed")
                raise

    def imputation(self, auto_data):
        """Impute missing values and write the result to the preprocessed CSV.

        Numeric feature columns are filled with their mean, rows whose target
        ``price`` is missing are dropped, and non-numeric columns are filled
        with their most frequent value.

        Output: None; the DataFrame is modified in place and saved with
                ``to_csv``.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open(
                r'C:\Users\poorvi\Desktop\auto_project\Training_logs\training_preprocessing_logs.txt',
                "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we  will remove the missing values from the data")

                # Index.drop returns a new Index, so the result must be kept;
                # otherwise the target would be mean-imputed and the dropna
                # below could never remove a row.
                num_col = auto_data.select_dtypes(
                    include=[np.number]).columns.drop("price")
                mean_imputer = SimpleImputer(missing_values=np.nan,
                                             strategy='mean')
                mean_imputer.fit(auto_data[num_col])
                auto_data[num_col] = mean_imputer.transform(auto_data[num_col])

                self.logger.log(
                    log_file,
                    "missing values imputation for numerical data is done ,,,,, Now we  will handle the target variable"
                )

                auto_data.dropna(subset=["price"], axis=0, inplace=True)
                auto_data.reset_index(drop=True, inplace=True)

                cat_col = auto_data.select_dtypes(exclude=[np.number]).columns
                mode_imputer = SimpleImputer(missing_values=np.nan,
                                             strategy='most_frequent')
                mode_imputer.fit(auto_data[cat_col])
                auto_data[cat_col] = mode_imputer.transform(auto_data[cat_col])
                self.logger.log(log_file,
                                "Imputations of missing values is done----")
                auto_data.to_csv(
                    r"C:\Users\poorvi\Desktop\auto_project\Training_preprocessing\preprocessed_file.csv"
                )
            except Exception:
                self.logger.log(log_file,
                                "Imputation of missing values failed")
                raise
 def __init__(self, file):
     # Constructor fragment: creates an App_Logger and stores ``file`` on the instance.
     # NOTE(review): the enclosing class header is not visible here — confirm which class this belongs to.
     self.logger = App_Logger()
     self.file = file
Ejemplo n.º 4
0
 def __init__(self, data, file_object):
     # Constructor fragment: stores the log handle and the data, then creates an App_Logger.
     # NOTE(review): the enclosing class header is not visible here — confirm which class this belongs to.
     self.log_file = file_object
     self.data = data
     self.logger = App_Logger()
Ejemplo n.º 5
0
class model_fit:
    """Fit and tune a random-forest regressor on preprocessed data."""

    def __init__(self, data, file_object):
        self.log_file = file_object
        self.data = data
        self.logger = App_Logger()

    def training(self):
        """Tune a RandomForestRegressor on ``self.data`` and save it.

        Method Name: training
        Description: Splits ``self.data`` into train/test sets (target column
                     ``price``), runs a RandomizedSearchCV over the forest
                     hyperparameters on the training set and dumps the fitted
                     search object to ``model.pkl`` with joblib.
        Output: None; the fitted search object is written to ``model.pkl``.
        On Failure: Logs the failure and re-raises the original exception.

        ``self.log_file`` is closed when the method finishes, whether it
        succeeds or fails.
        """
        try:
            self.logger.log(self.log_file, "Entering into training method ")
            self.logger.log(
                self.log_file,
                "Now we willl firstly split the data into training and testing set"
            )
            features = self.data.drop('price', axis=1)
            target = self.data['price']
            # The held-out part of the split is not used by this method.
            x_train, _x_test, y_train, _y_test = train_test_split(
                features, target, test_size=0.25, random_state=3)
            self.logger.log(self.log_file,
                            "Dataset splitting succesfully done")

            ##fitting with random forest regressor()
            self.logger.log(
                self.log_file,
                "Now we will fit the randomforestregressor on the training and test set"
            )
            rf = RandomForestRegressor()
            rf.fit(x_train, y_train)
            self.logger.log(
                self.log_file,
                "Randomforestregressr fitted succesfully on the training set")

            ##Now applying tuning on the randomforestregressor
            self.logger.log(
                self.log_file,
                "Now we will perfrom hyperparameter tuning on the randomforestregressor for better  results"
            )
            self.logger.log(self.log_file,
                            "Now we are setting best paramterers range")
            random_grid = {
                # Number of trees in random forest
                'n_estimators': [
                    int(x) for x in np.linspace(start=100, stop=1200, num=12)
                ],
                # Number of features to consider at every split. 1.0 means
                # "all features", which is what 'auto' meant for regressors;
                # 'auto' itself is rejected by newer scikit-learn releases.
                'max_features': [1.0, 'sqrt'],
                # Maximum number of levels in tree
                'max_depth': [int(x) for x in np.linspace(5, 30, num=6)],
                # Minimum number of samples required to split a node
                'min_samples_split': [2, 5, 10, 15, 100],
                # Minimum number of samples required at each leaf node
                'min_samples_leaf': [1, 2, 5, 10],
            }
            self.logger.log(self.log_file,
                            "Best parameters ranged succesfullly")
            rf_random = RandomizedSearchCV(estimator=rf,
                                           param_distributions=random_grid,
                                           scoring='neg_mean_squared_error',
                                           n_iter=10,
                                           cv=5,
                                           random_state=42,
                                           n_jobs=1)
            self.logger.log(
                self.log_file,
                "Randomized search cv done on randomforestregressor")
            rf_random.fit(x_train, y_train)
            self.logger.log(
                self.log_file,
                "fitting model with best parameters on the training set")
            joblib.dump(rf_random, 'model.pkl')
            self.logger.log(self.log_file, "saving the best model")
        except Exception:
            self.logger.log(
                self.log_file,
                "looks like there is some error in model training !!!try with removing errors"
            )
            raise
        finally:
            # Same as before: the handle passed to the constructor is closed
            # once training ends, on both the success and the failure path.
            self.log_file.close()
class tuning:
    """Hyperparameter tuning helpers for the XGBoost and random-forest regressors."""

    def __init__(self):
        self.logger = App_Logger()

    def tuning_xgboost(self, x_train, y_train, x_test, y_test):
        """Grid-search the XGBoost regressor and score the best estimator.

        Output: Returns ``best_estimator_.score(x_test, y_test)``.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open(r"./Training_logs/training_model_tuning_logs.txt",
                  "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Nowe will tune the xgboost regressor with GridSearchCV")
                # NOTE(review): assumes the name ``xgboost`` is bound to a
                # regressor class (not the xgboost module) — confirm the import.
                xgr = xgboost()
                self.logger.log(log_file, "Now setting parameter range")
                params = {
                    'min_child_weight': [4, 5],
                    'gamma': [i / 10.0 for i in range(3, 6)],
                    'subsample': [i / 10.0 for i in range(6, 11)],
                    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
                    'max_depth': [2, 3, 4],
                }
                self.logger.log(log_file,
                                "Estimating the best parameters for xgboost")
                grid = GridSearchCV(xgr, params)
                self.logger.log(log_file,
                                "Best parameters estimation succesful")
                grid.fit(x_train, y_train)
                self.logger.log(log_file,
                                "NOW fitting tuned model on the training set")
                # Score while the log is still open so a scoring failure can
                # be logged by the handler below.
                return grid.best_estimator_.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file, "TUNING xgboost not succesful")
                raise

    def tuning_rf(self, x_train, y_train, x_test, y_test):
        """Randomized-search the random-forest regressor and save the result.

        ``x_test`` and ``y_test`` are accepted for signature parity with
        ``tuning_xgboost`` but are not used.

        Output: None; the fitted search object is dumped with joblib.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open(r"./Training_logs/training_model_tuning_logs.txt",
                  "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Nowe will tune the randomforest regressor with RandomizedSearchCV"
                )
                rf = RandomForestRegressor()
                self.logger.log(log_file, "Now setting parameter range")
                random_grid = {
                    'n_estimators': [
                        100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
                        1100, 1200
                    ],
                    # 1.0 means "all features", which is what 'auto' meant for
                    # regressors; 'auto' is rejected by newer scikit-learn.
                    'max_features': [1.0, 'sqrt'],
                    'max_depth': [5, 10, 15, 20, 25, 30],
                    'min_samples_split': [2, 5, 10, 15, 100],
                    'min_samples_leaf': [1, 2, 5, 10],
                }
                self.logger.log(
                    log_file,
                    "Estimating the best parameters for randomforest")
                rf_random = RandomizedSearchCV(
                    estimator=rf,
                    param_distributions=random_grid,
                    scoring='neg_mean_squared_error',
                    n_iter=10,
                    cv=5,
                    random_state=42,
                    n_jobs=1)
                self.logger.log(log_file,
                                "Best parameters estimation succesful")
                self.logger.log(log_file,
                                "NOW fitting tuned model on the training set")
                rf_random.fit(x_train, y_train)
                joblib.dump(rf_random,
                            r"C:\Users\poorvi\Desktop\auto_project\model.pkl")
            except Exception:
                # The message now names the model that actually failed.
                self.logger.log(log_file, "TUNING randomforest not succesful")
                raise
 def __init__(self, file_object):
     # Constructor fragment: creates an App_Logger, stores the log handle and
     # writes a start-of-preprocessing message to it.
     # NOTE(review): the enclosing class header is not visible here — confirm which class this belongs to.
     self.logger = App_Logger()
     self.log_file = file_object
     self.logger.log(self.log_file,
                     "Now we are starting the preprocessing of the data")
class preprocess:
    """Step-by-step preprocessing of the automobile data set.

    Every step logs its progress to the file object given at construction and
    re-raises any exception after logging it.
    """

    def __init__(self, file_object):
        self.logger = App_Logger()
        self.log_file = file_object
        self.logger.log(self.log_file,
                        "Now we are starting the preprocessing of the data")

    def set_columns(self, data):
        """
                        Method Name: set_columns
                        Description: This method sets the column names for each of the 26 columns
                        Output: Returns the same DataFrame with proper column names
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "Now firstly we will set the names for each column i.e column index"
        )
        try:
            data.columns = [
                "symboling", "normalized-losses", "make", "fuel-type",
                "aspiration", "num-of-doors", "body-style", 'drive-wheels',
                'engine-location', 'wheel-base', 'length', 'width', 'height',
                'curb-weight', 'engine-type', 'num-of-cylinder', 'engine-size',
                'fuel-system', 'bore', 'stroke', 'compression-ratio',
                'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'
            ]
            self.logger.log(self.log_file,
                            "COLumn index set for each features succesfully")
            return data

        except Exception:
            self.logger.log(
                self.log_file,
                "oops!! column index for the columns can not be succesfully set"
            )
            raise

    def target(self, data):
        """
                        Method Name: target
                        Description: This method drops the rows whose target ``price`` is missing
                                     and resets the index (both in place)
                        Output: Returns the same DataFrame without missing-target rows
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "This method will return the target variable further for model building "
        )
        try:
            self.logger.log(self.log_file,
                            "Firstly preprocesing the target variable")
            data.dropna(subset=["price"], axis=0, inplace=True)
            self.logger.log(
                self.log_file,
                "Now setting indexes back to normal after droping missing values rows from target variable"
            )
            data.reset_index(drop=True, inplace=True)
            self.logger.log(self.log_file,
                            "target variable preprocessing done")
            return data
        except Exception:
            self.logger.log(self.log_file,
                            "Target variable preprocessing not succesful")
            raise

    def remove_columns(self, data):
        """
                        Method Name: remove_columns
                        Description: This method keeps only the columns used for model building
                        Output: Returns a new DataFrame holding only the important features
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "Now  we come to the third step of preprocessing i.e removing unnecessary columns"
        )
        try:
            self.logger.log(
                self.log_file,
                "Here we are reomving some unnnecessary columns from the data which are of no use in the model building "
            )
            # Explicit copy: later steps assign into this frame, which would
            # otherwise trigger pandas' SettingWithCopy handling.
            useful_data = data[[
                "length", "width", 'horsepower', 'curb-weight', "engine-size",
                "city-mpg", "highway-mpg", 'drive-wheels', 'num-of-cylinder',
                'price'
            ]].copy()
            self.logger.log(
                self.log_file,
                "we have succesfully removed our unnnecessary columns ")
            return useful_data

        except Exception:
            self.logger.log(
                self.log_file,
                "Removal fo unncessary columns was not successful")
            raise

    def set_type(self, data):
        """
                        Method Name: set_type
                        Description: This method sets the data type of each column correctly
                                     (numeric features to float, categorical ones to object)
                        Output: Returns the same DataFrame with the corrected data types
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "Now we are entering to third preprocessing step i.e setting correct daat type for each feature"
        )
        float_columns = ('length', 'width', 'horsepower', 'curb-weight',
                         'engine-size', 'city-mpg', 'highway-mpg')
        object_columns = ('drive-wheels', 'num-of-cylinder')
        try:
            self.logger.log(
                self.log_file,
                "Here we are setting required data types for each column and then returning correct dataframe"
            )
            # Item assignment is used throughout; attribute-style assignment
            # only works for names that are valid identifiers.
            for column in float_columns:
                data[column] = data[column].astype('float')
            for column in object_columns:
                data[column] = data[column].astype('object')
            self.logger.log(
                self.log_file,
                "we have succesfully set the correct data type for each column"
            )
            return data

        except Exception:
            self.logger.log(
                self.log_file,
                "looks like there is some error occured in setting data types for each columns"
            )
            raise

    def imputation(self, data):
        """
                        Method Name: imputation
                        Description: This method fills missing values: numeric features (except
                                     ``price``) with the mean, non-numeric columns with the mode
                        Output: Returns the same DataFrame with the missing values imputed
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "Now we are starting the next step of preprocessing i.e imputation of missing values"
        )
        try:
            self.logger.log(
                self.log_file,
                "NOW WE are starting to impute missing values as per reuirements on the columns"
            )
            num_col = data.select_dtypes(
                include=[np.number]).columns.drop('price')
            cat_col = data.select_dtypes(exclude=[np.number]).columns
            mean_imputer = SimpleImputer(missing_values=np.nan,
                                         strategy='mean')
            self.logger.log(
                self.log_file,
                "imputing the numerical columns Nan VALUES WITH MEAN")
            data[num_col] = mean_imputer.fit_transform(data[num_col])
            mode_imputer = SimpleImputer(missing_values=np.nan,
                                         strategy='most_frequent')
            self.logger.log(
                self.log_file,
                "nOW WE ARE IMPUTING THE CATEGORICAL COLUMNS MISSING VALUES WITH MODE"
            )
            data[cat_col] = mode_imputer.fit_transform(data[cat_col])
            self.logger.log(self.log_file,
                            "IMPUTATION OF MISSING VALUES IS COMPLETED")
            return data

        except Exception:
            self.logger.log(
                self.log_file,
                "LOOKS LIKE THERE IS SOME ERROR IN IMPUTING ISSING VALUES")
            raise

    def feature_remove(self, data):
        """
                        Method Name: feature_remove
                        Description: This method adds ``area`` (length * width) and ``miles``
                                     (city-mpg - highway-mpg) and drops the four source columns
                                     from ``data`` in place
                        Output: Returns a new DataFrame with the columns in their final order
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "now we have entered in the step of feature removal or adding")
        try:
            self.logger.log(
                self.log_file,
                "Now we will add some featurs new and remove some old features"
            )
            data['area'] = data['length'] * data['width']
            data['miles'] = data['city-mpg'] - data['highway-mpg']
            self.logger.log(self.log_file,
                            "adding two new features area and miles")
            data.drop(columns=['length', 'width', 'city-mpg', 'highway-mpg'],
                      inplace=True)
            self.logger.log(self.log_file,
                            "removing four old features on their places")
            # Explicit copy so that later in-place steps work on an
            # independent frame instead of a selection of ``data``.
            arrange_data = data[[
                'horsepower', 'curb-weight', 'engine-size', 'drive-wheels',
                'num-of-cylinder', 'miles', 'area', 'price'
            ]].copy()
            self.logger.log(self.log_file,
                            "feature engineering completed succesfully")
            return arrange_data

        except Exception:
            self.logger.log(self.log_file, "featuree engineering unsuccesful")
            raise

    def scaling(self, data):
        """
                        Method Name: scaling
                        Description: This method scales all numerical features (except ``price``)
                                     with MinMaxScaler and relabels the cylinder counts "three"
                                     and "twelve" as "eight"
                        Output: Returns the same DataFrame with the scaled numerical columns
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "In this step we are gonna scale all numerical features in the same range"
        )
        try:
            self.logger.log(
                self.log_file,
                "here we have started scaling the features with MinMaxScaler]")
            num_col = data.select_dtypes(
                include=[np.number]).columns.drop('price')
            sc = MinMaxScaler()
            data[num_col] = sc.fit_transform(data[num_col])
            # Assign the result back: an in-place replace on the selected
            # column is a chained operation that does not reliably update
            # the frame under pandas copy-on-write.
            data['num-of-cylinder'] = data['num-of-cylinder'].replace({
                "three": "eight",
                "twelve": "eight"
            })
            self.logger.log(
                self.log_file,
                "Now here we have scaled all the numerical features in the same range"
            )
            return data

        except Exception:
            self.logger.log(self.log_file,
                            "oops!!   feature scaling not succesfull")
            raise

    def encoding(self, data):
        """
                        Method Name: encoding
                        Description: This method encodes the categorical features as dummy variables
                                     (first level dropped) for machine learning algorithms
                        Output: Returns a new DataFrame containing the encoded columns
                        On Failure: Raise Exception .
        """
        self.logger.log(
            self.log_file,
            "Now it is the end step of preprocessing i.e encoding categorical variables"
        )
        try:
            self.logger.log(
                self.log_file,
                "here we are using dummy variables  function for encoding categorical features"
            )
            encoded_data = pd.get_dummies(data, drop_first=True)
            self.logger.log(self.log_file,
                            "encoding categorical feature done succesfully")

            return encoded_data

        except Exception:
            self.logger.log(
                self.log_file,
                "oops!! encoding categorical features can not be succesfully done"
            )
            raise
Ejemplo n.º 10
0
##importing required libraries
from flask import Flask, flash, render_template, request, redirect
import flask_monitoringdashboard as dashboard
from werkzeug.utils import secure_filename
import csv
from predictionfolder.prediction import predict
from logs.logger import App_Logger
import requests
import pandas as pd
import joblib
from retraining import retraining

## logger and log handle shared by the user-interface module
flask_log = App_Logger()
file_object = open('./flask_logs.txt', 'a+')
flask_log.log(file_object, "starting user interface")

## extensions an input file must have to be accepted
flask_log.log(file_object, "setting allowed files extensions for file input")
ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'data'}

## value assigned to UPLOAD_FOLDER
UPLOAD_FOLDER = './Charts'


##function to check whether a file has one of the allowed extensions
def allowed_file(filename):
    """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS.

    The check is case-insensitive and uses the text after the last dot.
    A log line is appended to ``./flask_logs.txt`` on every call.
    """
    # Open the log only for the duration of the call; a fresh handle used to
    # be opened on every call and was never closed.
    with open('./flask_logs.txt', 'a+') as log_handle:
        flask_log.log(log_handle, "checking if file is in correct extension")
    return '.' in filename and filename.rsplit(
        '.', 1)[1].lower() in ALLOWED_EXTENSIONS
class model_building:
    """Split the preprocessed data and fit baseline regressors on it."""

    # NOTE(review): this handle is opened when the class is defined and is not
    # used by any method below (each opens its own) — kept because removing it
    # would change the class's attributes and definition-time side effect.
    log_file = open("./Training_logs/training_model_building_logs.txt", "a+")

    def __init__(self, file):
        self.logger = App_Logger()
        self.file = file

    def data_splitting(self):
        """Load ``self.file`` and split it into training and test sets.

        Method Name: data_splitting
        Description: Reads the CSV at ``self.file`` and splits it 75/25 with
                     ``price`` as the target (``random_state=3``).
        Output: Returns ``x_train, y_train, x_test, y_test`` (in that order).
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open("./Training_logs/training_model_building_logs.txt",
                  "a+") as log_file:
            try:
                data = pd.read_csv(self.file)
                self.logger.log(log_file, "Data splitting is now started")
                features = data.drop("price", axis=1)
                target = data['price']
                x_train, x_test, y_train, y_test = train_test_split(
                    features, target, test_size=0.25, random_state=3)
                self.logger.log(
                    log_file,
                    "Data is now splitted into training and test set")
                return x_train, y_train, x_test, y_test
            except Exception:
                self.logger.log(log_file, "Data splitting is not finished")
                raise

    def randomforest_reg(self, x_train, y_train, x_test, y_test):
        """Fit a RandomForestRegressor and score it on the test set.

        Method Name: randomforest_reg
        Description: Fits a default RandomForestRegressor on the training data.
        Output: Returns ``rf.score(x_test, y_test)``.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open("./Training_logs/training_model_building_logs.txt",
                  "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will fit RandomForestRegressor in the training set"
                )
                rf = RandomForestRegressor()
                rf.fit(x_train, y_train)
                self.logger.log(
                    log_file,
                    "RandomForestRegressor is now fitted on to the training set"
                )
                # Score while the log is still open so a scoring failure can
                # be logged by the handler below.
                return rf.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file,
                                "Model fitting randomforest not succesful")
                raise

    def xgboost_reg(self, x_train, y_train, x_test, y_test):
        """Fit the regressor used for the "xgboost" step and score it.

        Method Name: xgboost_reg
        Description: Fits a regressor on the training data and logs the step
                     as the XGBoost fit.
        Output: Returns ``xg.score(x_test, y_test)``.
        On Failure: Logs the failure and re-raises the original exception.
        """
        with open("./Training_logs/training_model_building_logs.txt",
                  "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will fit Xgboostregressor in the training set")
                # NOTE(review): despite the method name and log messages this
                # fits a RandomForestRegressor — presumably an XGBoost
                # regressor was intended; confirm and swap in the real class.
                xg = RandomForestRegressor()
                xg.fit(x_train, y_train)
                self.logger.log(
                    log_file,
                    "Xgboostregressor is now fitted on to the training set")
                return xg.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file,
                                "Model fitting xgboost not succesful")
                raise
 def __init__(self, file_object):
     # Constructor fragment: creates an App_Logger and stores the log handle on the instance.
     # NOTE(review): the enclosing class header is not visible here — confirm which class this belongs to.
     self.logger = App_Logger()
     self.log_file = file_object