def train_and_evaluate(config_path):
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    model_dir = config["model_dir"]
    file_object = open('Training_log.txt', 'a+')
    logger = App_Logger()

    df = pd.read_csv(train_data_path)  # Reading the processed dataset
    df["date"] = pd.to_datetime(df["date"]).dt.date
    X_train = df[df['date'] <= datetime.date(2017, 5, 31)]  # splitting the dataset based on date for training data
    val_X = df[df['date'] > datetime.date(2017, 5, 31)]  # splitting the dataset based on date for validation data
    logger.log(file_object, "Splitting dataset completed")

    X_train = X_train.drop(['date'], axis=1)
    val_X = val_X.drop(['date'], axis=1)
    y_train = np.log1p((X_train["transactionRevenue"]).values)
    val_y = np.log1p((val_X["transactionRevenue"]).values)
    logger.log(file_object, "Log transformation of transaction Revenue values completed")

    x1 = X_train.drop(['transactionRevenue'], axis=1)
    val_x1 = val_X.drop(['transactionRevenue'], axis=1)
    y_train = pd.DataFrame(y_train)
    val_y = pd.DataFrame(val_y)

    ################## MLFLOW ######################
    mlflow_config = config["mlflow_config"]
    remote_server_uri = mlflow_config['remote_server_uri']
    mlflow.set_tracking_uri(remote_server_uri)
    mlflow.set_experiment(mlflow_config["experiment_name"])

    with mlflow.start_run(run_name=mlflow_config["run_name"]) as mlops_run:
        model_xgb = run_xgb(x1, y_train)
        y_train_predict = model_xgb.predict(x1)
        rmse, mae, r2 = eval_metrics(y_train, y_train_predict)

        mlflow.log_param("n_estimators", 1200)
        mlflow.log_param("learning_rate", 0.5)
        mlflow.log_param("max_depth", 8)
        mlflow.log_metric('rmse', rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(
                model_xgb,
                "model",
                registered_model_name=mlflow_config["registered_model_name"])
        else:
            # local file store: log the model without registering it
            mlflow.sklearn.log_model(model_xgb, "model")

    ##################### Saving the model as pickle file ################################
    logger.log(file_object, "Model file created successfully")
    file_object.close()
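# eval_metrics is called above but not defined in this snippet. A minimal sketch
# of the assumed helper is given below: standard regression metrics (RMSE, MAE, R2)
# computed with scikit-learn, matching the three values logged to MLflow.
def eval_metrics(actual, pred):
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    rmse = np.sqrt(mean_squared_error(actual, pred))  # root mean squared error
    mae = mean_absolute_error(actual, pred)           # mean absolute error
    r2 = r2_score(actual, pred)                       # coefficient of determination
    return rmse, mae, r2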
def __init__(self):
    self.client = pymongo.MongoClient('mongodb://127.0.0.1:27017')
    self.path = 'Training_Database'
    self.good_file_path = 'Prediction_Raw_Validated_File/Good_Raw'
    self.bad_file_path = 'Prediction_Raw_Validated_File/Bad_Raw'
    self.FileFromDB = 'PredictionFileFrom_DB'
    self.logger = App_Logger()
def __init__(self, batch_file_path):
    self.raw_data = Raw_Data_Validation(batch_file_path)
    self.db_operation = DB_Operations()
    self.preprocess_beforeDB = preprocessing_beforeDB()
    self.file_object = open('Training_Logs/Training_Validation_Log.txt', 'a+')
    self.logger_object = App_Logger()
def __init__(self):
    # self.Batch_Directory = path
    self.schema_path = 'schema_prediction.json'
    self.logger = App_Logger()
    self.mongo = To_mongo_db('wafer')
    self.aws = Aws_Bucket_operation(
        local_file_name_address='config/bucket_name')
def __init__(self):
    self.path = 'Training_Database'
    self.badFilePath = "Training_Bad_Raw_Files_Validated"
    self.goodFilePath = "Training_Good_Raw_Files_Validated"
    self.logger = App_Logger()
    self.awsObj = AwsStorageManagement()
    self.dbObj = mongoDBOperation()
def __init__(self, models_list, sampling_method):
    self.logger_object = App_Logger()
    self.file_object = open('Training_Logs/ModelTrainingLog.txt', 'a+')
    self.sampling_method = sampling_method
    self.models_list = models_list
def __init__(self):
    # self.rootProjPath = rootProjPath
    # my_file = self.rootProjPath + '\\Training_Database\\'
    # self.path = my_file
    # my_file = self.rootProjPath + '\\Training_Raw_files_validated\\Bad_Raw'
    # self.badFilePath = my_file
    # my_file = self.rootProjPath + '\\Training_Raw_files_validated\\Good_Raw'
    # self.goodFilePath = my_file
    self.path = 'Training_Database/'
    self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
    self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
    self.logger = App_Logger()
def __init__(self):
    # my_file = rootProjPath + '\\Prediction_Database\\'
    # self.path = my_file
    # my_file = rootProjPath + '\\Prediction_Raw_Files_Validated\\Bad_Raw'
    # self.badFilePath = my_file
    # my_file = rootProjPath + '\\Prediction_Raw_Files_Validated\\Good_Raw'
    # self.goodFilePath = my_file
    self.path = 'Prediction_Database/'
    self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
    self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
    # self.rootProjPath = rootProjPath
    self.logger = App_Logger()
def predictInputFile():
    filedir = 'Prediction_Batch_Files'
    loggerObj = App_Logger()
    now = datetime.now()
    df = pd.DataFrame()
    currdate = now.strftime("%d%m%Y")
    currtime = now.strftime("%H%M%S%f")
    features = [
        'family', 'product-type', 'steel', 'carbon', 'hardness',
        'temper_rolling', 'condition', 'formability', 'strength',
        'non-ageing', 'surface-finish', 'surface-quality', 'enamelability',
        'bc', 'bf', 'bt', 'bw/me', 'bl', 'm', 'chrom', 'phos', 'cbond',
        'marvi', 'exptl', 'ferro', 'corr', 'blue/bright/varn/clean', 'lustre',
        'jurofm', 's', 'p', 'shape', 'thick', 'width', 'len', 'oil', 'bore',
        'packing'
    ]
    try:
        if request.form is not None:
            df = pd.read_csv(request.files.get("CsvDoc"),
                             header=None,
                             names=features)
            # Now export to csv
            df.to_csv(filedir + '/' + 'Annealing_' + currdate + '_' +
                      currtime + '.csv',
                      index=False,
                      header=False)
            pred_valObj = predict_validation(loggerObj, filedir)  # object initialization
            pred_valObj.prediction_validation()  # calling the prediction_validation function
            pred_obj = prediction(loggerObj)  # object initialization
            path = pred_obj.predictionFromModel()  # calling the function to predict the data
            inputFile = pd.read_csv("Prediction_FileFromDB/InputFile.csv",
                                    header=None,
                                    names=features)
            result = pd.read_csv("Prediction_Output_File/Predictions.csv")
            X = pd.concat([inputFile, result], axis=1, sort=False)
            return Response("Prediction File created at %s!!!" % path + " " +
                            "prediction results are given below %s" %
                            X.head().to_html())
    except ValueError as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % str(e))
    except KeyError as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % str(e))
    except Exception as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % e)
def predictRouteClient():
    loggerObj = App_Logger()
    features = [
        'family', 'product-type', 'steel', 'carbon', 'hardness',
        'temper_rolling', 'condition', 'formability', 'strength',
        'non-ageing', 'surface-finish', 'surface-quality', 'enamelability',
        'bc', 'bf', 'bt', 'bw/me', 'bl', 'm', 'chrom', 'phos', 'cbond',
        'marvi', 'exptl', 'ferro', 'corr', 'blue/bright/varn/clean', 'lustre',
        'jurofm', 's', 'p', 'shape', 'thick', 'width', 'len', 'oil', 'bore',
        'packing'
    ]
    try:
        if request.json is not None:
            content = json.loads(request.get_json())
            pred_valObj = predict_validation(loggerObj)  # object initialization
            pred_valObj.singleRecValidation(content)  # calling the single-record validation function
            pred_obj = prediction(loggerObj)  # object initialization
            result = pred_obj.predictionFromModel(content)  # calling the function to predict the data
            return Response("Predicted class is %s!!!" % result)
        elif request.form is not None:
            path = request.form['filepath']
            pred_valObj = predict_validation(loggerObj, path)  # object initialization
            pred_valObj.prediction_validation()  # calling the prediction_validation function
            pred_obj = prediction(loggerObj)  # object initialization
            path = pred_obj.predictionFromModel()  # calling the function to predict the data
            inputFile = pd.read_csv("Prediction_FileFromDB/InputFile.csv",
                                    header=None,
                                    names=features)
            result = pd.read_csv("Prediction_Output_File/Predictions.csv")
            X = pd.concat([inputFile, result], axis=1, sort=False)
            return Response("Prediction File created at %s!!!" % path + " " +
                            "prediction results are given below %s" %
                            X.head().to_html())
    except ValueError as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % str(e))
    except KeyError as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % str(e))
    except Exception as e:
        print("Error Occurred! " + str(e))
        return Response("Error Occurred! %s" % e)
def __init__(self): self.file_object = open("../logs/modeltune/log.txt", 'a+') self.saved_best_model_path = '../saved_model/best_model.sav' self.logger = App_Logger() self.transformed_data = dataTransform() self.df = self.transformed_data.trainingData() self.data = self.df.iloc[:, :-1] self.label = self.df.iloc[:, -1] self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( self.data, self.label, test_size=0.2, random_state=0, stratify=self.label) self.BRF = BalancedRandomForestClassifier(n_jobs=-1) self.EEC = EasyEnsembleClassifier(n_jobs=-1)
def __init__(self):
    self.AWS_KEY_ID = os.getenv('AWS_KEY_ID')  # AWS KEY ID
    self.AWS_SECRET = os.getenv('AWS_SECRET')  # AWS SECRET KEY
    # Creating an S3 client to access resources
    self.s3 = boto3.client("s3",
                           region_name='ap-south-1',
                           aws_access_key_id=self.AWS_KEY_ID,
                           aws_secret_access_key=self.AWS_SECRET)
    # Create a resource object
    self.res_s3 = boto3.resource('s3',
                                 region_name='ap-south-1',
                                 aws_access_key_id=self.AWS_KEY_ID,
                                 aws_secret_access_key=self.AWS_SECRET)
    # self.bucket_name = 'sensor-fault-detection'
    self.bucket_name = 'mushroomtypeclassifier'
    self.logger = App_Logger()
    self.file = 'awsBucketManagementLogs'
def trainRouteClient():
    loggerObj = App_Logger()
    try:
        if request.form is not None:
            path = request.form['filepath']
            valObj = train_validation(path, loggerObj)  # object initialization
            valObj.training_validation()  # calling the training_validation function
            train_obj = trainModel(loggerObj)  # object initialization
            train_obj.trainingModel()  # training the model for the files in the table
    except ValueError as e:
        return Response("Error Occurred! %s" % e)
    except KeyError as e:
        return Response("Error Occurred! %s" % e)
    except Exception as e:
        return Response("Error Occurred! %s" % e)
    return Response("Training successful!!")
def __init__(self, path):
    self.Batch_Directory = path
    self.schema_path = 'schema_prediction.json'
    self.logger = App_Logger()
def __init__(self, sql_db_name='wafer'):
    self.logger = App_Logger(sql_db_name)
def predict():
    if request.method == "POST":
        # get user input from html form
        loan_amount_form = request.form['loan_amount']
        term_form = request.form['term']
        interest_rate_form = request.form['interest_rate']
        employment_years_form = request.form['employment_years']
        annual_income_form = request.form['annual_income']
        state_form = request.form['state']
        debt_to_income_form = request.form['debt_to_income']
        delinquent_2yr_form = request.form['delinquent_2yr']
        revolving_cr_util_form = request.form['revolving_cr_util']
        total_accounts_form = request.form['total_accounts']
        longest_credit_length_form = request.form['longest_credit_length']
        home_ownership_form = request.form['home_ownership']
        verification_status_form = request.form['verification_status']
        loan_purpose_form = request.form['loan_purpose']

        # load the model
        logger = App_Logger()
        file_prediction = open("Logs/Prediction_Log.txt", 'a+')
        file_io = File_Operations(logger_object=logger,
                                  file_object=file_prediction)
        model = file_io.load_model('EasyEnsembleClassifier')

        # load the encoded features dictionary
        with open('encoded_features.json', 'r') as myfile:
            encoded_features_str = myfile.read()
        encoded_features = json.loads(encoded_features_str)

        # Features Preprocessing

        # Preprocess loan_amount (a number; just convert to float)
        loan_amount = float(loan_amount_form)
        loan_amount_list = [loan_amount]

        # Preprocess term
        # term will be either '36 months' or '60 months'; convert it to int,
        # since this feature was an int at training time
        term_tokens = term_form.split('months')
        term = int(term_tokens[0])
        term_list = [term]

        # Preprocess interest_rate (a number; just convert to float)
        interest_rate = float(interest_rate_form)
        interest_rate_list = [interest_rate]

        # Preprocess Employment_Years (a number; just convert to float)
        employment_years = float(employment_years_form)
        employment_years_list = [employment_years]

        # Preprocess Annual_Income (a number; just convert to float)
        annual_income = float(annual_income_form)
        annual_income_list = [annual_income]

        # Preprocess State: load the dic_state dictionary
        with open('dic_state.json', 'r') as myfile:
            dic_state_str = myfile.read()
        dic_state = json.loads(dic_state_str)
        state = dic_state[state_form]
        state_list = [state]

        # Preprocess debt_to_income (a number; just convert to float)
        debt_to_income = float(debt_to_income_form)
        debt_to_income_list = [debt_to_income]

        # Preprocess Delinquent_2yr (a number; just convert to float)
        delinquent_2yr = float(delinquent_2yr_form)
        delinquent_2yr_list = [delinquent_2yr]

        # Preprocess Revolving_Cr_Util (a number; just convert to float)
        revolving_cr_util = float(revolving_cr_util_form)
        revolving_cr_util_list = [revolving_cr_util]

        # Preprocess Total_Accounts (a number; just convert to float)
        total_accounts = float(total_accounts_form)
        total_accounts_list = [total_accounts]

        # Preprocess Longest_Credit_Length (a number; just convert to float)
        longest_credit_length = float(longest_credit_length_form)
        longest_credit_length_list = [longest_credit_length]

        # Preprocess Home_Ownership (one-hot encoding)
        if home_ownership_form == 'RENT':
            Home_Ownership_MORTGAGE = 0
            Home_Ownership_OWN = 0
            Home_Ownership_RENT = 1
        elif home_ownership_form == 'OWN':
            Home_Ownership_MORTGAGE = 0
            Home_Ownership_OWN = 1
            Home_Ownership_RENT = 0
        elif home_ownership_form == 'MORTGAGE':
            Home_Ownership_MORTGAGE = 1
            Home_Ownership_OWN = 0
            Home_Ownership_RENT = 0
        elif home_ownership_form == 'OTHER':
            Home_Ownership_MORTGAGE = 0
            Home_Ownership_OWN = 0
            Home_Ownership_RENT = 0
        elif home_ownership_form == 'NONE':
            Home_Ownership_MORTGAGE = 0
            Home_Ownership_OWN = 0
            Home_Ownership_RENT = 0
        home_ownership_list = [
            Home_Ownership_MORTGAGE, Home_Ownership_OWN, Home_Ownership_RENT
        ]

        # Preprocess Verification Status (one-hot encoding)
        if verification_status_form == 'VERIFIED - income':
            Verification_Status_VERIFIED_income = 1
            Verification_Status_VERIFIED_income_source = 0
            Verification_Status_not_verified = 0
        elif verification_status_form == 'VERIFIED - income source':
            Verification_Status_VERIFIED_income = 0
            Verification_Status_VERIFIED_income_source = 1
            Verification_Status_not_verified = 0
        elif verification_status_form == 'not verified':
            Verification_Status_VERIFIED_income = 0
            Verification_Status_VERIFIED_income_source = 0
            Verification_Status_not_verified = 1
        verification_status_list = [
            Verification_Status_VERIFIED_income,
            Verification_Status_VERIFIED_income_source,
            Verification_Status_not_verified
        ]

        # Preprocess Loan Purpose (one-hot vector over the encoded Loan_Purpose categories)
        new_loan_purpose = 'Loan_Purpose_' + loan_purpose_form
        encoded_loan_purpose = encoded_features['Loan_Purpose']
        loan_purpose_vec = np.zeros(len(encoded_loan_purpose))
        for i in range(len(encoded_loan_purpose)):
            if encoded_loan_purpose[i] == new_loan_purpose:
                loan_purpose_vec[i] = 1
        loan_purpose_list = list(loan_purpose_vec)

        # append all inputs into a single list
        X = (loan_amount_list + term_list + interest_rate_list +
             employment_years_list + annual_income_list + state_list +
             debt_to_income_list + delinquent_2yr_list +
             revolving_cr_util_list + total_accounts_list +
             longest_credit_length_list + home_ownership_list +
             verification_status_list + loan_purpose_list)
        X_arr = np.array([X])  # convert list to numpy array

        pred = model.predict(X_arr)
        if pred[0] == 1:
            final_pred = 'BAD LOAN. LOAN DISAPPROVAL!'
        else:
            final_pred = 'GOOD LOAN. LOAN APPROVAL!'
        return render_template('home.html', prediction_text=final_pred)

    return render_template("home.html")
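# File_Operations.load_model is used above but not shown in this snippet.
# A minimal sketch of the assumed behaviour follows: unpickling a saved model
# by name from a models directory. The directory layout and '.sav' suffix are
# assumptions for illustration, not the project's actual implementation.
import os
import pickle


class File_Operations:
    def __init__(self, logger_object, file_object, model_dir='models'):
        self.logger_object = logger_object
        self.file_object = file_object
        self.model_dir = model_dir  # assumed location of saved models

    def load_model(self, model_name):
        # load a pickled model assumed to be saved as <model_dir>/<model_name>/<model_name>.sav
        path = os.path.join(self.model_dir, model_name, model_name + '.sav')
        with open(path, 'rb') as f:
            return pickle.load(f)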
def __init__(self):
    self.log_writer = App_Logger()
    self.file_object = open("Training_Logs/ModelTrainingLog.txt", "a+")
def __init__(self, training_file_path):
    self.logger = App_Logger()
    self.file_training = open("Logs/ModelTraining_Log.txt", 'a+')
    self.file_preprocessing = open('Logs/DataPreprocessing_Log.txt', 'a+')
    self.training_file_path = training_file_path
def __init__(self, path):
    self.Batch_Directory = path
    # my_file = rootProjPath + '\\schema_training.json'
    # self.schema_path = my_file
    self.schema_path = 'schema_training.json'
    self.logger = App_Logger()
import pandas as pd
from utility import TextSumarization
import argparse
from application_logging.logger import App_Logger
import time

logger_object = App_Logger()
general_logs = open("Logs/generallogs.txt", 'a+')
success_file = open("Logs/successlogs.txt", 'a+')
error_file = open("Logs/errorlogs.txt", 'a+')


def run():
    parser = argparse.ArgumentParser(
        description='Summarization of TextData using pretrained models')
    parser.add_argument('-path',
                        dest='path',
                        default='Final_news.csv',
                        help='File path')
    parser.add_argument(
        '-model',
        dest='model',
        default='bert-base-uncased',
        help='[xlnet-base-cased,distilbert-base-uncased,albert-base-v1]')
    args = parser.parse_args()
    try:
        logger_object.log(
            general_logs,
            "Entered the runner file --> Running the script now !")
        if not args.path:
            raise RuntimeError("Must supply text path.")
    except Exception as e:
        # minimal handler (assumed): log the error and re-raise
        logger_object.log(error_file, "Error in runner: %s" % str(e))
        raise
def preprocess_and_split(config_path):
    file_object = open('Training_log.txt', 'a+')
    logger = App_Logger()
    config = read_params(config_path)
    train_data_path = config["split_data"]["train_path"]
    raw_train_data_path = config["load_data"]["raw_train_data_csv"]
    logger.log(file_object, "Training Data load was successful")

    train_df = pd.read_csv(raw_train_data_path, nrows=100000)
    logger.log(file_object, "Data reading successful")

    # 1. Function for extracting features from date column
    train_df = date_process(train_df)  # datetime cols processing in train data
    logger.log(file_object, "Datetime Processing in train data completed")

    # 2. Function to validate the columns in the dataset for json datatype
    train_json_columns = column_validator(train_df)  # Validating the columns in the train dataset for json datatype
    logger.log(file_object, "Column_validator successful")

    # 2.1 Function for flattening the json columns and merging them with the original dataset
    if train_json_columns is not None:
        train_df = json_to_df(train_df, train_json_columns)  # Normalizing the json columns in train data
        target = train_df['transactionRevenue']
        logger.log(file_object, "Normalizing the json columns completed")

    # 3. Dropping columns which have more than 50% of null values and columns not contributing to the target variable
    train_df = remove_nan_cols(train_df)
    logger.log(file_object, "50% NAN value columns are removed")
    train_df.drop('sessionId', axis=1, inplace=True)  # combination of fullVisitorId and visitId
    train_df.drop('visitStartTime', axis=1, inplace=True)  # already extracted into visitHour
    train_df.drop('fullVisitorId', axis=1, inplace=True)  # very long and contributes little to the target variable
    # drop_columns = ['visitId', 'weekday', 'day', 'bounces', 'keyword']
    drop_columns = ['visitId', 'weekday', 'day']
    train_df.drop(drop_columns, axis=1, inplace=True)
    logger.log(file_object,
               'Dropped columns which are not contributing to the transaction revenue')

    # 4. Imputation of null values
    train_df = pd.concat([train_df, target], axis=1)  # transactionRevenue col is attached to the dataframe for imputing nan with 0
    train_df = impute_na(train_df)
    logger.log(file_object, "Imputing NAN values with 0 is completed")

    # 5. Changing datatypes from object to desired ones
    train_df = data_type_convert(train_df)
    logger.log(file_object, "Conversion of Datatype to int completed")

    # 6. Removing columns with constant values or with zero standard deviation
    train_df = remove_zero_std_cols(train_df)
    logger.log(file_object, "Zero standard deviation columns are removed")

    # 7. Gathering categorical columns in the dataset and performing label encoding
    label_cols = categorical_cols(train_df)
    logger.log(file_object, "Gathering of label_cols in train data completed")
    train_df = label_encoding(train_df, label_cols)
    logger.log(file_object, "Label_encoding in train data completed")

    # 8. Imputing pageviews column with KNNImputer in train data
    from sklearn.impute import KNNImputer
    imputer = KNNImputer()
    imputer_train_df = imputer.fit_transform(train_df[['pageviews']])  # Imputing pageviews with KNNImputer in training data
    train_df['pageviews'] = imputer_train_df
    logger.log(file_object, "Pageviews column imputed with KNNImputer")

    # Storing processed train data
    train_df.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    logger.log(
        file_object,
        "Training data is processed and stored as data/processed/train_processed.csv")
    file_object.close()
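# The helpers impute_na and remove_zero_std_cols are called in preprocess_and_split
# but not defined in this snippet. Minimal sketches consistent with the log messages
# ("Imputing NAN values with 0", "Zero standard deviation columns are removed") are
# given below as assumptions, not the project's actual implementation.
def impute_na(df):
    # replace missing values with 0
    return df.fillna(0)


def remove_zero_std_cols(df):
    # drop numeric columns whose values are constant (zero standard deviation)
    constant_cols = [col for col in df.select_dtypes(include='number').columns
                     if df[col].std() == 0]
    return df.drop(columns=constant_cols)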
def __init__(self): self.goodDataPath = "Prediction_Good_Raw_Files_Validated" self.logger = App_Logger() self.awsObj = AwsStorageManagement()
def __init__(self):
    self.path = 'Training_Database/'
    self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
    self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
    self.logger = App_Logger()
def __init__(self, path):
    self.Batch_Directory = path
    self.schema_path = 'schema_training.json'
    self.logger = App_Logger()
def __init__(self):
    self.path = 'Prediction_Database/'
    self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
    self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
    self.logger = App_Logger()
def __init__(self, path):
    self.Batch_Directory = path
    self.schema_path = 'schema_prediction.json'
    self.logger = App_Logger()
    self.awsObj = AwsStorageManagement()
    self.dbObj = mongoDBOperation()
def __init__(self): self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw" self.logger = App_Logger()
def __init__(self):
    self.training_file = '../training_file/Churn_Modelling.csv'
    self.file_object = open("../logs/filereadlogs/log.txt", 'a+')
    self.logger = App_Logger()
def __init__(self): self.goodDataPath = "Training_Raw_files_validated/Good_Raw" self.logger = App_Logger()
def __init__(self, path):
    self.raw_data = Raw_Data_Validation(path)
    self.dataTransform = dataTransform()
    self.dBOperation = dBOperation()
    self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
    self.log_writer = App_Logger()