コード例 #1
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def features_engineering(self, data):
		"""Derive calendar and affordability features from a loan dataframe.

		The input frame is copied first, so the caller's object is not modified.

		Columns written (in this order): 'issue_d' (parsed to datetime),
		'issue_d_year', 'issue_d_month', 'issue_d_date', 'day_name',
		'loan_per_annual_inc', 'loan_per_annual_inc_cat',
		'installment_per_monthly_salary', 'installment_per_monthly_salary_cat'.

		Args:
			data: DataFrame containing at least the columns 'issue_d',
				'loan_amount', 'annual_inc' and 'installment'.

		Returns:
			A new DataFrame with the derived columns added.

		Raises:
			ValueError: if an issue date is missing (NaT), because the date
				parts are cast to int.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		data_copy = data.copy()

		data_copy['issue_d'] = pd.to_datetime(data_copy.loc[:,'issue_d'], infer_datetime_format=True)

		# Use the vectorised datetime accessors instead of slicing the string
		# representation of every timestamp row by row.
		issue_date = data_copy['issue_d'].dt
		data_copy['issue_d_year'] = issue_date.year.astype(int)
		data_copy['issue_d_month'] = issue_date.month.astype(int)
		data_copy['issue_d_date'] = issue_date.day.astype(int)
		data_copy['day_name'] = issue_date.day_name()

		# A zero annual income produces +/-inf here; nothing in this method
		# cleans that up, so callers must handle infinite values themselves.
		data_copy['loan_per_annual_inc'] = data_copy['loan_amount']/data_copy['annual_inc']
		# NaN ratios fall into the '>1' bucket because the comparison is False.
		data_copy['loan_per_annual_inc_cat'] = np.where(data_copy['loan_per_annual_inc']<=1,'<=1','>1')

		data_copy['installment_per_monthly_salary'] = data_copy['installment']/(data_copy['annual_inc']/12)
		# NaN ratios fall into the '<=1' bucket because the comparison is False.
		data_copy['installment_per_monthly_salary_cat'] = np.where(data_copy['installment_per_monthly_salary']>1,'>1','<=1')

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)

		return data_copy
コード例 #2
0
ファイル: prediction.py プロジェクト: ceste/model_deployment
	def predict(self):
		"""Run the full inference path on ``self.dataframe``.

		Applies the inherited preprocessing steps in order, asks the inherited
		``predict`` for raw predictions and decodes them back to labels.

		Returns:
			A tuple ``(raw_predictions, decoded_labels)``; the raw predictions
			are also stored on ``self.prediction``.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		# Proxy to the parent implementations; split_dataframe and
		# decode_prediction are deliberately resolved on self instead.
		base = super(Prediction, self)

		base.extract_features()
		base.validate_column_type()
		base.drop_this_first()

		self.X, self.y = self.split_dataframe()

		self.X = base.features_engineering(self.X)
		self.X = base.replace_infinite_numbers(self.X)
		self.X, self.y = base.handle_nan_values(self.X,self.y)
		self.X = base.drop_unnecessary_columns(self.X)
		self.X = base.encode_categorical_data(self.X)

		self.y = base.encode_target_feature(self.y)

		self.prediction = base.predict(self.X)
		prediction_labels = self.decode_prediction(self.prediction)

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return self.prediction, prediction_labels

		

		
コード例 #3
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def predict(self, data):
		"""Return the model's predictions for ``data``.

		Args:
			data: feature matrix passed straight to ``self.model.predict``.

		Returns:
			Whatever ``self.model.predict(data)`` returns.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Predict before writing the exit message so that the log does not
		# claim a clean exit when the model call raises.
		predictions = self.model.predict(data)

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)

		return predictions
コード例 #4
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def decode_target_feature(self,data):
		"""Map encoded target values back to their original labels.

		Args:
			data: encoded target values, passed to ``inverse_transform`` of
				the encoder stored under ``self.target_column``.

		Returns:
			The result of the target encoder's ``inverse_transform``.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Decode before writing the exit message so that a failing decode is
		# not preceded by a misleading "exit" entry.
		output = self.label_encoders[self.target_column].inverse_transform(data)

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)

		return output
コード例 #5
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def upsampling(self):
		"""Oversample the training set with SMOTE.

		Reads ``self.X_train`` / ``self.y_train`` and stores the resampled
		data on ``self.X_sm`` / ``self.y_sm``. Returns nothing.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Seed SMOTE with the pipeline's random_state so repeated runs give
		# the same synthetic samples, matching the seeded train/valid split.
		smote = SMOTE(random_state=self.random_state)

		# fit_sample was renamed to fit_resample in imbalanced-learn and later
		# removed; prefer the new name and fall back for old installations.
		resample = getattr(smote, 'fit_resample', None) or smote.fit_sample
		self.X_sm, self.y_sm = resample(self.X_train, self.y_train)

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #6
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def drop_unnecessary_columns(self, data):
		"""Drop the columns listed in ``self.unnecessary_columns``.

		The frame is modified in place and the same object is returned.

		Args:
			data: DataFrame to strip.

		Returns:
			``data`` itself, without the unnecessary columns.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		data.drop(columns=self.unnecessary_columns, inplace=True)

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return data
コード例 #7
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def replace_infinite_numbers(self, data):
		"""Turn positive and negative infinity into NaN.

		The frame is modified in place and the same object is returned.

		Args:
			data: DataFrame to clean.

		Returns:
			``data`` itself, with infinities replaced by NaN.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		# One possible policy among several; it is sufficient for this dataset.
		infinities = [np.inf, -np.inf]
		data.replace(to_replace=infinities, value=np.nan, inplace=True)

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return data		
コード例 #8
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def handle_target_feature(self):
		"""Fit a label encoder on the training target and encode it.

		The encoder is stored in ``self.label_encoders`` under
		``self.target_column``, ``self.y_train`` is replaced by its encoded
		form, and the encoder dictionary is pickled. Returns nothing.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		target_encoder = LabelEncoder()
		self.label_encoders[self.target_column] = target_encoder

		target_encoder.fit(self.y_train.values)
		self.y_train = target_encoder.transform(self.y_train.values)

		pickle_name = 'label_encoders_'+self.version+'.pkl'
		self.save_as_pickle(pickle_name,self.label_encoders)

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")
コード例 #9
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def evaluate_model(self):
		"""Print validation metrics for ``self.y_pred`` against ``self.y_valid``.

		Prints the confusion matrix followed by accuracy, AUC, precision,
		average precision, recall and F1. Returns nothing.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		actual, predicted = self.y_valid, self.y_pred

		print(confusion_matrix(actual, predicted))

		# (label, scoring function) pairs, reported in this order.
		scorers = (
			("Accuracy:", accuracy_score),
			('AUC:', roc_auc_score),
			('Precision:', precision_score),
			('Average Precision Score:', average_precision_score),
			('Recall:', recall_score),
			('F1 Score:', f1_score),
		)
		for label, scorer in scorers:
			print(label, scorer(actual, predicted))

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")
コード例 #10
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def validate_column_type(self):
		"""Check every non-target column's dtype against the configured lists.

		A column with dtype ``object`` must appear in
		``self.categorical_columns``; any other dtype must appear in
		``self.numerical_columns``.

		Raises:
			Exception: for the first column that violates this rule.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		for col, type_ in zip(self.dataframe.columns, self.dataframe.dtypes):

			# The target column is not validated.
			if col == self.target_column:
				continue

			if str(type_) == 'object':
				allowed = self.categorical_columns
			else:
				allowed = self.numerical_columns

			if col not in allowed:
				raise Exception("Column type for "+col+" is not valid.")

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")
コード例 #11
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def handle_nan_values(self, x,y):
		"""Drop every row that has a NaN in the features or in the target.

		Features and target are joined column-wise first, so both are
		filtered with the same row mask.

		Args:
			x: feature DataFrame.
			y: target values; after concatenation they must be addressable
				as column ``self.target_column``.

		Returns:
			A tuple ``(features, target)`` restricted to the complete rows.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		# One possible policy among several; it is sufficient for this dataset.
		feature_columns = x.columns
		complete_rows = pd.concat([x, y], axis=1).dropna()

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return complete_rows[feature_columns], complete_rows[self.target_column]		
コード例 #12
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def extract_features(self):
		"""Restrict ``self.dataframe`` to the configured acceptable columns.

		Raises:
			Exception: if any acceptable column is missing from the dataset,
				or if the selection does not yield exactly
				``len(self.acceptable_columns)`` columns.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Index with a real list: a tuple would be looked up as one single key.
		wanted_columns = list(self.acceptable_columns)

		try:
			self.dataframe = self.dataframe[wanted_columns]
		except KeyError as err:
			# Fail immediately and keep the original cause (which names the
			# missing columns) instead of printing and carrying on.
			raise Exception("Number of columns is not valid. There is problem with the dataset.") from err

		if self.dataframe.shape[1] != len(wanted_columns):
			raise Exception("Number of columns is not valid. There is problem with the dataset.")

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #13
0
ファイル: prediction.py プロジェクト: ceste/model_deployment
	def split_dataframe(self):
		"""Separate ``self.dataframe`` into features and target.

		Returns:
			A tuple ``(X, y)``: every column except ``self.target_column``,
			and the target column, both taken from a copy of the dataframe.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		target = self.target_column
		predictors = [name for name in self.dataframe.columns if name != target]

		snapshot = self.dataframe.copy()
		X, y = snapshot[predictors], snapshot[target]

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return X, y
コード例 #14
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def train_model(self):
		"""Fit a random forest on the upsampled training data and persist it.

		Reads ``self.X_sm`` / ``self.y_sm``, stores the fitted classifier on
		``self.model``, pickles it and also writes a compressed copy via
		``utils.compress_files``. Returns nothing.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		rf = RandomForestClassifier(random_state=42)
		# scikit-learn expects a 1-D target; a column vector only triggers a
		# conversion warning. np.ravel also accepts non-ndarray targets.
		rf.fit(self.X_sm, np.ravel(self.y_sm))

		self.model = rf

		# save model
		self.save_as_pickle('model_rf_'+self.version+'.pkl',self.model)

		# compress model
		utils.compress_files(str(self.path_to_data_folder)+'compressed_model_rf_'+self.version, self.model)

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #15
0
def predict():
    """Handle a prediction request and return the result as JSON.

    Reads the JSON body of the current request, stores the raw payload and
    the flattened input frame under unique file names for debugging, runs
    ``Prediction`` on the stored CSV and returns the first prediction with
    its label. Presumably registered as a web route; the decorator is not
    visible in this excerpt.
    """

    log = Log()
    log.print(__name__ + '.' + utils.get_function_caller() + ' -> enter')

    # get data
    json_data = request.get_json(force=True)
    log.print(('json_data: ', json_data))

    user_id = json_data['user_id']
    random_state = 42

    input_df = pd.json_normalize(json_data)

    # Keep the payload and the model input on disk under unique names so a
    # request can be replayed while debugging.
    json_data_unique_filename = (
        config.PATH_TO_DATASET + utils.get_unique_filename('json_data.json'))
    input_df_unique_filename = (
        config.PATH_TO_DATASET + utils.get_unique_filename('input_df.csv'))

    with open(json_data_unique_filename, 'w') as outfile:
        json.dump(json_data, outfile)

    input_df = input_df[list(config.ACCEPTABLE_COLUMNS)]
    input_df.to_csv(input_df_unique_filename, index=False)

    predictor = Prediction(user_id, input_df_unique_filename, random_state)
    predictions, labels = predictor.predict()

    # Only the first row's prediction is reported.
    result = {'prediction': int(predictions[0]), 'label': str(labels[0])}

    log.print(__name__ + '.' + utils.get_function_caller() + ' -> exit')

    return jsonify(results=result)
コード例 #16
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def train(self):
		"""Run the whole training pipeline, from raw dataframe to evaluation.

		The steps are order-dependent: the split fills X_train / X_valid /
		y_train / y_valid, encoders are fitted on the training part and then
		applied to the validation part, and only the training part is
		upsampled before the model is fitted. Returns nothing.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Column selection and sanity checks on the raw dataframe.
		self.extract_features()

		self.validate_column_type()

		self.drop_this_first()

		# Populates self.X_train, self.X_valid, self.y_train and self.y_valid.
		self.split_dataframe()

		# Identical preprocessing is applied to both partitions.
		self.X_train = self.features_engineering(self.X_train)
		self.X_valid = self.features_engineering(self.X_valid)

		self.X_train = self.replace_infinite_numbers(self.X_train)
		self.X_valid = self.replace_infinite_numbers(self.X_valid)
				
		self.X_train, self.y_train = self.handle_nan_values(self.X_train,self.y_train)
		self.X_valid, self.y_valid = self.handle_nan_values(self.X_valid,self.y_valid)
		
		self.X_train = self.drop_unnecessary_columns(self.X_train)
		self.X_valid = self.drop_unnecessary_columns(self.X_valid)
	
		# Must run before the categorical handling, which iterates over
		# self.categorical_features.
		self.features_type_mapping()

		# Encoders are fitted on X_train, then reused as-is on X_valid.
		self.handle_categorical_columns()		
		self.X_valid = self.encode_categorical_data(self.X_valid)

		# Same idea for the target: fit on y_train, then encode y_valid.
		self.handle_target_feature()
		self.y_valid = self.encode_target_feature(self.y_valid)

		# Upsampling reads only the training partition.
		self.upsampling()

		self.train_model()

		self.y_pred = self.predict(self.X_valid)

		self.evaluate_model()

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #17
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def split_dataframe(self):
		"""Split ``self.dataframe`` into train and validation partitions.

		The result is stored on ``self.X_train``, ``self.X_valid``,
		``self.y_train`` and ``self.y_valid``; both partitions are also
		written to 'train.csv' and 'valid.csv' in the data folder.
		Returns nothing.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		target = self.target_column
		predictors = [name for name in self.dataframe.columns if name != target]

		snapshot = self.dataframe.copy()
		features, labels = snapshot[predictors], snapshot[target]

		partitions = train_test_split(
			features,
			labels,
			test_size=self.test_size,
			random_state=self.random_state,
		)
		self.X_train, self.X_valid, self.y_train, self.y_valid = partitions

		# Persist both partitions, features and target side by side.
		train_frame = pd.concat([self.X_train, self.y_train], axis=1)
		train_frame.to_csv(self.path_to_data_folder+'train.csv')
		valid_frame = pd.concat([self.X_valid, self.y_valid], axis=1)
		valid_frame.to_csv(self.path_to_data_folder+'valid.csv')

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")
コード例 #18
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def handle_categorical_columns(self):
		"""Fit the categorical encoders on ``self.X_train`` and encode it.

		For every column in ``self.categorical_features`` except the target:
		a label encoder is fitted and applied, a one-hot encoder is fitted on
		the resulting codes, the first one-hot column is discarded, the
		remaining ones are appended to ``self.X_train`` and the original
		column is removed.

		The fitted encoders and the label-to-code mapping are stored on
		``self.label_encoders``, ``self.one_hot_encoders`` and ``self.dict``
		and pickled. Returns nothing.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		label_encoders = {}
		one_hot_encoders = {} 
		dic = {}

		for col in self.categorical_features:
			
			if col!=self.target_column:
				
				label_encoders[col] = LabelEncoder()        
				# LabelEncoder expects a 1-D array; a column vector only
				# produces a conversion warning.
				label_encoders[col].fit(self.X_train[col].values)
				self.X_train[col] = label_encoders[col].transform(self.X_train[col])
				# Original label -> integer code, for this column.
				dic[col] = dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))
											  
				one_hot_encoders[col] = OneHotEncoder(handle_unknown='ignore')
				one_hot_encoders[col].fit(self.X_train[col].values.reshape(-1,1))
				# [:,1:] discards the first one-hot column.
				tmp = one_hot_encoders[col].transform(self.X_train[col].values.reshape(-1,1)).toarray()[:,1:]		        
				tmp_df = pd.DataFrame(tmp, columns=self.get_ohe_column_names(dic,col))
				
				# The frame is rebuilt from a stacked array, so the result
				# carries a fresh default index.
				self.X_train = pd.DataFrame(np.hstack([self.X_train,tmp_df]), columns=list(self.X_train.columns)+list(tmp_df.columns))
				
				del self.X_train[col]


		self.label_encoders = label_encoders
		self.one_hot_encoders = one_hot_encoders
		self.dict = dic

		self.save_as_pickle('label_encoders_'+self.version+'.pkl',self.label_encoders)
		self.save_as_pickle('one_hot_encoders_'+self.version+'.pkl',self.one_hot_encoders)
		self.save_as_pickle('dict_'+self.version+'.pkl',self.dict)

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #19
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def encode_categorical_data(self,data):
		"""Encode the categorical columns of ``data`` with the stored encoders.

		For every column in ``self.categorical_features`` except the target,
		the stored label encoder and one-hot encoder are applied (nothing is
		re-fitted), the first one-hot column is discarded, the remaining ones
		are appended and the original column is removed.

		Args:
			data: DataFrame to encode. Its categorical columns are
				overwritten with their label codes before the frame is
				rebuilt.

		Returns:
			The encoded DataFrame.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		for col in self.categorical_features:

			if col == self.target_column:
				continue

			data[col] = self.label_encoders[col].transform(data[col])

			codes = data[col].values.reshape(-1,1)
			# [:,1:] discards the first one-hot column.
			dummies = self.one_hot_encoders[col].transform(codes).toarray()[:,1:]
			dummies_df = pd.DataFrame(dummies, columns=self.get_ohe_column_names(self.dict,col))

			# Rebuild the frame from a stacked array of old and new columns.
			stacked = np.hstack([data,dummies_df])
			data = pd.DataFrame(stacked, columns=list(data.columns)+list(dummies_df.columns))
			del data[col]

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")

		return data
コード例 #20
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def features_type_mapping(self):
		"""Classify the columns of ``self.X_train`` by dtype.

		Columns with dtype ``object`` go to ``self.categorical_features``,
		all others to ``self.numerical_features``; the target column is
		skipped. Both lists are pickled. Returns nothing.
		"""

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		categorical_features, numerical_features = [], []

		for col, type_ in zip(self.X_train.columns, self.X_train.dtypes):

			if col == self.target_column:
				continue

			if str(type_) == 'object':
				bucket = categorical_features
			else:
				bucket = numerical_features
			bucket.append(col)

		self.categorical_features = categorical_features
		self.numerical_features = numerical_features

		self.save_as_pickle('categorical_features_'+self.version+'.pkl',self.categorical_features)
		self.save_as_pickle('numerical_features_'+self.version+'.pkl',self.numerical_features)

		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> exit")
コード例 #21
0
ファイル: prediction.py プロジェクト: ceste/model_deployment
	def __init__(self, user_id, path_to_dataset, random_state=42):
		"""Create a prediction run on top of the base pipeline state.

		Args:
			user_id: identifier of the requesting user, used for tracking.
			path_to_dataset: path of the CSV file to predict on.
			random_state: seed forwarded to the base pipeline.
		"""

		# The base initialiser already creates self.log, stores and logs
		# user_id, path_to_dataset and random_state, and reads the CSV into
		# self.dataframe. Repeating that here parsed the file twice.
		Pipeline.__init__(self, user_id, path_to_dataset, random_state)		

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		# Filled in by predict().
		self.prediction = None	
コード例 #22
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def drop_this_first(self):
		"""Drop the columns in ``self.drop_these_features_first`` from ``self.dataframe``.

		The dataframe is modified in place. Returns nothing.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)

		self.dataframe.drop(self.drop_these_features_first, axis=1, inplace=True)	

		# Log the exit as well, like the other pipeline steps do.
		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)
コード例 #23
0
ファイル: prediction.py プロジェクト: ceste/model_deployment
	def decode_prediction(self, data):
		"""Translate encoded predictions back into target labels.

		Args:
			data: encoded predictions.

		Returns:
			Whatever the parent's ``decode_target_feature`` returns for ``data``.
		"""

		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> enter'
		self.log.print(msg)		

		labels = super(Prediction, self).decode_target_feature(data)

		# Log the exit as well, like the other methods do.
		msg = self.__class__.__name__+'.'+utils.get_function_caller()+' -> exit'
		self.log.print(msg)

		return labels
コード例 #24
0
ファイル: request.py プロジェクト: ceste/model_deployment
import requests
import json
import config, utils
import argparse
import pandas as pd
from pathlib import Path
from log import Log

# Script entry point: parse the command line and pick the service URL.
# NOTE(review): this excerpt stops before any request is sent; the rest of
# the script is not visible here.
if __name__ == '__main__':

    log = Log()
    msg = __name__ + '.' + utils.get_function_caller() + ' -> enter'
    log.print(msg)

    # --user / -u is mandatory; argparse exits with a usage error without it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--user",
                        "-u",
                        help="set user id for tracking",
                        required=True)
    args = parser.parse_args()

    # msg is a (label, value) tuple here, not a formatted string.
    msg = 'args: ', args
    log.print(msg)

    user_id = args.user

    # local url (currently active)
    url = config.URL

    # heroku url (disabled alternative)
    # url = config.HEROKU_URL
コード例 #25
0
ファイル: pipeline.py プロジェクト: ceste/model_deployment
	def __init__(self, user_id, path_to_dataset, random_state=42,test_size=0.2):
		"""Set up the pipeline: settings, dataset and previously saved artefacts.

		Every setting is logged as it is stored. The dataset is read from
		``path_to_dataset``. Feature lists, encoders, the label dictionary
		and the model are loaded from the data folder when their files for
		the configured version exist; otherwise they stay ``None``.

		Args:
			user_id: identifier of the requesting user, used for tracking.
			path_to_dataset: path of the CSV file to load.
			random_state: seed stored for later steps.
			test_size: validation fraction stored for the train/valid split.
		"""

		self.log = Log()
		self.log.print(f"{self.__class__.__name__}.{utils.get_function_caller()} -> enter")

		# --- run parameters -------------------------------------------------
		self.user_id = user_id
		self.log.print(('user_id: ', self.user_id))

		self.path_to_dataset = path_to_dataset
		self.log.print(('path_to_dataset: ', self.path_to_dataset))

		self.random_state = random_state
		self.log.print(('random_state: ', self.random_state))

		self.test_size = test_size
		self.log.print(('test_size: ', self.test_size))

		self.version = config.VERSION
		self.log.print(('version: ', self.version))

		self.path_to_data_folder = config.PATH_TO_DATASET
		self.log.print(('path_to_data_folder: ', self.path_to_data_folder))

		self.dataframe = pd.read_csv(self.path_to_dataset)

		# --- column configuration -------------------------------------------
		self.acceptable_columns = config.ACCEPTABLE_COLUMNS
		self.log.print(('acceptable_columns: ', self.acceptable_columns))

		self.unnecessary_columns = config.UNNECESSARY_COLUMNS
		self.log.print(('unnecessary_columns: ', self.unnecessary_columns))

		self.categorical_columns = config.CATEGORICAL_COLUMNS
		self.log.print(('categorical_columns: ', self.categorical_columns))

		self.numerical_columns = config.NUMERICAL_COLUMNS
		self.log.print(('numerical_columns: ', self.numerical_columns))

		self.target_column = config.TARGET_COLUMN
		self.log.print(('target_column: ', self.target_column))

		self.drop_these_features_first = config.DROP_THIS
		self.log.print(('drop_these_features_first: ', self.drop_these_features_first))

		# --- saved artefacts: loaded when the file exists, else None --------
		file = Path(self.path_to_data_folder+'categorical_features_'+self.version+'.pkl')
		self.categorical_features = self.load_pickle(file) if file.is_file() else None

		file = Path(self.path_to_data_folder+'numerical_features_'+self.version+'.pkl')
		self.numerical_features = self.load_pickle(file) if file.is_file() else None

		# Working data, filled in by the training steps.
		self.X_train = self.X_valid = None
		self.y_train = self.y_valid = None
		self.X_sm = self.y_sm = None
		self.y_pred = None

		file = Path(self.path_to_data_folder+'label_encoders_'+self.version+'.pkl')
		self.label_encoders = self.load_pickle(file) if file.is_file() else None

		file = Path(self.path_to_data_folder+'one_hot_encoders_'+self.version+'.pkl')
		self.one_hot_encoders = self.load_pickle(file) if file.is_file() else None

		file = Path(self.path_to_data_folder+'dict_'+self.version+'.pkl')
		self.dict = self.load_pickle(file) if file.is_file() else None

		file = Path(self.path_to_data_folder+'model_rf_'+self.version+'.pkl')
		self.model = self.load_pickle(file) if file.is_file() else None

		# A compressed model, when present, replaces the pickled one.
		file = Path(self.path_to_data_folder+'compressed_model_rf_'+self.version)
		if file.is_file():
			self.model = utils.load_compressed_files(file)