def test_pre_process(self):
    """Check the per-column means of the features produced by ``pre_process``.

    Builds a ``DataBot`` over ``self.df`` with ``Survived`` as target, runs
    ``pre_process()`` and compares the mean of every resulting feature column
    with a hard-coded reference value, allowing an absolute error of 0.1.
    """
    dataBot = DataBot(
        self.df,
        target_name='Survived',
        project_path='./tests')
    expected = {
        'Pclass': 0.46,
        'Age': 0.30,
        'SibSp': 0.05,
        'Parch': 0.05,
        'Fare': 0.05,
        'Sex_male': 0.64,
        'Sex_female': 0.35,
        'Embarked_S': 0.72,
        'Embarked_C': 0.18,
        'Embarked_Q': 0.08,
    }

    dataBot.pre_process()
    features_average = dataBot.features.describe().loc['mean'].to_dict()

    # Only columns that survive pre-processing are checked; a column with no
    # reference value in ``expected`` raises KeyError and fails the test.
    for key, average in features_average.items():
        self.assertAlmostEqual(
            expected[key],
            average,
            delta=0.1,
            msg=f'unexpected mean for feature {key!r}')
def create_model():
    """Build, train and persist a model for a project described by a form.

    On a POST request the handler:

    1. creates the project folder ``PROJECTS_FOLDER + project_name``,
    2. reads the CSV at ``dataset_path`` and casts every column that has a
       ``<column>_type`` form field to the requested dtype,
    3. pre-processes the data with ``DataBot`` and writes both the raw and
       the processed dataset into the project folder,
    4. trains the candidate models, saves the best one as ``model.joblib``
       and stores the project metadata through ``Project.save()``.

    For any other method nothing is computed and the template receives
    ``None`` for every value.

    Returns:
        The rendered ``model_info.html`` template.
    """
    dataset = None
    dataset_processed = None
    models = None
    scores = None

    if request.method == 'POST':
        print(request.form)
        # NOTE(review): project_name and dataset_path are used as filesystem
        # paths exactly as submitted; confirm they are validated upstream.
        project_path = f'{PROJECTS_FOLDER}{request.form.get("project_name")}'
        # makedirs(exist_ok=True) also creates a missing parent folder and
        # avoids the check-then-create race of exists() + mkdir().
        os.makedirs(project_path, exist_ok=True)

        dataset = pd.read_csv(request.form.get('dataset_path'))

        columns_types = [key for key in request.form.keys() if '_type' in key]
        for column in columns_types:
            # Cut at the LAST '_type' so column names that themselves contain
            # underscores (e.g. 'passenger_id_type') are not truncated.
            column_name = column.rsplit('_type', 1)[0]
            dataset[column_name] = dataset[column_name].astype(
                request.form.get(column))

        dataset.to_csv(f'{project_path}/dataset.csv', index=False)

        dataBot = DataBot(
            dataset=dataset,
            project_path=project_path,
            target_name=request.form.get('target'),
            # The form supplies the thresholds as percentages.
            null_threshold=float(request.form.get('null_threshold')) / 100,
            cardinal_threshold=float(
                request.form.get('cardinal_threshold')) / 100)
        dataBot.pre_process()
        dataset_processed = dataBot.get_dataset()
        dataset_processed.to_csv(
            f'{project_path}/dataset_processed.csv', index=False)

        model = Model(dataset_processed, request.form.get('target'))
        model.train_models()
        best_model = model.save_best_model(f'{project_path}/model.joblib')
        models = list(model.training_results['learner'].values)
        scores = list(model.training_results['test_score'].values)

        # Every value is wrapped in a one-element list, as in the original.
        project_info = {
            'project_name': [request.form.get("project_name")],
            'project_path': [project_path],
            'model_name': [best_model.learner.__class__.__name__],
            'model_score': [best_model.test_score],
            'target': [request.form.get("target")],
            'null_threshold': [request.form.get("null_threshold")],
            'cardinal_threshold': [request.form.get("cardinal_threshold")],
        }
        project = Project(project_info)
        project.save()

    # Guard the previews: without a POST both frames are still None and
    # calling .head() on them would raise AttributeError.
    dataset_preview = dataset.head(3) if dataset is not None else None
    processed_preview = (
        dataset_processed.head() if dataset_processed is not None else None)

    return render_template(
        'model_info.html',
        dataset=dataset_preview,
        dataset_processed=processed_preview,
        models=models,
        scores=scores)
def post(self):
    """Predict the target for one record using a stored project model.

    The JSON body must provide ``project_name`` and ``data`` (a mapping of
    field name to value). The project's ``model.joblib`` and its saved
    dataset attributes are loaded, the record is pre-processed with those
    attributes and the first prediction is returned as a string.

    Returns:
        dict: ``{'prediction': <str>}``.
    """
    payload = request.get_json()
    record = payload['data']
    # Wrap each value in a one-element list, in place, before the mapping is
    # handed to pandas below.
    record.update({name: [value] for name, value in record.items()})

    stored = Project().get(payload['project_name'])
    info = stored.to_dict(orient='records')[0]
    base_path = info['project_path']

    model = load(f"{base_path}/model.joblib")

    frame = pd.DataFrame(record)
    dataBot = DataBot(dataset=frame, project_path=base_path)

    datasetAttributes = DataSetAtrributes(base_path)
    datasetAttributes.load()
    dataBot.pre_process_prediction(datasetAttributes.parameters)

    outputs = list(model.predict(dataBot.features))
    prediction = str(outputs[0])
    print(prediction)
    return {'prediction': prediction}
def test_one_hot_encode(self):
    """One-hot encoding ``Embarked`` must add one column per category.

    ``Embarked`` is imputed with the mode first, then encoded; the test
    passes when ``Embarked_S``, ``Embarked_C`` and ``Embarked_Q`` are all
    among the feature columns.
    """
    dataBot = DataBot(self.df, target_name='Survived')
    dataBot.impute(['Embarked'], ImputerStrategy.MODE)
    dataBot.one_hot_encode('Embarked', self.df['Embarked'].values)

    expected_columns = {'Embarked_S', 'Embarked_C', 'Embarked_Q'}
    actual_columns = set(dataBot.features.columns)
    # "A & B == A" is the same statement as "A is a subset of B".
    self.assertTrue(expected_columns.issubset(actual_columns))
def test_remove_high_cardinality_columns(self):
    """High-cardinality columns must be dropped from the features.

    With ``cardinal_threshold=0.5`` the columns ``PassengerId``, ``Name``
    and ``Cabin`` are expected to be absent after
    ``remove_high_cardinality_columns()``.
    """
    dataBot = DataBot(self.df, target_name='Survived', cardinal_threshold=0.5)
    dataBot.remove_high_cardinality_columns()

    remaining = set(dataBot.features.columns)
    # Check every column on its own: testing whether the whole list is an
    # element of the column list is always false, so the previous assertion
    # could never fail.
    for column in ('PassengerId', 'Name', 'Cabin'):
        self.assertNotIn(column, remaining)
def test_remove_null_columns(self):
    """Columns ``B`` and ``C`` must not be present after null-column removal."""
    dataBot = DataBot(self.df, target_name='Survived')
    dataBot.remove_null_columns()

    remaining_columns = dataBot.features.columns
    # NOTE(review): 'B' and 'C' look like names from a different fixture than
    # the one the other tests use (target 'Survived'); confirm these are the
    # columns this test is meant to check.
    for column_name in ('B', 'C'):
        self.assertTrue(column_name not in remaining_columns)
def test_normalize(self):
    """Compare the sum of the normalized ``Age`` column with a reference.

    ``Age`` is mean-imputed, normalized, and its sum is compared with
    272.84 via ``np.allclose``.
    """
    dataBot = DataBot(self.df, target_name='Survived')
    dataBot.impute(['Age'], ImputerStrategy.MEAN)
    dataBot.normalize(['Age'])

    expected_total = 272.84
    age_total = dataBot.features['Age'].sum()
    # NOTE(review): the third positional argument of np.allclose is ``rtol``,
    # so this accepts any sum within 200% of its own magnitude. If "2" was
    # meant as decimal places or an absolute tolerance, tighten this after
    # re-checking the reference value.
    self.assertTrue(np.allclose(expected_total, age_total, rtol=2))
def test_impute(self):
    """Compare the sum of the mean-imputed ``Age`` column with a reference."""
    dataBot = DataBot(self.df, target_name='Survived')
    dataBot.impute(['Age'], ImputerStrategy.MEAN)

    expected_a = 21205.17
    age_total = dataBot.features['Age'].sum()
    # NOTE(review): the third positional argument of np.allclose is ``rtol``,
    # so this accepts any sum within 200% of its own magnitude. If "2" was
    # meant as decimal places or an absolute tolerance, tighten this after
    # re-checking the reference value.
    self.assertTrue(np.allclose(expected_a, age_total, rtol=2))