def test_pre_process(self):
        dataBot = DataBot(
            self.df,
            target_name='Survived',
            project_path='./tests')

        expected = {
            'Pclass': 0.46,
            'Age': 0.30,
            'SibSp': 0.05,
            'Parch': 0.05,
            'Fare': 0.05,
            'Sex_male': 0.64,
            'Sex_female': 0.35,
            'Embarked_S': 0.72,
            'Embarked_C': 0.18,
            'Embarked_Q': 0.08
        }

        dataBot.pre_process()
        features_average = dataBot.features.describe().loc['mean'].to_dict()
        for key in features_average.keys():
            self.assertAlmostEqual(expected[key], features_average[key], delta=0.1)
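# Hedged sketch of what the expected means above imply pre_process does: numeric columns
# are scaled into [0, 1] and categoricals such as Sex and Embarked are one-hot encoded.
# pandas is used here only to illustrate the transformations; DataBot may implement them
# differently.
import pandas as pd

df = pd.DataFrame({'Age': [22.0, 38.0, 26.0], 'Sex': ['male', 'female', 'female']})
df['Age'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())
df = pd.get_dummies(df, columns=['Sex'], dtype=float)
print(df.describe().loc['mean'].to_dict())
# {'Age': 0.4166..., 'Sex_female': 0.6666..., 'Sex_male': 0.3333...}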
Example #2
def create_model():
    dataset = None
    dataset_processed = None
    models = None
    scores = None
    best_model = None
    if request.method == 'POST':
        print(request.form)
        project_path = f'{PROJECTS_FOLDER}{request.form.get("project_name")}'
        if not os.path.exists(project_path):
            os.mkdir(project_path)
        dataset = pd.read_csv(request.form.get('dataset_path'))
        columns_types = [key for key in request.form.keys() if key.endswith('_type')]
        for column in columns_types:
            # form keys look like '<column>_type'; strip the suffix so column names
            # containing underscores are handled correctly
            col_name = column.rsplit('_type', 1)[0]
            dataset[col_name] = dataset[col_name].astype(request.form.get(column))
        dataset.to_csv(f'{project_path}/dataset.csv', index=False)
        dataBot = DataBot(dataset=dataset,
                          project_path=project_path,
                          target_name=request.form.get('target'),
                          null_threshold=float(request.form.get('null_threshold')) / 100,
                          cardinal_threshold=float(request.form.get('cardinal_threshold')) / 100)
        dataBot.pre_process()

        dataset_processed = dataBot.get_dataset()
        dataset_processed.to_csv(f'{project_path}/dataset_processed.csv', index=False)

        model = Model(dataset_processed, request.form.get('target'))
        model.train_models()
        best_model = model.save_best_model(f'{project_path}/model.joblib')
        models = list(model.training_results['learner'].values)
        scores = list(model.training_results['test_score'].values)

        project_info = {
            'project_name': [request.form.get("project_name")],
            'project_path': [project_path],
            'model_name': [best_model.learner.__class__.__name__],
            'model_score': [best_model.test_score],
            'target': [request.form.get("target")],
            'null_threshold': [request.form.get("null_threshold")],
            'cardinal_threshold': [request.form.get("cardinal_threshold")]
        }

        project = Project(project_info)
        project.save()

    # on a plain GET there is no dataset yet, so guard the .head() calls
    return render_template(
        'model_info.html',
        dataset=dataset.head(3) if dataset is not None else None,
        dataset_processed=dataset_processed.head() if dataset_processed is not None else None,
        models=models,
        scores=scores)
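# Hedged usage sketch: create_model reads request.form and renders a template, so it is
# presumably registered as a Flask view. The app object, route path, and PROJECTS_FOLDER
# value below are illustrative assumptions, not taken from the project.
from flask import Flask

app = Flask(__name__)
PROJECTS_FOLDER = './projects/'  # assumed: base folder that project_path is built from

app.add_url_rule('/create_model', view_func=create_model, methods=['GET', 'POST'])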
Example #3
    def post(self):
        json_request = request.get_json()

        data = json_request['data']
        for key in data.keys():
            data[key] = [data[key]]
        project = Project()
        project_info = project.get(json_request['project_name'])
        project_info = project_info.to_dict(orient='records')[0]
        model = load(f"{project_info['project_path']}/model.joblib")
        data = pd.DataFrame(data)

        dataBot = DataBot(dataset=data, project_path=project_info['project_path'])
        datasetAttributes = DataSetAtrributes(project_info['project_path'])
        datasetAttributes.load()
        dataBot.pre_process_prediction(datasetAttributes.parameters)
        prediction = list(model.predict(dataBot.features))
        prediction = str(prediction[0])
        print(prediction)
        return {'prediction': prediction}
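# Hedged client sketch: the post() handler above expects JSON with a 'project_name' and a
# 'data' mapping of raw feature values, and answers with {'prediction': ...}. The URL,
# project name, and feature values below are assumptions for illustration only.
import requests

payload = {
    'project_name': 'titanic',
    'data': {'Pclass': 3, 'Age': 22, 'SibSp': 1, 'Parch': 0,
             'Fare': 7.25, 'Sex': 'male', 'Embarked': 'S'},
}
response = requests.post('http://localhost:5000/predict', json=payload)
print(response.json())  # e.g. {'prediction': '0'}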
    def test_one_hot_encode(self):
        dataBot = DataBot(self.df, target_name='Survived')

        dataBot.impute(['Embarked'], ImputerStrategy.MODE)
        dataBot.one_hot_encode('Embarked', self.df['Embarked'].values)
        encoded_cols = {'Embarked_S', 'Embarked_C', 'Embarked_Q'}
        self.assertTrue(encoded_cols.intersection(set(dataBot.features.columns)) == encoded_cols)
    def test_remove_high_cardinality_columns(self):
        dataBot = DataBot(self.df, target_name='Survived', cardinal_threshold=0.5)

        dataBot.remove_high_cardinality_columns()
        for column in ['PassengerId', 'Name', 'Cabin']:
            self.assertNotIn(column, dataBot.features.columns)
    def test_remove_null_columns(self):
        dataBot = DataBot(self.df, target_name='Survived')
        dataBot.remove_null_columns()

        self.assertTrue('B' not in dataBot.features.columns)
        self.assertTrue('C' not in dataBot.features.columns)
    def test_normalize(self):
        dataBot = DataBot(self.df, target_name='Survived')
        dataBot.impute(['Age'], ImputerStrategy.MEAN)
        dataBot.normalize(['Age'])
        # use an explicit absolute tolerance; a bare positional 2 is read by np.allclose as rtol
        self.assertTrue(np.allclose(272.84, dataBot.features['Age'].sum(), atol=0.01))
    def test_impute(self):
        dataBot = DataBot(self.df, target_name='Survived')
        dataBot.impute(['Age'], ImputerStrategy.MEAN)
        expected_a = 21205.17

        # use an explicit absolute tolerance; a bare positional 2 is read by np.allclose as rtol
        self.assertTrue(np.allclose(expected_a, dataBot.features['Age'].sum(), atol=0.01))
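# Hedged sketch of the fixture the tests above assume: self.df is loaded once per test,
# apparently from the Titanic training data (Survived, Pclass, Age, Sex, Embarked, Cabin, ...).
# The class name and CSV path are assumptions for illustration; the real setUp is not shown.
import unittest

import pandas as pd


class DataBotTestCase(unittest.TestCase):
    def setUp(self):
        # hypothetical path to the Titanic CSV used by the tests
        self.df = pd.read_csv('./tests/titanic.csv')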