Ejemplo n.º 1
0
    def test_retrive_models(self):
        """Models already trained in a project are retrieved, not recomputed.

        Creates a project and fits models once, then calls fit again on the
        same handle and on a fresh handle pointing at the same
        project/experiment; both re-runs must be fast and reproduce the
        original score.
        """
        model = Mljar(project=self.proj_title,
                      experiment=self.expt_title,
                      algorithms=['xgb'],
                      metric='logloss',
                      validation_kfolds=3,
                      tuning_mode='Normal',
                      single_algorithm_time_limit=1)
        self.assertTrue(model is not None)
        # Initial training run: blocks until every model is fitted.
        model.fit(X=self.X, y=self.y)
        baseline_score = self.mse(model.predict(self.X), self.y)
        self.assertTrue(baseline_score < 0.1)

        # Second fit on the same handle: models should only be retrieved,
        # so the call must finish within 3 minutes.
        started = time.time()
        model.fit(X=self.X, y=self.y)
        self.assertTrue(time.time() - started < 3 * 60)
        rerun_score = self.mse(model.predict(self.X), self.y)
        self.assertTrue(rerun_score < 0.1)
        # Retrieved models must reproduce the original score.
        self.assertTrue(np.abs(baseline_score - rerun_score) < 1e-3)

        # Fresh handle on the same project/experiment: fit should again only
        # retrieve the existing models, within 5 minutes end to end.
        started = time.time()
        model_2 = Mljar(project=self.proj_title,
                        experiment=self.expt_title,
                        algorithms=['xgb'],
                        metric='logloss',
                        validation_kfolds=3,
                        tuning_mode='Normal',
                        single_algorithm_time_limit=1)
        self.assertTrue(model_2 is not None)
        model_2.fit(X=self.X, y=self.y)
        self.assertTrue(time.time() - started < 5 * 60)
        reuse_score = self.mse(model_2.predict(self.X), self.y)
        self.assertTrue(reuse_score < 0.1)
        # Scores must agree across handles as well.
        self.assertTrue(np.abs(baseline_score - reuse_score) < 1e-3)
Ejemplo n.º 2
0
    def test_compute_prediction(self):
        """Static Mljar.compute_prediction scores data and cleans up after itself.

        The dataset uploaded for prediction must be deleted afterwards unless
        keep_dataset=True is passed.
        """
        model = Mljar(project=self.proj_title,
                      experiment=self.expt_title,
                      algorithms=['rfc'],
                      metric='logloss',
                      validation_kfolds=3,
                      tuning_mode='Normal',
                      single_algorithm_time_limit=1)
        self.assertTrue(model is not None)
        # Train synchronously so a selected algorithm is available.
        model.fit(X=self.X, y=self.y, dataset_title='My dataset')

        # Identifiers required by the static prediction API.
        project_id = model.project.hid
        model_id = model.selected_algorithm.hid

        datasets_client = DatasetClient(project_id)
        baseline_count = len(datasets_client.get_datasets())
        # Default call: the dataset uploaded for prediction is removed afterwards.
        prediction = Mljar.compute_prediction(self.X, model_id, project_id)
        self.assertTrue(self.mse(prediction, self.y) < 0.9)
        self.assertEqual(baseline_count, len(datasets_client.get_datasets()))
        # With keep_dataset=True the uploaded dataset stays in the project.
        prediction = Mljar.compute_prediction(self.X,
                                              model_id,
                                              project_id,
                                              keep_dataset=True)
        # Exactly one extra dataset should now be present.
        self.assertEqual(baseline_count + 1,
                         len(datasets_client.get_datasets()))
Ejemplo n.º 3
0
    def test_usage_with_validation_dataset(self):
        """Passing validation_data to fit selects the "With dataset" scheme."""
        model = Mljar(project=self.proj_title,
                      experiment=self.expt_title,
                      algorithms=['xgb'],
                      tuning_mode='Normal',
                      single_algorithm_time_limit=1)
        self.assertTrue(model is not None)
        # Load the validation dataset from disk.
        vald_df = pd.read_csv('tests/data/test_1_vald.csv')
        feature_cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
        X_vald = vald_df[feature_cols]
        y_vald = vald_df['class']
        # Start training without blocking on completion.
        model.fit(X=self.X,
                  y=self.y,
                  validation_data=(X_vald, y_vald),
                  wait_till_all_done=False)
        # Give the backend time to train at least one model.
        time.sleep(80)
        # A prediction should now be available and reasonably accurate.
        score = self.mse(model.predict(self.X), self.y)
        self.assertTrue(score < 0.9)
        # The explicit validation dataset must drive the validation scheme.
        self.assertEqual(model.selected_algorithm.validation_scheme,
                         "With dataset")
Ejemplo n.º 4
0
 def test_wrong_input_dim(self):
     """fit must raise IncorrectInputDataException when X and y lengths differ."""
     with self.assertRaises(IncorrectInputDataException) as context:
         model = Mljar(project=self.proj_title, experiment=self.expt_title)
         n_rows, n_cols = 100, 10
         X = np.random.rand(n_rows, n_cols)
         # y deliberately carries one extra row to trigger the mismatch.
         y = np.random.choice([0, 1], n_rows + 1, replace=True)
         model.fit(X, y)
Ejemplo n.º 5
0
def mljar_compute(X_train, y_train, X_test, dataset_id, seed):
    """Train MLJAR models on (X_train, y_train) and return predictions for X_test.

    Project and experiment names are derived from dataset_id and seed.
    Requires the MLJAR_TOKEN environment variable to be set; raises a plain
    Exception when it is missing.
    """
    from mljar import Mljar
    # Token must come from the environment (set MLJAR_TOKEN before running).
    if 'MLJAR_TOKEN' not in os.environ:
        raise Exception('Missing MLJAR_TOKEN, please set it.')

    project_name = 'DatasetId_{0}'.format(dataset_id)
    experiment_name = 'Seed_{0}'.format(seed)
    mlj = Mljar(project_name,
                experiment_name,
                metric='logloss',
                algorithms=['xgb', 'lgb', 'rfc', 'etc', 'mlp'],
                tuning_mode='Sport',
                create_ensemble=True,
                single_algorithm_time_limit=10)

    mlj.fit(X_train, y_train)
    return mlj.predict(X_test)
Ejemplo n.º 6
0
 def test_usage_with_defaults(self):
     '''
     All-defaults usage: default validation is 5-fold CV with shuffle and
     stratify, and predictions should be reasonably accurate.
     '''
     model = Mljar(project=self.proj_title, experiment=self.expt_title)
     self.assertTrue(model is not None)
     # Start training without waiting for every model to finish.
     model.fit(X=self.X, y=self.y, wait_till_all_done=False)
     # Defaults queue many models, so allow extra time for the first to train.
     time.sleep(120)
     # At least one model should now be able to serve predictions.
     score = self.mse(model.predict(self.X), self.y)
     self.assertTrue(score < 0.5)
     # The default validation scheme must be reported on the chosen model.
     self.assertEqual(model.selected_algorithm.validation_scheme,
                      "5-fold CV, Shuffle, Stratify")
Ejemplo n.º 7
0
 def test_basic_usage(self):
     '''
     The most common usage: blocking fit followed by predict.
     '''
     model = Mljar(project=self.proj_title,
                   experiment=self.expt_title,
                   algorithms=['xgb'],
                   metric='logloss',
                   validation_kfolds=3,
                   tuning_mode='Normal')
     self.assertTrue(model is not None)
     # Blocking fit: returns only after all models are trained.
     model.fit(X=self.X, y=self.y)
     # Predictions on the training data should score well.
     score = self.mse(model.predict(self.X), self.y)
     self.assertTrue(score < 0.1)
Ejemplo n.º 8
0
def main():
    """Download the current Numer.ai round, train MLJAR models, and submit predictions.

    Python 2 script: uses print statements. Relies on module-level constants
    NUMERAI_USER/NUMERAI_PASS and the *_FNAME paths defined elsewhere in the file.
    """
    # login to numerai to get token
    print 'Login into Numer.ai'
    api = numerapi.NumerAPI(NUMERAI_USER, NUMERAI_PASS)
    # download the current dataset only if the training file is not cached locally
    print 'Get dataset'
    if not os.path.isfile(TRAIN_FNAME):
        api.download_current_dataset(dest_path='.', unzip=True)
    # read datasets
    train = pd.read_csv(TRAIN_FNAME)
    test = pd.read_csv(TEST_FNAME)
    print 'Numer.ai data downloaded'
    print 'Train shape', train.shape, 'test shape', test.shape

    # first 50 columns are the features; 'target' is the label column
    X_train = train[train.columns[:50]]
    y_train = train['target']
    X_test = test

    print 'Create MLJAR project and experiment'
    models = Mljar(
        project='Auto-trading',
        experiment="Raw data",
        metric='logloss',
        validation_kfolds=5,  # we will use 5-fold CV with stratify and shuffle
        validation_shuffle=True,
        validation_stratify=True,
        algorithms=['xgb', 'lgb',
                    'mlp'],  # select Xgboost, LightGBM and Neural Network
        tuning_mode=
        'Normal',  # number of models to be checked for each algorithm
        single_algorithm_time_limit=5)  # 5 minutes for training single model

    print 'Train models:'
    # fit models - that's all, only one line of code ;)
    models.fit(X_train, y_train)
    # get predictions on test data
    predictions = models.predict(X_test)
    # save predictions to file
    predictions.to_csv(PREDICTIONS_FNAME, index=False)
    # upload predictions to Numer.ai and report the returned accuracy score
    result = api.upload_prediction(PREDICTIONS_FNAME)
    print 'Your score:', result['submission']['accuracy_score']
Ejemplo n.º 9
0
 def test_usage_with_train_split(self):
     '''
     validation_train_split=0.8 must yield the 80/20 split validation scheme.
     '''
     model = Mljar(project=self.proj_title,
                   experiment=self.expt_title,
                   validation_train_split=0.8,
                   algorithms=['xgb'],
                   tuning_mode='Normal')
     self.assertTrue(model is not None)
     # Start training without blocking on completion.
     model.fit(X=self.X, y=self.y, wait_till_all_done=False)
     # Give the backend time to train at least one model.
     time.sleep(60)
     # A prediction should now be available and reasonably accurate.
     score = self.mse(model.predict(self.X), self.y)
     self.assertTrue(score < 0.5)
     # The reported scheme must reflect the requested 80/20 split.
     self.assertEqual(model.selected_algorithm.validation_scheme,
                      "Split 80/20, Shuffle, Stratify")
Ejemplo n.º 10
0
    def test_non_wait_fit(self):
        '''
        Test fit with wait_till_all_done=False: the call should return quickly,
        and a prediction becomes available once at least one model is trained.
        Python 2 block (print statement near the end).
        '''
        model = Mljar(project=self.proj_title,
                      experiment=self.expt_title,
                      algorithms=['xgb'],
                      metric='logloss',
                      validation_kfolds=3,
                      tuning_mode='Normal',
                      single_algorithm_time_limit=1)
        self.assertTrue(model is not None)
        # fit models, just start computation and do not wait
        start_time = time.time()
        model.fit(X=self.X, y=self.y, wait_till_all_done=False)
        end_time = time.time()
        # time to initialize models should not be greater than 5 minutes
        self.assertTrue(end_time - start_time < 5 * 60)
        # Poll for a prediction: a good model is not guaranteed this early,
        # but at least one should appear. Give up after max_trys polls
        # (10 s apart) so the test cannot hang forever.
        max_trys = 50
        pred = None
        while True:
            pred = model.predict(self.X)
            if pred is None:
                # there is no model ready, please wait
                time.sleep(10)
            else:
                break
            max_trys -= 1
            if max_trys <= 0:
                break

        self.assertTrue(pred is not None)
        # get MSE
        score = self.mse(pred, self.y)
        print 'Score', score
        self.assertTrue(score < 0.99)
Ejemplo n.º 11
0
                      random_state=0)
    with open(fnCache, 'wb') as handle:
        pickle.dump((X, y), handle)

print("X", X.transpose())
print("y", y)

########################
# Random Forest via MLJAR, 5-fold cross-validation (stratified, no shuffle).
print("MLJAR Random forest classification")
validation_kfolds = 5
clf_mlj = Mljar(
    project='Recon mljar-sklearn',
    experiment='Ex 3.1',  # use ex2 cached data, but with 5-fold cross-validation
    metric='auc',
    algorithms=['rfc'],
    validation_kfolds=validation_kfolds,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=None,
    tuning_mode='Normal',
    # NOTE(review): time limit is a string here but an int in other examples —
    # confirm the API accepts both forms.
    create_ensemble=False,
    single_algorithm_time_limit='1')

print("fit")
clf_mlj.fit(X, y)  #,dataset_title="sklearn make_blobs + k-fold")

# Until https://github.com/mljar/mljar-api-python/issues/2
# gets fixed,
# manually get the result to use in the prediction
# by browsing mljar.com
from mljar.client.result import ResultClient
client = ResultClient(clf_mlj.project.hid)
# Show first lines of our dataset
df.head()

# First 14 columns will be input for classifier and the last one will be a target
input_cols = df.columns[:14]
target_col = df.columns[14]

# Let's define MLJAR project
mljar = Mljar(
    project=
    'UCI-Adult',  # project's title that we will use to find it among our projects in MLJAR
    experiment='Try tree methods',  # experiment's title
    validation_kfolds=5,  # we will use 5-fold CV with stratify and shuffle
    validation_shuffle=True,
    validation_stratify=True,
    metric='auc',  # we will use Area Under ROC Curve (AUC) as a metric,
    tuning_mode='Normal',  # select tuning mode
    algorithms=[
        'rfc', 'xgb', 'lgb'
    ],  # we want to tune Random Forest, LightGBM and Xgboost models
    create_ensemble=True,  # create ensemble of all models,
    single_algorithm_time_limit=
    5  # time limit for single algorithm training in minutes
)

# Fit: uploads the data and trains/tunes all requested models (blocking)
mljar.fit(df[input_cols], df[target_col])

# Print out the most useful algorithm (Python 2 print statement)
print str(mljar.selected_algorithm)

# Run prediction on train dataset just for example
Ejemplo n.º 13
0
 def test_predict_without_fit(self):
     """Calling predict before any fit should yield None rather than raising."""
     model = Mljar(project=self.proj_title, experiment=self.expt_title)
     pred = model.predict(self.X)
     self.assertTrue(pred is None)
Ejemplo n.º 14
0
 def test_default_tuning_mode(self):
     """A model built without tuning_mode falls back to MLJAR_DEFAULT_TUNING_MODE."""
     mljar_model = Mljar(project=self.proj_title, experiment=self.expt_title)
     self.assertEqual(mljar_model.tuning_mode, MLJAR_DEFAULT_TUNING_MODE)
Ejemplo n.º 15
0
 def test_wrong_tuning_mode(self):
     """An unknown tuning_mode value must raise BadValueException."""
     with self.assertRaises(BadValueException) as context:
         model = Mljar(project=self.proj_title,
                       experiment=self.expt_title,
                       tuning_mode='Crazy')
Ejemplo n.º 16
0
 def test_empty_project_title(self):
     """Empty project and experiment titles are rejected with BadValueException."""
     with self.assertRaises(BadValueException) as context:
         model = Mljar(project='', experiment='')
    with open(fnCache, 'wb') as handle:
        pickle.dump((X, y), handle)

print("X", X.transpose())
print("y", y)

########################
# Random Forest via MLJAR, 95/5 train split (no k-fold), stratified, no shuffle.
print("MLJAR Random forest classification")
clf_mlj = Mljar(
    project='Recon mljar-sklearn',
    # experiment='Ex 2.1', # validation_train_split=0.05
    # experiment='Ex 2.2', # manual through website
    # experiment='Ex 2.3', # validation_train_split=0.95
    experiment=
    'Ex 2.4',  # cache the data locally at the same time as uploading to be able to replicate results
    metric='auc',
    algorithms=['rfc'],
    validation_kfolds=None,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=0.95,
    tuning_mode='Normal',
    create_ensemble=False,
    # NOTE(review): string literal; other examples pass an int — confirm API accepts both.
    single_algorithm_time_limit='1')

print("fit")
clf_mlj.fit(X, y)  #,dataset_title="sklearn make_blobs")

# from mljar.client.project import ProjectClient
# clf_mlj.project = ProjectClient().create_project_if_not_exists(clf_mlj.project_title, clf_mlj.project_task)

# Until https://github.com/mljar/mljar-api-python/issues/2
Ejemplo n.º 18
0
# X[half:n_samples,1] = 1
# Target is simply the first feature column of X.
y = X[:, 0]

print("X", X.transpose())
print("y", y)

########################
# Random Forest via MLJAR, 95/5 train split, stratified, no shuffle.
print("MLJAR Random forest classification")
clf_mlj = Mljar(
    project='Recon mljar-sklearn',
    # experiment='Ex 1.1', # for n_samples = 50
    # experiment='Ex 1.2', # for n_samples = 100, and 2 classes: 0, 1
    # experiment='Ex 1.3', # for n_samples = 100, and 2 classes: 0, 2
    # experiment='Ex 1.4', # for n_samples = 100, and 2 classes: 0, 2, and just re-run anew for probabilities
    experiment='Ex 1.5',  # as 1.4, but with validation_train_split=0.95
    metric='auc',
    algorithms=['rfc'],
    validation_kfolds=None,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=0.95,
    tuning_mode='Normal',  # Used Sport for experiments 1-3
    create_ensemble=False,
    # NOTE(review): string literal; other examples pass an int — confirm API accepts both.
    single_algorithm_time_limit='1')

print("fit")
clf_mlj.fit(X, y)  #,dataset_title="Ones and zeros")

print("predict")
pred_proba_mlj = clf_mlj.predict(X)
# predict returns a DataFrame; squeeze to a 1-D array of values
pred_proba_mlj = pred_proba_mlj.squeeze().values
print("pred_proba_mlj", pred_proba_mlj)  # shows values = 0 or 1