def test_compute_prediction(self):
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  algorithms=['rfc'],
                  metric='logloss',
                  validation_kfolds=3,
                  tuning_mode='Normal',
                  single_algorithm_time_limit=1)
    self.assertTrue(model is not None)
    # fit models and wait till all models are trained
    model.fit(X=self.X, y=self.y, dataset_title='My dataset')
    # get project id
    project_id = model.project.hid
    # get model id
    model_id = model.selected_algorithm.hid
    dc = DatasetClient(project_id)
    init_datasets_cnt = len(dc.get_datasets())
    # compute predictions
    pred = Mljar.compute_prediction(self.X, model_id, project_id)
    # compute score
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.9)
    # check if dataset was removed
    self.assertEqual(init_datasets_cnt, len(dc.get_datasets()))
    # run predictions again, but keep dataset
    pred = Mljar.compute_prediction(self.X, model_id, project_id, keep_dataset=True)
    self.assertEqual(init_datasets_cnt + 1, len(dc.get_datasets()))  # should be one more
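# The test methods in this section rely on a unittest scaffold that is not
# shown here. A minimal sketch of it, assuming iris-like CSV fixtures and a
# mean-squared-error helper; the file name 'tests/data/test_1.csv', the
# project/experiment titles, and the import paths for DatasetClient and
# IncorrectInputDataException are assumptions inferred from the tests:
import time
import unittest
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from mljar import Mljar
from mljar.client.dataset import DatasetClient             # path inferred, may differ
from mljar.exceptions import IncorrectInputDataException   # path inferred, may differ

class MljarTest(unittest.TestCase):
    def setUp(self):
        self.proj_title = 'Test project'       # hypothetical title
        self.expt_title = 'Test experiment'    # hypothetical title
        df = pd.read_csv('tests/data/test_1.csv')  # hypothetical training fixture
        cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
        self.X = df[cols]
        self.y = df['class']

    def mse(self, pred, y_true):
        # predictions are assumed to come back as a one-column DataFrame
        return mean_squared_error(y_true, pred.squeeze())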
def test_usage_with_validation_dataset(self):
    ''' Test usage with a validation dataset. '''
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  algorithms=['xgb'],
                  tuning_mode='Normal',
                  single_algorithm_time_limit=1)
    self.assertTrue(model is not None)
    # load validation data
    df = pd.read_csv('tests/data/test_1_vald.csv')
    cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
    target = 'class'
    X_vald = df[cols]
    y_vald = df[target]
    # start fitting models, do not wait till all of them are trained
    model.fit(X=self.X, y=self.y, validation_data=(X_vald, y_vald), wait_till_all_done=False)
    # wait some time
    time.sleep(80)
    # run prediction
    pred = model.predict(self.X)
    # get MSE
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.9)
    # check that validation with the provided dataset was used
    self.assertEqual(model.selected_algorithm.validation_scheme, "With dataset")
def test_wrong_input_dim(self):
    ''' Fitting with mismatched X and y dimensions should raise an exception. '''
    with self.assertRaises(IncorrectInputDataException) as context:
        model = Mljar(project=self.proj_title, experiment=self.expt_title)
        samples = 100
        columns = 10
        X = np.random.rand(samples, columns)
        # y has one label more than X has rows
        y = np.random.choice([0, 1], samples + 1, replace=True)
        model.fit(X, y)
def mljar_compute(X_train, y_train, X_test, dataset_id, seed):
    import os
    from mljar import Mljar
    # os.environ['MLJAR_TOKEN'] = ''  # set token here or in your env
    if 'MLJAR_TOKEN' not in os.environ:
        raise Exception('Missing MLJAR_TOKEN, please set it.')
    mlj = Mljar('DatasetId_{0}'.format(dataset_id),
                'Seed_{0}'.format(seed),
                metric='logloss',
                algorithms=['xgb', 'lgb', 'rfc', 'etc', 'mlp'],
                tuning_mode='Sport',
                create_ensemble=True,
                single_algorithm_time_limit=10)
    mlj.fit(X_train, y_train)
    response = mlj.predict(X_test)
    return response
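# A hypothetical driver for mljar_compute above; the synthetic data, the
# train/test split, and the dataset_id/seed values are illustrative
# assumptions, not part of the original snippet:
if __name__ == '__main__':
    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.random.rand(200, 10)
    y = np.random.choice([0, 1], 200)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)
    pred = mljar_compute(X_train, y_train, X_test, dataset_id=1, seed=1234)
    print(pred)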
def test_usage_with_defaults(self):
    ''' Test usage with defaults. '''
    model = Mljar(project=self.proj_title, experiment=self.expt_title)
    self.assertTrue(model is not None)
    # start fitting models, do not wait till all of them are trained
    model.fit(X=self.X, y=self.y, wait_till_all_done=False)
    # wait a little longer - there are a lot of models
    time.sleep(120)
    # run prediction
    pred = model.predict(self.X)
    # get MSE
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.5)
    # check default validation
    self.assertEqual(model.selected_algorithm.validation_scheme, "5-fold CV, Shuffle, Stratify")
def test_basic_usage(self):
    ''' Test the most common usage. '''
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  algorithms=['xgb'],
                  metric='logloss',
                  validation_kfolds=3,
                  tuning_mode='Normal')
    self.assertTrue(model is not None)
    # fit models and wait till all models are trained
    model.fit(X=self.X, y=self.y)
    # run prediction
    pred = model.predict(self.X)
    # get MSE
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.1)
def main():
    # log in to Numer.ai to get a token
    print('Login into Numer.ai')
    api = numerapi.NumerAPI(NUMERAI_USER, NUMERAI_PASS)
    # get datasets
    print('Get dataset')
    if not os.path.isfile(TRAIN_FNAME):
        api.download_current_dataset(dest_path='.', unzip=True)
    # read datasets
    train = pd.read_csv(TRAIN_FNAME)
    test = pd.read_csv(TEST_FNAME)
    print('Numer.ai data downloaded')
    print('Train shape', train.shape, 'test shape', test.shape)
    X_train = train[train.columns[:50]]
    y_train = train['target']
    X_test = test
    print('Create MLJAR project and experiment')
    models = Mljar(project='Auto-trading',
                   experiment='Raw data',
                   metric='logloss',
                   validation_kfolds=5,        # we will use 5-fold CV with stratify and shuffle
                   validation_shuffle=True,
                   validation_stratify=True,
                   algorithms=['xgb', 'lgb', 'mlp'],  # select Xgboost, LightGBM and Neural Network
                   tuning_mode='Normal',       # controls how many models are checked for each algorithm
                   single_algorithm_time_limit=5)     # 5 minutes for training a single model
    print('Train models:')
    # fit models - that's all, only one line of code ;)
    models.fit(X_train, y_train)
    # get predictions on test data
    predictions = models.predict(X_test)
    # save predictions to file
    predictions.to_csv(PREDICTIONS_FNAME, index=False)
    result = api.upload_prediction(PREDICTIONS_FNAME)
    print('Your score:', result['submission']['accuracy_score'])
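# main() above relies on module-level imports and constants that are not
# shown; a minimal sketch of them - the credential handling and the file
# names are assumptions:
import os
import numerapi
import pandas as pd
from mljar import Mljar

NUMERAI_USER = os.environ.get('NUMERAI_USER', '')
NUMERAI_PASS = os.environ.get('NUMERAI_PASS', '')
TRAIN_FNAME = 'numerai_training_data.csv'   # hypothetical file name
TEST_FNAME = 'numerai_tournament_data.csv'  # hypothetical file name
PREDICTIONS_FNAME = 'predictions.csv'       # hypothetical file name

if __name__ == '__main__':
    main()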
def test_retrive_models(self):
    '''
    Test the scenario where the user creates a project, fits models, and then
    runs the project once again. In this case there are no additional
    computations; all models are simply retrieved from the existing project.
    '''
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  algorithms=['xgb'],
                  metric='logloss',
                  validation_kfolds=3,
                  tuning_mode='Normal',
                  single_algorithm_time_limit=1)
    self.assertTrue(model is not None)
    # fit models and wait till all models are trained
    model.fit(X=self.X, y=self.y)
    # run prediction
    pred = model.predict(self.X)
    # get MSE
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.1)
    # re-use already trained models: call fit again - the models are already
    # trained and should be retrieved, which should not take longer than 3 minutes
    start_time = time.time()
    model.fit(X=self.X, y=self.y)
    end_time = time.time()
    self.assertTrue(end_time - start_time < 3 * 60)
    # check prediction
    pred = model.predict(self.X)
    # get MSE
    score_2 = self.mse(pred, self.y)
    self.assertTrue(score_2 < 0.1)
    # scores should be the same
    self.assertTrue(np.abs(score - score_2) < 1e-3)
    # re-use the whole project
    start_time = time.time()
    model_2 = Mljar(project=self.proj_title,
                    experiment=self.expt_title,
                    algorithms=['xgb'],
                    metric='logloss',
                    validation_kfolds=3,
                    tuning_mode='Normal',
                    single_algorithm_time_limit=1)
    self.assertTrue(model_2 is not None)
    # re-use trained models
    model_2.fit(X=self.X, y=self.y)
    end_time = time.time()
    # it should not take longer than 5 minutes
    self.assertTrue(end_time - start_time < 5 * 60)
    # run prediction
    pred = model_2.predict(self.X)
    # get MSE
    score_3 = self.mse(pred, self.y)
    self.assertTrue(score_3 < 0.1)
    # scores should be the same
    self.assertTrue(np.abs(score - score_3) < 1e-3)
def test_usage_with_train_split(self):
    ''' Test usage with train split. '''
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  validation_train_split=0.8,
                  algorithms=['xgb'],
                  tuning_mode='Normal')
    self.assertTrue(model is not None)
    # start fitting models, do not wait till all of them are trained
    model.fit(X=self.X, y=self.y, wait_till_all_done=False)
    # wait some time
    time.sleep(60)
    # run prediction
    pred = model.predict(self.X)
    # get MSE
    score = self.mse(pred, self.y)
    self.assertTrue(score < 0.5)
    # check the train/test split validation scheme
    self.assertEqual(model.selected_algorithm.validation_scheme, "Split 80/20, Shuffle, Stratify")
def test_non_wait_fit(self):
    ''' Test the non-wait fit. '''
    model = Mljar(project=self.proj_title,
                  experiment=self.expt_title,
                  algorithms=['xgb'],
                  metric='logloss',
                  validation_kfolds=3,
                  tuning_mode='Normal',
                  single_algorithm_time_limit=1)
    self.assertTrue(model is not None)
    # fit models: just start the computation and do not wait
    start_time = time.time()
    model.fit(X=self.X, y=self.y, wait_till_all_done=False)
    end_time = time.time()
    # time to initialize models should not be greater than 5 minutes
    self.assertTrue(end_time - start_time < 5 * 60)
    # run prediction: a good model is not guaranteed yet,
    # but there should be at least one
    max_trys = 50
    pred = None
    while True:
        pred = model.predict(self.X)
        if pred is None:
            # there is no model ready yet, please wait
            time.sleep(10)
        else:
            break
        max_trys -= 1
        if max_trys <= 0:
            break
    self.assertTrue(pred is not None)
    # get MSE
    score = self.mse(pred, self.y)
    print('Score', score)
    self.assertTrue(score < 0.99)
    # experiment='Ex 1.2',  # for n_samples = 100, and 2 classes: 0, 1
    # experiment='Ex 1.3',  # for n_samples = 100, and 2 classes: 0, 2
    # experiment='Ex 1.4',  # for n_samples = 100, and 2 classes: 0, 2, and just re-run anew for probabilities
    experiment='Ex 1.5',  # as 1.4, but with validation_train_split=0.95
    metric='auc',
    algorithms=['rfc'],
    validation_kfolds=None,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=0.95,
    tuning_mode='Normal',  # used 'Sport' for experiments 1-3
    create_ensemble=False,
    single_algorithm_time_limit='1')

print("fit")
clf_mlj.fit(X, y)  # ,dataset_title="Ones and zeros")
print("predict")
pred_proba_mlj = clf_mlj.predict(X)
pred_proba_mlj = pred_proba_mlj.squeeze().values
print("pred_proba_mlj", pred_proba_mlj)  # shows values = 0 or 1
pred_mlj = [2 if x == 1 else 0 for x in pred_proba_mlj]
print("prediction mljar == actual", (pred_mlj == y).all())  # returns True

# mljar_fit_params = {'max_features': 0.5, 'min_samples_split': 50, 'criterion': "gini", 'min_samples_leaf': 1}
# mljar_fit_params = {'max_features': 0.7, 'min_samples_split': 4, 'criterion': "entropy", 'min_samples_leaf': 2}
mljar_fit_params = clf_mlj.selected_algorithm.params['model_params']['fit_params']
print("mljar_fit_params", mljar_fit_params)

########################
target_col = df.columns[14]

# Let's define the MLJAR project
mljar = Mljar(
    project='UCI-Adult',            # project's title, used to find it among our projects in MLJAR
    experiment='Try tree methods',  # experiment's title
    validation_kfolds=5,            # we will use 5-fold CV with stratify and shuffle
    validation_shuffle=True,
    validation_stratify=True,
    metric='auc',                   # we will use Area Under ROC Curve (AUC) as a metric
    tuning_mode='Normal',           # select tuning mode
    algorithms=['rfc', 'xgb', 'lgb'],  # we want to tune Random Forest, Xgboost and LightGBM models
    create_ensemble=True,           # create an ensemble of all models
    single_algorithm_time_limit=5)  # time limit for single algorithm training, in minutes

# Run models training
mljar.fit(df[input_cols], df[target_col])
# Print out the most useful algorithm
print(str(mljar.selected_algorithm))
# Run prediction on the train dataset, just as an example
pred = mljar.predict(df[input_cols])
print('Please go to your mljar account and check details of all models')
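# The snippet above assumes df and input_cols were prepared earlier; a minimal
# sketch of that prelude, assuming the standard UCI Adult CSV without a header
# row (the local file name is an assumption):
import pandas as pd

columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
           'income']
df = pd.read_csv('adult.data', names=columns, skipinitialspace=True)
input_cols = df.columns[:14]  # all columns except the target (column 14)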
    # experiment='Ex 2.2',  # manual through website
    # experiment='Ex 2.3',  # validation_train_split=0.95
    experiment='Ex 2.4',  # cache the data locally at the same time as uploading, to be able to replicate results
    metric='auc',
    algorithms=['rfc'],
    validation_kfolds=None,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=0.95,
    tuning_mode='Normal',
    create_ensemble=False,
    single_algorithm_time_limit='1')

print("fit")
clf_mlj.fit(X, y)  # ,dataset_title="sklearn make_blobs")

# from mljar.client.project import ProjectClient
# clf_mlj.project = ProjectClient().create_project_if_not_exists(clf_mlj.project_title, clf_mlj.project_task)

# Until https://github.com/mljar/mljar-api-python/issues/2 gets fixed,
# manually get the result to use in the prediction by browsing mljar.com
from mljar.client.result import ResultClient
client = ResultClient(clf_mlj.project.hid)
results = client.get_results(clf_mlj.experiment.hid)
# len(results)  # returns 75
# results = client.get_results(None)
# len(results)  # also returns 75
rid = raw_input(