def _get_data(self, ml_repo: MLRepo):
    """Return the names of the data sets this object refers to.

    Args:
        ml_repo (MLRepo): Repository queried for data set names; only used
            when ``self.data`` is None.

    Returns:
        ``self.data`` itself if it is not None. Otherwise a new list holding
        the names of all TEST_DATA objects followed by the names of all
        TRAINING_DATA objects found in ``ml_repo``.
    """
    # An explicit selection always wins over the repository-wide default.
    if self.data is not None:
        return self.data

    names = []
    for category in (MLObjectType.TEST_DATA, MLObjectType.TRAINING_DATA):
        names.extend(ml_repo.get_names(category))
    return names
def _get_models(self, ml_repo: MLRepo):
    """Collect the model versions this object refers to, grouped by name.

    Args:
        ml_repo (MLRepo): Repository used to look up calibrated models and
            labels.

    Returns:
        defaultdict(set): Maps a name to the set of versions collected for it.
        Entries come from two sources:

        * models: the ``name -> version`` pairs of ``self.models``, or, if
          ``self.models`` is None, every CALIBRATED_MODEL in the repository
          together with the version stored in its repo_info.
        * labels: for each name in ``self.labels`` (or every LABEL in the
          repository if ``self.labels`` is None) the ``name`` and ``version``
          attributes of the label object (presumably the labelled model and
          its version -- confirm against the label class).
    """
    versions_by_model = defaultdict(set)

    if self.models is not None:
        for model_name, model_version in self.models.items():
            versions_by_model[model_name].add(model_version)
    else:
        for model_name in ml_repo.get_names(MLObjectType.CALIBRATED_MODEL):
            # Only the version stored in repo_info is needed, so skip
            # loading the full object.
            model = ml_repo.get(model_name, full_object=False)
            versions_by_model[model_name].add(
                model.repo_info[RepoInfoKey.VERSION])

    label_names = (self.labels if self.labels is not None
                   else ml_repo.get_names(MLObjectType.LABEL))
    for label_name in label_names:
        label = ml_repo.get(label_name)
        versions_by_model[label.name].add(label.version)

    return versions_by_model
def _get_measure_types(self, ml_repo: MLRepo, reg_test=None):
    """Return the measure types to be used.

    Args:
        ml_repo (MLRepo): Repository used to load the test definition and,
            if needed, the measure configuration.
        reg_test: Test definition to read the measures from. If None, the
            latest version of ``self.test_definition`` is loaded from
            ``ml_repo``.

    Returns:
        ``reg_test.measures`` if it is not None. Otherwise a list with
        ``MeasureConfiguration.get_name(...)`` applied to every entry of the
        first measure configuration found in the repository.

    Raises:
        Exception: If the test definition has no measures and the repository
            contains no measure configuration.
    """
    if reg_test is None:
        reg_test = ml_repo.get(self.test_definition, version=LAST_VERSION)

    configured = reg_test.measures
    if configured is not None:
        return configured

    # No measures on the test definition: fall back to the first measure
    # configuration stored in the repository.
    config_names = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
    if len(config_names) == 0:
        raise Exception(
            'No regression test possible since no measure defined.')
    measure_config = ml_repo.get(config_names[0], version=LAST_VERSION)
    return [MeasureConfiguration.get_name(spec)
            for spec in measure_config.measures.values()]
def _check(self, ml_repo: MLRepo):
    """Check whether this test is still consistent with the repository.

    The versions recorded in ``self.repo_info.modification_info`` are
    compared against the current state of ``ml_repo``. The checks run in
    order and the first failing one determines the result:

    1. the test definition is the latest version,
    2. if the test definition has no measures of its own, the first measure
       configuration in the repository is the latest version,
    3. the reference label is recorded and has the recorded version,
    4. the data is recorded and has the recorded version (``LAST_VERSION`` /
       ``FIRST_VERSION`` in ``self.data_version`` are resolved first).

    Args:
        ml_repo (MLRepo): Repository to check against.

    Returns:
        str or None: A description of the first inconsistency found, or None
        if all checks pass.

    Raises:
        Exception: If the test definition has no measures and the repository
            contains no measure configuration.
    """
    # Recorded versions, keyed by object name. Every lookup -- comparisons
    # AND messages -- goes through this mapping; the messages previously read
    # ``self.modification_info``, which is not the attribute used by the
    # comparisons and made building the message fail.
    used_versions = self.repo_info.modification_info

    # check if test is based on latest test definition
    regression_test = ml_repo.get(self.test_definition, version=LAST_VERSION)
    if regression_test.repo_info.version != used_versions[self.test_definition]:
        # str() keeps the concatenation safe even for non-string versions.
        return ('Test is not based on latest test definition, latest version: '
                + str(regression_test.repo_info.version)
                + ', version used for test: '
                + str(used_versions[self.test_definition]))

    # check if measure config did not change
    if regression_test.measures is None:
        config_names = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
        if len(config_names) == 0:
            raise Exception('No check possible since no measure defined.')
        m_config = ml_repo.get(config_names[0], version=LAST_VERSION)
        if m_config.repo_info.version != used_versions[m_config.repo_info.name]:
            return ('Test is not based on latest measure configuration, latest version: '
                    + str(m_config.repo_info.version)
                    + ', version used for test: '
                    + str(used_versions[m_config.repo_info.name]))

    # check if ref model did not change
    label = ml_repo.get(regression_test.reference, version=LAST_VERSION)
    if label.repo_info.name not in used_versions:
        return 'Test on different reference model.'
    if label.repo_info.version != used_versions[label.repo_info.name]:
        return 'Test on old reference model.'

    # check if test was on latest data version
    if self.data not in used_versions:
        return 'Data of test has changed since last test.'
    version = self.data_version
    # Resolve symbolic versions to concrete ones before comparing.
    if version == LAST_VERSION:
        version = ml_repo._ml_repo.get_latest_version(self.data)
    elif version == FIRST_VERSION:
        version = ml_repo._ml_repo.get_first_version(self.data)
    if version != used_versions[self.data]:
        return 'Data of test has changed since last test.'
    return None
def test_tutorial(self):
    """Run the tutorial workflow end to end on a disk-backed repository.

    The steps are: create the repository, add raw data and data sets, add a
    model, run training / evaluation / measures, set a label, define and run
    a regression test, and run the consistency checker after several
    modifications. Results are only printed; the test fails if any step
    raises (the job runner is configured to throw job errors).

    NOTE(review): the paired ``# <name>`` / ``# end <name>`` comments look
    like snippet markers consumed by the documentation build -- keep them and
    the statements between them unchanged unless the docs are updated too.
    """
    # cleanup disk before running
    repo_path = './tmp_tutorial'
    try:
        shutil.rmtree(repo_path)
    except OSError:
        # Nothing to clean up (e.g. the folder does not exist yet).
        pass

    # creating in memory storage
    ml_repo = MLRepo(user='******')
    # end creating in memory storage

    # creating new repository
    config = {'user': '******',
              'workspace': repo_path,
              'repo_store':
              {
                  'type': 'disk_handler',
                  'config': {
                      'folder': repo_path,
                      'file_format': 'pickle'
                  }
              },
              'numpy_store':
              {
                  'type': 'hdf_handler',
                  'config': {
                      'folder': repo_path,
                      'version_files': True
                  }
              },
              'job_runner':
              {
                  'type': 'simple',
                  'config': {
                      'throw_job_error': True
                  }
              }
              }
    ml_repo = MLRepo(user='******', config=config)
    # end creating new repository
    # specifying job runner
    job_runner = SimpleJobRunner(None)
    job_runner.set_repo(ml_repo)
    ml_repo._job_runner = job_runner
    # end specifying job runner
    job_runner._throw_job_error = True
    from pailab.tools.tree import MLTree
    MLTree.add_tree(ml_repo)
    # A convenient way to add RawData is simply to use the method add on the raw_data collection.
    # This method just takes a pandas dataframe and the specification, which columns belong to the input
    # and which to the targets.

    try:
        # read pandas
        import pandas as pd
        data = pd.read_csv('./examples/boston_housing/housing.csv')
        # end read pandas
    except OSError:
        # The file was not found relative to the current working directory;
        # retry one level up. Only OSError (which includes FileNotFoundError)
        # is caught so that a failing pandas import or a parse error is
        # reported as such instead of being masked.
        data = pd.read_csv('../examples/boston_housing/housing.csv')

    # extract data
    input_variables = ['RM', 'LSTAT', 'PTRATIO']
    target_variables = ['MEDV']
    x = data.loc[:, input_variables].values
    y = data.loc[:, target_variables].values
    # end extract data

    # add RawData snippet
    from pailab import RawData, RepoInfoKey
    raw_data = RawData(x, input_variables, y, target_variables, repo_info={
        RepoInfoKey.NAME: 'raw_data/boston_housing'})
    ml_repo.add(raw_data)
    # end adding RawData snippet

    # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
    #    'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])

    # add DataSet
    # create DataSet objects for training and test data
    training_data = DataSet('raw_data/boston_housing', 0, 300,
                            repo_info={RepoInfoKey.NAME: 'training_data', RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
    test_data = DataSet('raw_data/boston_housing', 301, None,
                        repo_info={RepoInfoKey.NAME: 'test_data', RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
    # add the objects to the repository
    version_list = ml_repo.add(
        [training_data, test_data], message='add training and test data')
    # end adding DataSet

    # add model
    import pailab.externals.sklearn_interface as sklearn_interface
    from sklearn.tree import DecisionTreeRegressor
    sklearn_interface.add_model(
        ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
    # end adding model

    # run training
    job_id = ml_repo.run_training()
    # end running training

    # run evaluation
    job_id = ml_repo.run_evaluation()
    # end running evaluation

    # add measures snippet
    ml_repo.add_measure(MeasureConfiguration.MAX)
    ml_repo.add_measure(MeasureConfiguration.R2)
    # end add measure snippet

    # run measures snippet
    job_ids = ml_repo.run_measures()
    # end run measures snippet

    print(ml_repo.get_names(MLObjectType.MEASURE))

    # get measures
    max_measure = ml_repo.get(
        'DecisionTreeRegressor/measure/training_data/max')
    print(str(max_measure.value))
    # end getting measures

    # label snippet
    from pailab import LAST_VERSION
    ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                      model_version=LAST_VERSION, message='we found our first production model')
    # end label snippet

    # test definition snippet
    import pailab.tools.tests
    reg_test = pailab.tools.tests.RegressionTestDefinition(
        reference='prod', models=None, data=None, labels=None,
        measures=[MeasureConfiguration.MAX], tol=1000)
    reg_test.repo_info.name = 'reg_test'
    ml_repo.add(reg_test, message='regression test definition')
    # end test definition snippet

    # add test snippet
    tests = ml_repo.run_tests()
    # end add test snippet
    print(tests)

    # run check snippet
    import pailab.tools.checker as checker
    inconsistencies = checker.run(ml_repo)
    # end run check snippet
    print(inconsistencies)

    # add inconsistency snippet
    param = ml_repo.get('DecisionTreeRegressor/model_param')
    param.sklearn_params['max_depth'] = 2
    version = ml_repo.add(param)
    # end add inconsistency snippet
    inconsistencies = checker.run(ml_repo)
    print(inconsistencies)

    ml_repo.run_training()
    inconsistencies = checker.run(ml_repo)
    print(inconsistencies)

    ml_repo.run_evaluation(run_descendants=True)
    print(checker.run(ml_repo))

    # add second test data snippet
    test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                          repo_info={RepoInfoKey.NAME: 'test_data_2',
                                     RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA}
                          )
    ml_repo.add(test_data_2)
    ml_repo.run_evaluation(run_descendants=True)
    # end add second test data snippet

    print(checker.Tests.run(ml_repo))

    ml_repo.run_tests()

    # check tests
    print(checker.Tests.run(ml_repo))
    # end check tests

    # cleanup after running
    # job_runner.close_connection()
    ml_repo._ml_repo.close_connection()
    try:
        shutil.rmtree(repo_path)
    except OSError:
        # Best-effort cleanup; a leftover folder is removed at the start of
        # the next run.
        pass