def _get_data(self, ml_repo: MLRepo):
    data = []
    if self.data is None:
        data.extend(ml_repo.get_names(MLObjectType.TEST_DATA))
        data.extend(ml_repo.get_names(MLObjectType.TRAINING_DATA))
    else:
        data = self.data
    return data
def _run_test(self, ml_repo: MLRepo, jobid):
    logger.debug('Running regression test ' + self.repo_info.name + ' on model ' +
                 str(NamingConventions.CalibratedModel(self.model)) +
                 ', version ' + self.model_version)
    regression_test = ml_repo.get(self.test_definition, version=LAST_VERSION)
    label = ml_repo.get(regression_test.reference, version=LAST_VERSION)
    result = {}
    measure_types = self._get_measure_types(ml_repo, regression_test)
    for measure_type in measure_types:
        measure_name = str(NamingConventions.Measure({
            'model': self.model.split('/')[0],
            'data': self.data,
            'measure_type': measure_type
        }))
        measure = ml_repo.get(
            measure_name,
            version=None,
            modifier_versions={
                str(NamingConventions.CalibratedModel(self.model)): self.model_version,
                self.data: self.data_version
            },
            throw_error_not_exist=False,
            throw_error_not_unique=True)
        if measure == []:
            continue
        measure_name = str(NamingConventions.Measure({
            'model': label.name.split('/')[0],
            'data': self.data,
            'measure_type': measure_type
        }))
        reference_value = ml_repo.get(
            measure_name,
            version=None,
            modifier_versions={
                str(NamingConventions.CalibratedModel(label.name)): label.version,
                self.data: self.data_version
            },
            adjust_modification_info=False)
        if regression_test.relative:
            if measure.value - reference_value.value < regression_test.tol * reference_value.value:
                result[measure_type] = {
                    'reference_value': reference_value.value,
                    'value': measure.value
                }
        else:
            if measure.value - reference_value.value < regression_test.tol:
                result[measure_type] = {
                    'reference_value': reference_value.value,
                    'value': measure.value
                }
    return result
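# A minimal numeric sketch of the tolerance check in _run_test above (editorial
# illustration with made-up numbers, not part of the original source): with
# relative=True the tolerance scales with the reference value, otherwise it is
# taken as an absolute bound.
reference, candidate, tol = 1.0, 1.05, 0.1
assert candidate - reference < tol * reference  # relative check: 0.05 < 0.1 * 1.0
assert candidate - reference < tol              # absolute check: 0.05 < 0.1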
def test_regression_test(self):
    """Test the regression test framework
    """
    repo = MLRepo(user='******')
    model_1 = TestClass(1.0, 2.0, repo_info={'name': 'model'})
    model_1.repo_info.category = MLObjectType.CALIBRATED_MODEL.value
    model_version = repo.add(model_1)
    measure_1 = repo_objects.Measure(
        1.0,
        repo_info={
            'name': 'model/measure/test_data/max',
            # modification_info maps object names to the versions used,
            # presumably the model version added above
            'modification_info': {'model': model_version}
        })
def _get_measure_types(self, ml_repo: MLRepo, reg_test=None):
    if reg_test is None:
        reg_test = ml_repo.get(self.test_definition, version=LAST_VERSION)
    measure_types = reg_test.measures
    if measure_types is None:
        tmp = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
        if len(tmp) == 0:
            raise Exception(
                'No regression test possible since no measure defined.')
        m_config = ml_repo.get(tmp[0], version=LAST_VERSION)
        measure_types = [
            MeasureConfiguration.get_name(x)
            for k, x in m_config.measures.items()
        ]
    return measure_types
def _get_models(self, ml_repo: MLRepo):
    models_test = defaultdict(set)
    if self.models is None:
        tmp = ml_repo.get_names(MLObjectType.CALIBRATED_MODEL)
        for k in tmp:
            m = ml_repo.get(k, full_object=False)
            models_test[k].add(m.repo_info[RepoInfoKey.VERSION])
    else:
        for k, v in self.models.items():
            models_test[k].add(v)
    if self.labels is None:
        labels = ml_repo.get_names(MLObjectType.LABEL)
    else:
        labels = self.labels
    for l in labels:
        tmp = ml_repo.get(l)
        models_test[tmp.name].add(tmp.version)
    return models_test
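# Hedged sketch of the defaultdict(set) accumulation used in _get_models above:
# each model name maps to the set of all versions to be tested, so a version given
# explicitly and one contributed via a label are merged. Names below are made up
# for illustration.
from collections import defaultdict

versions_by_model = defaultdict(set)
versions_by_model['my_model'].add('v1')  # e.g. from self.models
versions_by_model['my_model'].add('v2')  # e.g. the version a label points to
assert versions_by_model['my_model'] == {'v1', 'v2'}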
def test_default_constructor(self):
    #example with default
    ml_repo = MLRepo(user='******')
    #end example with default
    # If one of these tests fails because the logic has been modified,
    # please update the documentation in basics.rst.
    self.assertTrue(
        isinstance(ml_repo._ml_repo, memory_handler.RepoObjectMemoryStorage))
    self.assertTrue(
        isinstance(ml_repo._numpy_repo, memory_handler.NumpyMemoryStorage))
    self.assertTrue(isinstance(ml_repo._job_runner, SimpleJobRunner))
def test_config_disk_handler(self):
    #diskhandlerconfig
    config = {
        'user': '******',
        'workspace': 'tmp',
        'repo_store': {
            'type': 'disk_handler',
            'config': {
                'folder': 'tmp/objects',
                'file_format': 'json'
            }
        },
        'numpy_store': {
            'type': 'hdf_handler',
            'config': {
                'folder': 'tmp/repo_data',
                'version_files': True
            }
        },
        'job_runner': {
            'type': 'simple',
            'config': {}
        }
    }
    # end diskhandlerconfig
    # instantiate diskhandler
    ml_repo = MLRepo(config=config)
    # end instantiate diskhandler
    # instantiate diskhandler save config
    ml_repo = MLRepo(config=config, save_config=True)
    # end instantiate diskhandler save config
    # instantiate with workspace
    ml_repo = MLRepo(workspace='tmp')
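    # Assumption (inferred from the parameter name and usage, not confirmed by
    # the source): save_config=True persists the configuration into the
    # workspace, which is what lets MLRepo(workspace='tmp') above recreate the
    # repo from the workspace path alone.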
def _check(self, ml_repo: MLRepo):
    # check if test is based on latest test definition
    regression_test = ml_repo.get(self.test_definition, version=LAST_VERSION)
    if regression_test.repo_info.version != self.repo_info.modification_info[self.test_definition]:
        return 'Test is not based on latest test definition, latest version: ' \
            + regression_test.repo_info.version + ', version used for test: ' \
            + self.repo_info.modification_info[self.test_definition]
    # check if measure config did not change
    if regression_test.measures is None:
        tmp = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
        if len(tmp) == 0:
            raise Exception('No check possible since no measure defined.')
        m_config = ml_repo.get(tmp[0], version=LAST_VERSION)
        if m_config.repo_info.version != self.repo_info.modification_info[m_config.repo_info.name]:
            return 'Test is not based on latest measure configuration, latest version: ' \
                + m_config.repo_info.version + ', version used for test: ' \
                + self.repo_info.modification_info[m_config.repo_info.name]
    # check if reference model did not change
    label = ml_repo.get(regression_test.reference, version=LAST_VERSION)
    if not label.repo_info.name in self.repo_info.modification_info.keys():
        return 'Test on different reference model.'
    if not label.repo_info.version == self.repo_info.modification_info[label.repo_info.name]:
        return 'Test on old reference model.'
    # check if test was run on latest data version
    if not self.data in self.repo_info.modification_info.keys():
        return 'Data of test has changed since last test.'
    version = self.data_version
    if version == LAST_VERSION:
        version = ml_repo._ml_repo.get_latest_version(self.data)
    elif version == FIRST_VERSION:
        version = ml_repo._ml_repo.get_first_version(self.data)
    if not version == self.repo_info.modification_info[self.data]:
        return 'Data of test has changed since last test.'
    return None
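# Taken together, _check above returns a human-readable reason string whenever the
# stored test result is stale (run against an older test definition, an older
# measure configuration, a changed reference model, or changed data) and returns
# None when the result is still current.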
parser.add_argument(
    'bigobj_handler',
    type=str,
    help='definition of store for the big object parts of the repo objects')
parser.add_argument(
    '-u',
    '--user',
    type=str,
    help='username used internally, if not specified, the operating system user is used')
args = parser.parse_args()
default_config = _get_default_config()
default_config['workspace'] = args.workspace
if args.user:
    default_config['user'] = args.user
else:
    default_config['user'] = getpass.getuser()
default_config['numpy_store']['config']['folder'] = args.workspace + '/hdf'
default_config['repo_store']['config']['folder'] = args.workspace + '/objects'
_set_numpy_store_config(args.bigobj_handler, default_config)
tmp = args.obj_handler.split('@')
default_config['repo_store']['type'] = tmp[0]
if tmp[0] == 'git_handler':
    if len(tmp) > 1:
        default_config['repo_store']['config']['remote'] = tmp[1]
#print('Creating repo with config: ' + str(default_config))
ml_repo = MLRepo(config=default_config, save_config=True)
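# The split('@') above implies a 'type@remote' convention for the obj_handler
# argument: a hypothetical value such as 'git_handler@https://example.com/repo.git'
# (illustration only, not from the source) selects the git handler and installs
# the part after '@' as its remote.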
def test_integration(self):
    ml_repo = MLRepo(user='******')
    x = np.zeros([10, 1])
    y = np.zeros([10])
    # define eval and train
    ml_repo.add_eval_function(eval, repo_name='my_eval_func')
    ml_repo.add_training_function(train, repo_name='my_training_func')
    # end define eval and train
    # define add training parameter
    training_param = SuperMLTrainingParam()
    training_param.median = True
    ml_repo.add(
        training_param,
        message='my first training parameter for my own super ml algorithm')
    # end define add training parameter
    # add own model
    ml_repo.add_model('my_model')
    # end add own model
    ml_repo.run_training()
class RepoTest(unittest.TestCase):
    def _setup_measure_config(self):
        """Add a measure configuration with two measures (both MAX) where one
        measure is restricted to the coordinate y0
        """
        measure_config = repo_objects.MeasureConfiguration(
            [(repo_objects.MeasureConfiguration.MAX, ['y0']),
             repo_objects.MeasureConfiguration.MAX],
            repo_info={RepoInfoKey.NAME.value: 'measure_config'})
        self.repository.add(measure_config,
                            category=MLObjectType.MEASURE_CONFIGURATION,
                            message='adding measure configuration')

    def _add_calibrated_model(self):
        self.repository.run_training()
        self.repository.set_label('prod')

    def setUp(self):
        '''Setup a complete ML repo with two different test data objects,
        training data, model definition etc.
        '''
        self.repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(self.repository)
        self.repository._job_runner = job_runner
        #### Setup dummy RawData
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'], np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_1'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'], np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_2'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'], np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_3'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        ## Setup dummy Test and Training DataSets on RawData
        training_data = DataSet('raw_1', 0, None,
                                repo_info={
                                    repo_objects.RepoInfoKey.NAME.value: 'training_data_1',
                                    repo_objects.RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA
                                })
        test_data_1 = DataSet('raw_2', 0, None,
                              repo_info={
                                  repo_objects.RepoInfoKey.NAME.value: 'test_data_1',
                                  repo_objects.RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                              })
        test_data_2 = DataSet('raw_3', 0, 2,
                              repo_info={
                                  repo_objects.RepoInfoKey.NAME.value: 'test_data_2',
                                  repo_objects.RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                              })
        self.repository.add([training_data, test_data_1, test_data_2])
        ## setup dummy preprocessor
        self.repository.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test, repo_name='transform_func')
        self.repository.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        self.repository.add_preprocessor('test_preprocessor_with_fitting',
                                         'transform_func', 'fit_func',
                                         preprocessor_param=None)
        self.repository.add_eval_function(eval_func_test, 'eval_func')
        self.repository.add_training_function(train_func_test, 'train_func')
        self.repository.add(
            TestClass(
                1, 2,
                repo_info={
                    repo_objects.RepoInfoKey.NAME.value: 'training_param',  # pylint: disable=E1123
                    repo_objects.RepoInfoKey.CATEGORY: MLObjectType.TRAINING_PARAM
                }))
        ## setup dummy model definition
        self.repository.add_model(
            'model', 'eval_func', 'train_func',
            preprocessors=['test_preprocessor_with_fitting'])
        # setup measure configuration
        self._setup_measure_config()
        # add dummy calibrated model
        self._add_calibrated_model()

    def test_adding_training_data_exception(self):
        '''Tests if adding new training data leads to an exception
        '''
        with self.assertRaises(Exception):
            test_obj = DataSet('raw_data',
                               repo_info={
                                   repo_objects.RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA.value,
                                   'name': 'test_object'
                               })
            self.repository.add(test_obj)

    def test_commit_increase_update(self):
        '''Check if updating an object in repository increases commit but does not change mapping
        '''
        obj = self.repository.get('raw_1')
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get('repo_mapping').repo_info[RepoInfoKey.VERSION]
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get('repo_mapping').repo_info[RepoInfoKey.VERSION]
        self.assertEqual(old_num_commits + 1, new_num_commits)
        self.assertEqual(old_version_mapping, new_version_mapping)

    def test_commit_increase_add(self):
        '''Check if adding a new object in repository increases commit and does also change the mapping
        '''
        obj = DataSet('raw_data_1', 0, None,
                      repo_info={
                          RepoInfoKey.NAME.value: 'test...',
                          RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                      })
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get('repo_mapping').repo_info.version
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get('repo_mapping').repo_info.version
        self.assertEqual(old_num_commits + 1, new_num_commits)
        commits = self.repository.get_commits()

    def test_DataSet_get(self):
        '''Test if getting a DataSet does include all information from the underlying RawData (excluding numpy data)
        '''
        obj = self.repository.get('test_data_1')
        raw_obj = self.repository.get(obj.raw_data)
        for i in range(len(raw_obj.x_coord_names)):
            self.assertEqual(raw_obj.x_coord_names[i], obj.x_coord_names[i])
        for i in range(len(raw_obj.y_coord_names)):
            self.assertEqual(raw_obj.y_coord_names[i], obj.y_coord_names[i])

    def test_DataSet_get_full(self):
        '''Test if getting a DataSet does include all information from the underlying RawData (including numpy data)
        '''
        obj = self.repository.get('test_data_1',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        raw_obj = self.repository.get(obj.raw_data,
                                      version=repo_store.RepoStore.LAST_VERSION,
                                      full_object=True)
        for i in range(len(raw_obj.x_coord_names)):
            self.assertEqual(raw_obj.x_coord_names[i], obj.x_coord_names[i])
        for i in range(len(raw_obj.y_coord_names)):
            self.assertEqual(raw_obj.y_coord_names[i], obj.y_coord_names[i])
        self.assertEqual(raw_obj.x_data.shape[0], obj.x_data.shape[0])
        obj = self.repository.get('test_data_2',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        self.assertEqual(obj.x_data.shape[0], 2)
        obj = self.repository.get('training_data_1',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        self.assertEqual(obj.x_data.shape[0], 10)

    def test_repo_RawData(self):
        """Test RawData within repo
        """
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['test_coord'],
            repo_info={  # pylint: disable=E0602
                repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'
            })
        repository.add(raw_data, 'test commit', MLObjectType.RAW_DATA)
        raw_data_2 = repository.get('RawData_Test')
        self.assertEqual(len(raw_data.x_coord_names), len(raw_data_2.x_coord_names))
        self.assertEqual(raw_data.x_coord_names[0], raw_data_2.x_coord_names[0])
        commits = repository.get_commits()
        self.assertEqual(len(commits), 1)
        self.assertEqual(len(commits[0].objects), 1)

    def test_add_model_defaults(self):
        """test add_model using defaults to check whether default logic applies correctly
        """
        model_param = TestClass(3, 4,
                                repo_info={
                                    RepoInfoKey.NAME.value: 'model_param',
                                    RepoInfoKey.CATEGORY: MLObjectType.MODEL_PARAM.value
                                })  # pylint: disable=E1123
        self.repository.add(model_param)
        training_param = TestClass(3, 4,
                                   repo_info={
                                       RepoInfoKey.NAME.value: 'training_param',
                                       RepoInfoKey.CATEGORY: MLObjectType.TRAINING_PARAM.value
                                   })  # pylint: disable=E1123
        self.repository.add(training_param)
        self.repository.add_model('model1')
        model = self.repository.get('model1')
        self.assertEqual(model.eval_function, 'eval_func')
        self.assertEqual(model.training_function, 'train_func')
        self.assertEqual(model.training_param, 'training_param')
        self.assertEqual(model.model_param, 'model_param')

    def test_get_history(self):
        training_data_history = self.repository.get_history('training_data_1')
        self.assertEqual(len(training_data_history), 1)
        training_data = self.repository.get('training_data_1')
        self.repository.add(training_data)
        training_data_history = self.repository.get_history('training_data_1')
        self.assertEqual(len(training_data_history), 2)

    def test_run_eval_defaults(self):
        '''Test running evaluation with default arguments
        '''
        self.repository.run_evaluation()

    def test_run_train_defaults(self):
        '''Test running training with default arguments
        '''
        self.repository.run_training()

    def test_run_measure_defaults(self):
        # run the evaluation first so that there is at least one evaluation
        self.repository.run_evaluation()
        self.repository.run_measures()

    def test_repo_training_test_data(self):
        # init repository with sample in memory handler
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        training_data = RawData(
            np.zeros([10, 1]), ['x_values'], np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'training_data'})
        repository.add(training_data, category=MLObjectType.TRAINING_DATA)
        training_data_2 = repository.get_training_data()
        self.assertEqual(
            training_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            training_data.repo_info[repo_objects.RepoInfoKey.NAME])
        test_data = RawData(
            np.zeros([10, 1]), ['x_values'], np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data'})
        repository.add(test_data, category=MLObjectType.TEST_DATA)
        test_data_ref = repository.get('test_data')
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data.repo_info[repo_objects.RepoInfoKey.NAME])
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.VERSION],
            test_data.repo_info[repo_objects.RepoInfoKey.VERSION])
        test_data_2 = RawData(
            np.zeros([10, 1]), ['x_values'], np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data_2'})
        repository.add(test_data_2, category=MLObjectType.TEST_DATA)
        test_data_2_ref = repository.get('test_data_2')
        self.assertEqual(
            test_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data_2_ref.repo_info[repo_objects.RepoInfoKey.NAME])
        commits = repository.get_commits()
        self.assertEqual(len(commits), 3)
        self.assertEqual(commits[1].objects['test_data'], test_data.repo_info.version)
        #self.assertEqual(commits[1].objects['repo_mapping'], 1)
        self.assertEqual(commits[2].objects['test_data_2'], test_data_2.repo_info.version)
        #self.assertEqual(commits[2].objects['repo_mapping'], 2)

    def test_repo_RegressionTest(self):
        regression_test_def = ml_tests.RegressionTestDefinition(
            repo_info={
                RepoInfoKey.NAME: 'regression_test',
                RepoInfoKey.CATEGORY: MLObjectType.TEST_DEFINITION.name
            })
        tests = regression_test_def.create(self.repository)
        self.assertEqual(len(tests), 3)
        self.repository.add(regression_test_def)
        self.repository.run_evaluation()
        self.repository.run_measures()
        self.repository.run_tests()

    def test_add_multiple(self):
        """Test adding multiple objects at once
        """
        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        self.repository.add([obj1, obj2], category=MLObjectType.CALIBRATED_MODEL)
        new_obj1 = self.repository.get('obj1')
        self.assertEqual(new_obj1.repo_info.name, 'obj1')
        new_obj2 = self.repository.get('obj2')
        self.assertEqual(new_obj2.repo_info.name, 'obj2')

    def test_delete(self):
        """Test if deletion works and if it considers if there are dependencies to other objects
        """
        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        obj2.repo_info.modification_info = {'obj1': v1}
        v2 = self.repository.add(obj2, category=MLObjectType.CALIBRATED_MODEL)
        # deleting obj1 must fail while obj2 still has a dependency on obj1
        with self.assertRaises(Exception):
            self.repository.delete('obj1', v1)
        # now first delete obj2
        self.repository.delete('obj2', v2)
        # check that obj2 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj2')
        # now, deletion of obj1 should work; an exception here fails the test
        self.repository.delete('obj1', v1)
        # check that obj1 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj1')
def test_tutorial(self):
    # cleanup disk before running
    repo_path = './tmp_tutorial'
    try:
        shutil.rmtree(repo_path)
    except OSError:
        pass
    # creating in memory storage
    ml_repo = MLRepo(user='******')
    # end creating in memory storage
    # creating new repository
    config = {
        'user': '******',
        'workspace': repo_path,
        'repo_store': {
            'type': 'disk_handler',
            'config': {
                'folder': repo_path,
                'file_format': 'pickle'
            }
        },
        'numpy_store': {
            'type': 'hdf_handler',
            'config': {
                'folder': repo_path,
                'version_files': True
            }
        },
        'job_runner': {
            'type': 'simple',
            'config': {
                'throw_job_error': True
            }
        }
    }
    ml_repo = MLRepo(user='******', config=config)
    # end creating new repository
    # specifying job runner
    job_runner = SimpleJobRunner(None)
    job_runner.set_repo(ml_repo)
    ml_repo._job_runner = job_runner
    # end specifying job runner
    job_runner._throw_job_error = True

    from pailab.tools.tree import MLTree
    MLTree.add_tree(ml_repo)

    # A convenient way to add RawData is simply to use the method add on the
    # raw_data collection. This method just takes a pandas dataframe and a
    # specification of which columns belong to the inputs and which to the targets.
    try:
        # read pandas
        import pandas as pd
        data = pd.read_csv('./examples/boston_housing/housing.csv')
        # end read pandas
    except:
        data = pd.read_csv('../examples/boston_housing/housing.csv')
    # extract data
    input_variables = ['RM', 'LSTAT', 'PTRATIO']
    target_variables = ['MEDV']
    x = data.loc[:, input_variables].values
    y = data.loc[:, target_variables].values
    # end extract data

    # add RawData snippet
    from pailab import RawData, RepoInfoKey
    raw_data = RawData(x, input_variables, y, target_variables,
                       repo_info={RepoInfoKey.NAME: 'raw_data/boston_housing'})
    ml_repo.add(raw_data)
    # end adding RawData snippet
    # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
    #     'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])

    # add DataSet
    # create DataSet objects for training and test data
    training_data = DataSet('raw_data/boston_housing', 0, 300,
                            repo_info={RepoInfoKey.NAME: 'training_data',
                                       RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
    test_data = DataSet('raw_data/boston_housing', 301, None,
                        repo_info={RepoInfoKey.NAME: 'test_data',
                                   RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
    # add the objects to the repository
    version_list = ml_repo.add(
        [training_data, test_data], message='add training and test data')
    # end adding DataSet

    # add model
    import pailab.externals.sklearn_interface as sklearn_interface
    from sklearn.tree import DecisionTreeRegressor
    sklearn_interface.add_model(
        ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
    # end adding model

    # run training
    job_id = ml_repo.run_training()
    # end running training

    # run evaluation
    job_id = ml_repo.run_evaluation()
    # end running evaluation

    # add measures snippet
    ml_repo.add_measure(MeasureConfiguration.MAX)
    ml_repo.add_measure(MeasureConfiguration.R2)
    # end add measure snippet

    # run measures snippet
    job_ids = ml_repo.run_measures()
    # end run measures snippet
    print(ml_repo.get_names(MLObjectType.MEASURE))

    # get measures
    max_measure = ml_repo.get(
        'DecisionTreeRegressor/measure/training_data/max')
    print(str(max_measure.value))
    # end getting measures

    # label snippet
    from pailab import LAST_VERSION
    ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                      model_version=LAST_VERSION,
                      message='we found our first production model')
    # end label snippet

    # test definition snippet
    import pailab.tools.tests
    reg_test = pailab.tools.tests.RegressionTestDefinition(
        reference='prod', models=None, data=None, labels=None,
        measures=[MeasureConfiguration.MAX], tol=1000)
    reg_test.repo_info.name = 'reg_test'
    ml_repo.add(reg_test, message='regression test definition')
    # end test definition snippet

    # add test snippet
    tests = ml_repo.run_tests()
    # end add test snippet
    print(tests)

    # run check snippet
    import pailab.tools.checker as checker
    inconsistencies = checker.run(ml_repo)
    # end run check snippet
    print(inconsistencies)

    # add inconsistency snippet
    param = ml_repo.get('DecisionTreeRegressor/model_param')
    param.sklearn_params['max_depth'] = 2
    version = ml_repo.add(param)
    # end add inconsistency snippet
    inconsistencies = checker.run(ml_repo)
    print(inconsistencies)
    ml_repo.run_training()
    inconsistencies = checker.run(ml_repo)
    print(inconsistencies)
    ml_repo.run_evaluation(run_descendants=True)
    print(checker.run(ml_repo))

    # add second test data snippet
    test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                          repo_info={RepoInfoKey.NAME: 'test_data_2',
                                     RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
    ml_repo.add(test_data_2)
    ml_repo.run_evaluation(run_descendants=True)
    # end add second test data snippet
    print(checker.Tests.run(ml_repo))
    ml_repo.run_tests()
    # check tests
    print(checker.Tests.run(ml_repo))
    # end check tests

    # cleanup after running
    # job_runner.close_connection()
    ml_repo._ml_repo.close_connection()
    try:
        shutil.rmtree(repo_path)
    except OSError:
        pass