Esempio n. 1
0
 def _get_data(self, ml_repo: MLRepo):
     """Return the data names this object refers to.

     If ``self.data`` is set, that very object is returned unchanged.
     Otherwise a new list is built from the names ``ml_repo`` reports for
     the test data category followed by those of the training data category.

     Args:
         ml_repo (MLRepo): repository queried via ``get_names``.

     Returns:
         ``self.data`` if it is not None, else a list of names.
     """
     if self.data is not None:
         return self.data
     names = []
     # Test data first, then training data.
     for category in (MLObjectType.TEST_DATA, MLObjectType.TRAINING_DATA):
         names.extend(ml_repo.get_names(category))
     return names
Esempio n. 2
0
 def _get_models(self, ml_repo: MLRepo):
     """Collect the versions to be considered, grouped by name.

     If ``self.models`` is None, every name in the calibrated-model category
     of ``ml_repo`` is looked up (without loading the full object) and the
     version stored in its ``repo_info`` is recorded. Otherwise the
     name -> version pairs of ``self.models`` are used.

     Afterwards each label (all labels of the repo if ``self.labels`` is
     None, else ``self.labels``) is fetched and its ``name`` / ``version``
     attributes are added as well.

     Args:
         ml_repo (MLRepo): repository queried via ``get_names`` and ``get``.

     Returns:
         defaultdict(set): mapping from name to the set of versions.
     """
     versions_by_name = defaultdict(set)

     if self.models is not None:
         for model_name, model_version in self.models.items():
             versions_by_name[model_name].add(model_version)
     else:
         for model_name in ml_repo.get_names(MLObjectType.CALIBRATED_MODEL):
             model = ml_repo.get(model_name, full_object=False)
             versions_by_name[model_name].add(
                 model.repo_info[RepoInfoKey.VERSION])

     label_names = self.labels
     if label_names is None:
         label_names = ml_repo.get_names(MLObjectType.LABEL)

     for label_name in label_names:
         label = ml_repo.get(label_name)
         # presumably name/version identify the model the label points to
         versions_by_name[label.name].add(label.version)

     return versions_by_name
Esempio n. 3
0
 def _get_measure_types(self, ml_repo: MLRepo, reg_test=None):
     """Determine the measure types used by the regression test.

     Args:
         ml_repo (MLRepo): repository queried via ``get`` and ``get_names``.
         reg_test: test definition to inspect; if None, the latest version
             of ``self.test_definition`` is fetched from ``ml_repo``.

     Returns:
         ``reg_test.measures`` if it is not None; otherwise a list with the
         measure name of every entry of the first measure configuration
         found in the repository (latest version).

     Raises:
         Exception: if the test definition has no measures and the
             repository contains no measure configuration.
     """
     if reg_test is None:
         reg_test = ml_repo.get(self.test_definition, version=LAST_VERSION)

     configured = reg_test.measures
     if configured is not None:
         return configured

     # Fall back to the measures of the repository's measure configuration.
     config_names = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
     if len(config_names) == 0:
         raise Exception(
             'No regression test possible since no measure defined.')
     measure_config = ml_repo.get(config_names[0], version=LAST_VERSION)

     derived = []
     for _, measure in measure_config.measures.items():
         derived.append(MeasureConfiguration.get_name(measure))
     return derived
Esempio n. 4
0
 def _check(self, ml_repo: MLRepo):
     """Check whether this test still matches the current repository state.

     The versions recorded in ``self.repo_info.modification_info`` are
     compared, in this order, with the latest test definition, the latest
     measure configuration (only if the test definition defines no measures
     itself), the latest version of the reference label and the data version
     the test refers to.

     Args:
         ml_repo (MLRepo): repository the latest versions are read from.

     Returns:
         str or None: description of the first inconsistency found, or None
         if no inconsistency was found.

     Raises:
         Exception: if the test definition has no measures and the
             repository contains no measure configuration.
     """
     # All comparisons AND all messages use this one mapping. The messages
     # previously read ``self.modification_info``, which differs from the
     # mapping used in the comparisons.
     used_versions = self.repo_info.modification_info

     # check if test is based on latest test definition
     regression_test = ml_repo.get(self.test_definition,
                                   version=LAST_VERSION)
     latest_definition = regression_test.repo_info.version
     if latest_definition != used_versions[self.test_definition]:
         return ('Test is not based on latest test definition, latest version: '
                 + str(latest_definition)
                 + ', version used for test: '
                 + str(used_versions[self.test_definition]))

     # check if measure config did not change
     if regression_test.measures is None:
         tmp = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
         if len(tmp) == 0:
             raise Exception('No check possible since no measure defined.')
         m_config = ml_repo.get(tmp[0], version=LAST_VERSION)
         config_name = m_config.repo_info.name
         if m_config.repo_info.version != used_versions[config_name]:
             return ('Test is not based on latest measure configuration, latest version: '
                     + str(m_config.repo_info.version)
                     + ', version used for test: '
                     + str(used_versions[config_name]))

     # check if ref model did not change
     label = ml_repo.get(regression_test.reference, version=LAST_VERSION)
     if label.repo_info.name not in used_versions:
         return 'Test on different reference model.'
     if label.repo_info.version != used_versions[label.repo_info.name]:
         return 'Test on old reference model.'

     # check if test was on latest data version
     if self.data not in used_versions:
         return 'Data of test has changed since last test.'
     version = self.data_version
     # Resolve the symbolic version markers to concrete versions.
     if version == LAST_VERSION:
         version = ml_repo._ml_repo.get_latest_version(self.data)
     elif version == FIRST_VERSION:
         version = ml_repo._ml_repo.get_first_version(self.data)
     if version != used_versions[self.data]:
         return 'Data of test has changed since last test.'
     return None
Esempio n. 5
0
    def test_tutorial(self):
        """Run the tutorial workflow end to end against a disk-based repo.

        Steps: create a repository in ``./tmp_tutorial``, add raw data and
        training/test data sets, add a model, run training, evaluation and
        measures, set a label, define and run a regression test, and run the
        consistency checker after several modifications. The temporary
        repository folder is removed before and after the run.

        NOTE(review): the ``# ... snippet`` / ``# end ...`` comment pairs look
        like markers for extracting documentation snippets; they are kept
        unchanged. Confirm before editing code between them.
        """
        # cleanup disk before running
        repo_path = './tmp_tutorial'
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass

        # creating in memory storage
        ml_repo = MLRepo(user='******')
        # end creating in memory storage

        # creating new repository
        config = {'user': '******',
                  'workspace': repo_path,
                  'repo_store':
                  {
                      'type': 'disk_handler',
                      'config': {
                          'folder': repo_path,
                          'file_format': 'pickle'
                      }
                  },
                  'numpy_store':
                  {
                      'type': 'hdf_handler',
                      'config': {
                          'folder': repo_path,
                          'version_files': True
                      }
                  },
                  'job_runner':
                  {
                      'type': 'simple',
                      'config': {
                          'throw_job_error': True
                      }
                  }
                  }
        ml_repo = MLRepo(user='******', config=config)
        # end creating new repository
        # specifying job runner
        job_runner = SimpleJobRunner(None)
        job_runner.set_repo(ml_repo)
        ml_repo._job_runner = job_runner
        # end specifying job runner
        job_runner._throw_job_error = True

        from pailab.tools.tree import MLTree
        MLTree.add_tree(ml_repo)

        # A convenient way to add RawData is simply to use the method add on the raw_data collection.
        # This method just takes a pandas dataframe and the specification, which columns belong to the input
        # and which to the targets.

        try:
            # read pandas
            import pandas as pd
            data = pd.read_csv('./examples/boston_housing/housing.csv')
            # end read pandas
        except OSError:
            # Only a missing/unreadable file triggers the fallback path used
            # when the test runs from a subdirectory; other errors (e.g. a
            # failed pandas import) must surface instead of being swallowed.
            data = pd.read_csv('../examples/boston_housing/housing.csv')

        # extract data
        input_variables = ['RM', 'LSTAT', 'PTRATIO']
        target_variables = ['MEDV']
        x = data.loc[:, input_variables].values
        y = data.loc[:, target_variables].values
        # end extract data

        # add RawData snippet
        from pailab import RawData, RepoInfoKey

        raw_data = RawData(x, input_variables, y, target_variables, repo_info={
                           RepoInfoKey.NAME: 'raw_data/boston_housing'})
        ml_repo.add(raw_data)

        # end adding RawData snippet
        # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
        #    'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])

        # add DataSet
        # create DataSet objects for training and test data
        training_data = DataSet('raw_data/boston_housing', 0, 300,
                                repo_info={RepoInfoKey.NAME: 'training_data', RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
        test_data = DataSet('raw_data/boston_housing', 301, None,
                            repo_info={RepoInfoKey.NAME: 'test_data',  RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
        # add the objects to the repository
        version_list = ml_repo.add(
            [training_data, test_data], message='add training and test data')
        # end adding DataSet

        # add model
        import pailab.externals.sklearn_interface as sklearn_interface
        from sklearn.tree import DecisionTreeRegressor
        sklearn_interface.add_model(
            ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
        # end adding model

        # run training
        job_id = ml_repo.run_training()
        # end running training

        # run evaluation
        job_id = ml_repo.run_evaluation()
        # end running evaluation

        # add measures snippet
        ml_repo.add_measure(MeasureConfiguration.MAX)
        ml_repo.add_measure(MeasureConfiguration.R2)
        # end add measure snippet

        # run measures snippet
        job_ids = ml_repo.run_measures()
        # end run measures snippet

        print(ml_repo.get_names(MLObjectType.MEASURE))

        # get measures
        max_measure = ml_repo.get(
            'DecisionTreeRegressor/measure/training_data/max')
        print(str(max_measure.value))
        # end getting measures

        # label snippet
        from pailab import LAST_VERSION
        ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                          model_version=LAST_VERSION, message='we found our first production model')
        # end label snippet

        # test definition snippet
        import pailab.tools.tests
        reg_test = pailab.tools.tests.RegressionTestDefinition(
            reference='prod', models=None, data=None, labels=None,
            measures=[MeasureConfiguration.MAX],  tol=1000)
        reg_test.repo_info.name = 'reg_test'
        ml_repo.add(reg_test, message='regression test definition')
        # end test definition snippet

        # add test snippet
        tests = ml_repo.run_tests()
        # end add test snippet
        print(tests)

        # run check snippet
        import pailab.tools.checker as checker
        inconsistencies = checker.run(ml_repo)
        # end run check snippet

        print(inconsistencies)

        # add inconsistency snippet
        param = ml_repo.get('DecisionTreeRegressor/model_param')
        param.sklearn_params['max_depth'] = 2
        version = ml_repo.add(param)
        # end add inconsistency snippet

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_training()

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_evaluation(run_descendants=True)

        print(checker.run(ml_repo))

        # add second test data snippet
        test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                              repo_info={RepoInfoKey.NAME: 'test_data_2',
                                         RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA}
                              )
        ml_repo.add(test_data_2)
        ml_repo.run_evaluation(run_descendants=True)
        # end add second test data snippet

        print(checker.Tests.run(ml_repo))

        ml_repo.run_tests()

        # check tests
        print(checker.Tests.run(ml_repo))
        # end check tests

        # cleanup after running
        # job_runner.close_connection()
        ml_repo._ml_repo.close_connection()
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass