Ejemplo n.º 1
0
 def _get_data(self, ml_repo: MLRepo):
     data = []
     if self.data is None:
         data.extend(ml_repo.get_names(MLObjectType.TEST_DATA))
         data.extend(ml_repo.get_names(MLObjectType.TRAINING_DATA))
     else:
         data = self.data
     return data
Ejemplo n.º 2
0
 def _run_test(self, ml_repo: MLRepo, jobid):
     """Compare the measures of the tested model with those of the reference.

     For every measure type of the test definition the measure of
     ``self.model`` and the measure of the model behind the definition's
     reference label are fetched for ``self.data``.

     Args:
         ml_repo: repository the definition, label and measures are read from.
         jobid: not used by this method.

     Returns:
         dict mapping a measure type to ``{'reference_value', 'value'}`` for
         each type where ``value - reference_value`` lies below the tolerance
         (scaled by the reference value if the definition is relative).
     """
     message = 'Running regression test ' + self.repo_info.name
     message += ' on model ' + str(
         NamingConventions.CalibratedModel(self.model))
     message += ', version ' + self.model_version
     logger.debug(message)

     test_def = ml_repo.get(self.test_definition, version=LAST_VERSION)
     reference_label = ml_repo.get(test_def.reference, version=LAST_VERSION)

     def measure_name_for(model_name, measure_type):
         # Only the part of the model name before the first '/' is used.
         return str(
             NamingConventions.Measure({
                 'model': model_name.split('/')[0],
                 'data': self.data,
                 'measure_type': measure_type
             }))

     findings = {}
     for measure_type in self._get_measure_types(ml_repo, test_def):
         measure = ml_repo.get(
             measure_name_for(self.model, measure_type),
             version=None,
             modifier_versions={
                 str(NamingConventions.CalibratedModel(self.model)):
                 self.model_version,
                 self.data: self.data_version
             },
             throw_error_not_exist=False,
             throw_error_not_unique=True)
         # presumably [] signals "no such measure" because
         # throw_error_not_exist is False -- TODO confirm
         if measure == []:
             continue
         reference = ml_repo.get(
             measure_name_for(reference_label.name, measure_type),
             version=None,
             modifier_versions={
                 str(NamingConventions.CalibratedModel(reference_label.name)):
                 reference_label.version,
                 self.data: self.data_version
             },
             adjust_modification_info=False)
         if test_def.relative:
             threshold = test_def.tol * reference.value
         else:
             threshold = test_def.tol
         if measure.value - reference.value < threshold:
             findings[measure_type] = {
                 'reference_value': reference.value,
                 'value': measure.value
             }
     return findings
Ejemplo n.º 3
0
 def test_regression_test(self):
     """Test the regression test framework

     Adds a calibrated model to a fresh repo and creates a measure whose
     modification info refers to the version of that model.
     """
     repo = MLRepo(user='******')
     model_1 = TestClass(1.0, 2.0, repo_info={'name': 'model'})
     model_1.repo_info.category = MLObjectType.CALIBRATED_MODEL.value
     model_version = repo.add(model_1)
     measure_1 = repo_objects.Measure(
         1.0,
         repo_info={
             'name': 'model/measure/test_data/max',
             # modification_info maps an object name to the version used;
             # a bare {'model'} would be a set and cannot be indexed by name.
             'modification_info': {'model': model_version}
         })
Ejemplo n.º 4
0
 def _get_measure_types(self, ml_repo: MLRepo, reg_test=None):
     if reg_test is None:
         reg_test = ml_repo.get(self.test_definition, version=LAST_VERSION)
     measure_types = reg_test.measures
     if measure_types is None:
         tmp = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
         if len(tmp) == 0:
             raise Exception(
                 'No regression test possible since no measure defined.')
         m_config = ml_repo.get(tmp[0], version=LAST_VERSION)
         measure_types = [
             MeasureConfiguration.get_name(x)
             for k, x in m_config.measures.items()
         ]
     return measure_types
Ejemplo n.º 5
0
 def _get_models(self, ml_repo: MLRepo):
     models_test = defaultdict(set)
     if self.models is None:
         tmp = ml_repo.get_names(MLObjectType.CALIBRATED_MODEL)
         for k in tmp:
             m = ml_repo.get(k, full_object=False)
             models_test[k].add(m.repo_info[RepoInfoKey.VERSION])
     else:
         for k, v in self.models.items():
             models_test[k].add(v)
     if self.labels is None:
         labels = ml_repo.get_names(MLObjectType.LABEL)
     else:
         labels = self.labels
     for l in labels:
         tmp = ml_repo.get(l)
         models_test[tmp.name].add(tmp.version)
     return models_test
Ejemplo n.º 6
0
 def test_default_constructor(self):
     """Check which storages and job runner a default MLRepo is built with."""
     #example with default
     ml_repo = MLRepo(user='******')
     #end example with default
     # If one of these tests fails because the logic has been modified, please update the documentation in basics.rst
     self.assertIsInstance(ml_repo._ml_repo,
                           memory_handler.RepoObjectMemoryStorage)
     self.assertIsInstance(ml_repo._numpy_repo,
                           memory_handler.NumpyMemoryStorage)
     self.assertIsInstance(ml_repo._job_runner, SimpleJobRunner)
Ejemplo n.º 7
0
    def test_config_disk_handler(self):
        """Construct MLRepo instances from a disk-based configuration.

        The repo is created three times: from the config dict, from the
        config dict with ``save_config=True`` and from the workspace alone.

        NOTE(review): the marker comments (``#diskhandlerconfig`` ...) look
        like snippet delimiters for external documentation -- keep them and
        the code between them unchanged unless the docs are updated as well.
        """
        #diskhandlerconfig
        config = {
            'user': '******',
            'workspace': 'tmp',
            'repo_store': {
                'type': 'disk_handler',
                'config': {
                    'folder': 'tmp/objects',
                    'file_format': 'json'
                }
            },
            'numpy_store': {
                'type': 'hdf_handler',
                'config': {
                    'folder': 'tmp/repo_data',
                    'version_files': True
                }
            },
            'job_runner': {
                'type': 'simple',
                'config': {}
            }
        }
        # end diskhandlerconfig

        # instantiate diskhandler
        ml_repo = MLRepo(config=config)
        # end instantiate diskhandler

        # instantiate diskhandler save config
        ml_repo = MLRepo(config=config, save_config=True)
        # end instantiate diskhandler save config

        # instantiate with workspace
        ml_repo = MLRepo(workspace='tmp')
Ejemplo n.º 8
0
 def _check(self, ml_repo: MLRepo):
     """Check whether the stored result of this test is still up to date.

     The versions recorded in ``self.repo_info.modification_info`` are
     compared with the latest test definition, the measure configuration
     (only if the definition lists no measures itself), the reference label
     and the data the test ran on.

     Args:
         ml_repo: repository the current versions are read from.

     Returns:
         None if nothing has changed, otherwise a string describing the
         first mismatch found.

     Raises:
         Exception: if the definition lists no measures and the repo
             contains no measure configuration.
     """
     # All recorded versions live on repo_info; the messages below used to
     # read self.modification_info, which is not the attribute used anywhere
     # else in this method.
     used_versions = self.repo_info.modification_info

     # check if test is based on latest test definition
     regression_test = ml_repo.get(self.test_definition,
                                   version=LAST_VERSION)
     if regression_test.repo_info.version != used_versions[
             self.test_definition]:
         return ('Test is not based on latest test definition, latest version: '
                 + regression_test.repo_info.version +
                 ', version used for test: ' +
                 used_versions[self.test_definition])

     # check if measure config did not change
     if regression_test.measures is None:
         config_names = ml_repo.get_names(MLObjectType.MEASURE_CONFIGURATION)
         if len(config_names) == 0:
             raise Exception('No check possible since no measure defined.')
         m_config = ml_repo.get(config_names[0], version=LAST_VERSION)
         if m_config.repo_info.version != used_versions[
                 m_config.repo_info.name]:
             return ('Test is not based on latest measure configuration, latest version: '
                     + m_config.repo_info.version +
                     ', version used for test: ' +
                     used_versions[m_config.repo_info.name])

     # check if ref model did not change
     label = ml_repo.get(regression_test.reference, version=LAST_VERSION)
     if label.repo_info.name not in used_versions:
         return 'Test on different reference model.'
     if label.repo_info.version != used_versions[label.repo_info.name]:
         return 'Test on old reference model.'

     # check if test was on latest data version
     if self.data not in used_versions:
         return 'Data of test has changed since last test.'
     version = self.data_version
     # resolve the symbolic versions to concrete ones before comparing
     if version == LAST_VERSION:
         version = ml_repo._ml_repo.get_latest_version(self.data)
     elif version == FIRST_VERSION:
         version = ml_repo._ml_repo.get_first_version(self.data)
     if version != used_versions[self.data]:
         return 'Data of test has changed since last test.'
     return None
Ejemplo n.º 9
0
 def test_repo_RawData(self):
     """Add a RawData object to a fresh repo and read it back.
     """
     repo = MLRepo(user='******')
     repo._job_runner = SimpleJobRunner(repo)
     original = repo_objects.RawData(
         np.zeros([10, 1]), ['test_coord'],
         repo_info={repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'})  # pylint: disable=E0602
     repo.add(original, 'test commit', MLObjectType.RAW_DATA)

     restored = repo.get('RawData_Test')
     self.assertEqual(len(original.x_coord_names),
                      len(restored.x_coord_names))
     self.assertEqual(original.x_coord_names[0], restored.x_coord_names[0])

     # exactly one commit containing exactly one object is expected
     history = repo.get_commits()
     self.assertEqual(len(history), 1)
     self.assertEqual(len(history[0].objects), 1)
Ejemplo n.º 10
0
        'bigobj_handler',
        type=str,
        help='definition of store for the big object parts of the repo objects'
    )
    parser.add_argument(
        '-u',
        '--user',
        type=str,
        help=
        'username used internally, if not specified, the operating system user is used'
    )
    args = parser.parse_args()
    default_config = _get_default_config()
    default_config['workspace'] = args.workspace
    if args.user:
        default_config['user'] = args.user
    else:
        default_config['user'] = getpass.getuser()
    default_config['numpy_store']['config']['folder'] = args.workspace + '/hdf'
    default_config['repo_store']['config'][
        'folder'] = args.workspace + '/objects'
    _set_numpy_store_config(args.bigobj_handler, default_config)
    tmp = args.obj_handler.split('@')
    default_config['repo_store']['type'] = tmp[0]
    if tmp[0] == 'git_handler':
        if len(tmp) > 1:
            default_config['repo_store']['config']['remote'] = tmp[1]

    #print('Creating repo with config: ' + str(default_config))
    ml_repo = MLRepo(config=default_config, save_config=True)
Ejemplo n.º 11
0
        def test_integration(self):
            """Register own eval/training functions and a model, then train.

            NOTE(review): the marker comments look like snippet delimiters
            for external documentation -- keep them in place.
            """
            ml_repo = MLRepo(user='******')
            x = np.zeros([10, 1])
            y = np.zeros([10])
            # define eval and train
            # eval/train were passed to the wrong registration call and both
            # were stored under the same repo name; each function now goes to
            # its matching call under its own name.
            ml_repo.add_eval_function(eval, repo_name='my_eval_func')
            ml_repo.add_training_function(train, repo_name='my_training_func')
            # end define eval and train

            # define add training parameter
            training_param = SuperMLTrainingParam()
            training_param.median = True
            ml_repo.add(
                training_param,
                message=
                'my first training parameter for my own super ml algorithm')
            # end define add training parameter

            # add own model
            ml_repo.add_model('my_model')
            # end add own model
            ml_repo.run_training()
Ejemplo n.º 12
0
    def test_repo_training_test_data(self):
        """Add training and test RawData to a fresh in-memory repo and read them back."""
        name_key = repo_objects.RepoInfoKey.NAME
        version_key = repo_objects.RepoInfoKey.VERSION

        def make_raw_data(name):
            # all objects share the same zero-valued data, only the name differs
            return RawData(
                np.zeros([10, 1]), ['x_values'],
                np.zeros([10, 1]), ['y_values'],
                repo_info={name_key.value: name})

        repo = MLRepo(user='******')
        repo._job_runner = SimpleJobRunner(repo)

        train = make_raw_data('training_data')
        repo.add(train, category=MLObjectType.TRAINING_DATA)
        train_loaded = repo.get_training_data()
        self.assertEqual(train_loaded.repo_info[name_key],
                         train.repo_info[name_key])

        first_test = make_raw_data('test_data')
        repo.add(first_test, category=MLObjectType.TEST_DATA)
        first_loaded = repo.get('test_data')
        self.assertEqual(first_loaded.repo_info[name_key],
                         first_test.repo_info[name_key])
        self.assertEqual(first_loaded.repo_info[version_key],
                         first_test.repo_info[version_key])

        second_test = make_raw_data('test_data_2')
        repo.add(second_test, category=MLObjectType.TEST_DATA)
        second_loaded = repo.get('test_data_2')
        self.assertEqual(second_test.repo_info[name_key],
                         second_loaded.repo_info[name_key])

        # one commit per add; each commit records the version of its object
        history = repo.get_commits()
        self.assertEqual(len(history), 3)
        self.assertEqual(history[1].objects['test_data'],
                         first_test.repo_info.version)
        self.assertEqual(history[2].objects['test_data_2'],
                         second_test.repo_info.version)
Ejemplo n.º 13
0
    def setUp(self):
        '''Set up a complete ML repo with two different test data objects, training data, model definition etc.
        '''
        repo = MLRepo(user='******')
        self.repository = repo
        repo._job_runner = SimpleJobRunner(repo)

        name_key = repo_objects.RepoInfoKey.NAME.value
        category_key = repo_objects.RepoInfoKey.CATEGORY

        # dummy RawData: three identical zero-valued objects
        for raw_name in ('raw_1', 'raw_2', 'raw_3'):
            raw_data = repo_objects.RawData(
                np.zeros([10, 1]), ['x0'],
                np.zeros([10, 1]), ['y0'],
                repo_info={name_key: raw_name})
            repo.add(raw_data, category=MLObjectType.RAW_DATA)

        # dummy training and test DataSets on the RawData; the second and
        # third constructor arguments are presumably start and end index
        dataset_specs = (
            ('raw_1', None, 'training_data_1', MLObjectType.TRAINING_DATA),
            ('raw_2', None, 'test_data_1', MLObjectType.TEST_DATA),
            ('raw_3', 2, 'test_data_2', MLObjectType.TEST_DATA),
        )
        datasets = [
            DataSet(raw_name,
                    0,
                    last,
                    repo_info={
                        name_key: dataset_name,
                        category_key: category
                    })
            for raw_name, last, dataset_name, category in dataset_specs
        ]
        repo.add(datasets)

        # dummy preprocessor
        repo.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test,
            repo_name='transform_func')
        repo.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        repo.add_preprocessor('test_preprocessor_with_fitting',
                              'transform_func',
                              'fit_func',
                              preprocessor_param=None)

        repo.add_eval_function(eval_func_test, 'eval_func')
        repo.add_training_function(train_func_test, 'train_func')
        training_param = TestClass(
            1,
            2,
            repo_info={
                name_key: 'training_param',  # pylint: disable=E1123
                category_key: MLObjectType.TRAINING_PARAM
            })
        repo.add(training_param)

        # dummy model definition
        repo.add_model('model',
                       'eval_func',
                       'train_func',
                       preprocessors=['test_preprocessor_with_fitting'])
        # measure configuration
        self._setup_measure_config()
        # dummy calibrated model
        self._add_calibrated_model()
Ejemplo n.º 14
0
class RepoTest(unittest.TestCase):
    def _setup_measure_config(self):
        """Add a measure configuration with two MAX measures, one of them restricted to the coordinate y0
        """
        max_measure = repo_objects.MeasureConfiguration.MAX
        config = repo_objects.MeasureConfiguration(
            [(max_measure, ['y0']), max_measure],
            repo_info={RepoInfoKey.NAME.value: 'measure_config'})
        self.repository.add(config,
                            category=MLObjectType.MEASURE_CONFIGURATION,
                            message='adding measure configuration')

    def _add_calibrated_model(self):
        """Run a training with default arguments, then set the label 'prod' with default arguments."""
        repo = self.repository
        repo.run_training()
        repo.set_label('prod')

    def setUp(self):
        '''Set up a complete ML repo with two different test data objects, training data, model definition etc.
        '''
        repo = MLRepo(user='******')
        self.repository = repo
        repo._job_runner = SimpleJobRunner(repo)

        name_key = repo_objects.RepoInfoKey.NAME.value
        category_key = repo_objects.RepoInfoKey.CATEGORY

        # dummy RawData: three identical zero-valued objects
        for raw_name in ('raw_1', 'raw_2', 'raw_3'):
            raw_data = repo_objects.RawData(
                np.zeros([10, 1]), ['x0'],
                np.zeros([10, 1]), ['y0'],
                repo_info={name_key: raw_name})
            repo.add(raw_data, category=MLObjectType.RAW_DATA)

        # dummy training and test DataSets on the RawData; the second and
        # third constructor arguments are presumably start and end index
        dataset_specs = (
            ('raw_1', None, 'training_data_1', MLObjectType.TRAINING_DATA),
            ('raw_2', None, 'test_data_1', MLObjectType.TEST_DATA),
            ('raw_3', 2, 'test_data_2', MLObjectType.TEST_DATA),
        )
        datasets = [
            DataSet(raw_name,
                    0,
                    last,
                    repo_info={
                        name_key: dataset_name,
                        category_key: category
                    })
            for raw_name, last, dataset_name, category in dataset_specs
        ]
        repo.add(datasets)

        # dummy preprocessor
        repo.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test,
            repo_name='transform_func')
        repo.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        repo.add_preprocessor('test_preprocessor_with_fitting',
                              'transform_func',
                              'fit_func',
                              preprocessor_param=None)

        repo.add_eval_function(eval_func_test, 'eval_func')
        repo.add_training_function(train_func_test, 'train_func')
        training_param = TestClass(
            1,
            2,
            repo_info={
                name_key: 'training_param',  # pylint: disable=E1123
                category_key: MLObjectType.TRAINING_PARAM
            })
        repo.add(training_param)

        # dummy model definition
        repo.add_model('model',
                       'eval_func',
                       'train_func',
                       preprocessors=['test_preprocessor_with_fitting'])
        # measure configuration
        self._setup_measure_config()
        # dummy calibrated model
        self._add_calibrated_model()

    def test_adding_training_data_exception(self):
        '''Tests if adding new training data leads to an exception

        Both the construction of the DataSet and the add call happen inside
        the assertRaises context.
        '''
        with self.assertRaises(Exception):
            info = {
                repo_objects.RepoInfoKey.CATEGORY:
                MLObjectType.TRAINING_DATA.value,
                'name': 'test_object'
            }
            extra_training_data = DataSet('raw_data', repo_info=info)
            self.repository.add(extra_training_data)

    def test_commit_increase_update(self):
        '''Check if updating an object in repository increases commit but does not change mapping
        '''

        def snapshot():
            # (number of commits, version of the repo mapping)
            num_commits = len(self.repository.get_commits())
            mapping = self.repository.get('repo_mapping')
            return num_commits, mapping.repo_info[RepoInfoKey.VERSION]

        existing = self.repository.get('raw_1')
        commits_before, mapping_before = snapshot()
        self.repository.add(existing)
        commits_after, mapping_after = snapshot()

        self.assertEqual(commits_before + 1, commits_after)
        self.assertEqual(mapping_before, mapping_after)

    def test_commit_increase_add(self):
        '''Check if adding a new object in repository increases commit and does also change the mapping
        '''
        obj = DataSet('raw_data_1',
                      0,
                      None,
                      repo_info={
                          RepoInfoKey.NAME.value: 'test...',
                          RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                      })
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.assertEqual(old_num_commits + 1, new_num_commits)
        # The mapping versions were fetched but never compared, so the second
        # half of the documented check was missing.
        self.assertNotEqual(old_version_mapping, new_version_mapping)

    def test_DataSet_get(self):
        '''Test if getting a DataSet includes the coordinate names of the underlying RawData (excluding numpy data)
        '''
        data_set = self.repository.get('test_data_1')
        raw = self.repository.get(data_set.raw_data)
        for idx, coord_name in enumerate(raw.x_coord_names):
            self.assertEqual(coord_name, data_set.x_coord_names[idx])
        for idx, coord_name in enumerate(raw.y_coord_names):
            self.assertEqual(coord_name, data_set.y_coord_names[idx])

    def test_DataSet_get_full(self):
        '''Test if getting a full DataSet includes the coordinate names and numpy data of the underlying RawData
        '''
        latest = repo_store.RepoStore.LAST_VERSION

        data_set = self.repository.get('test_data_1',
                                       version=latest,
                                       full_object=True)
        raw = self.repository.get(data_set.raw_data,
                                  version=latest,
                                  full_object=True)
        for idx, coord_name in enumerate(raw.x_coord_names):
            self.assertEqual(coord_name, data_set.x_coord_names[idx])
        for idx, coord_name in enumerate(raw.y_coord_names):
            self.assertEqual(coord_name, data_set.y_coord_names[idx])
        self.assertEqual(raw.x_data.shape[0], data_set.x_data.shape[0])

        # test_data_2 is expected to hold 2 rows, training_data_1 10 rows
        restricted = self.repository.get('test_data_2',
                                         version=latest,
                                         full_object=True)
        self.assertEqual(restricted.x_data.shape[0], 2)

        training = self.repository.get('training_data_1',
                                       version=latest,
                                       full_object=True)
        self.assertEqual(training.x_data.shape[0], 10)

    def test_repo_RawData(self):
        """Add a RawData object to a fresh repo and read it back.
        """
        repo = MLRepo(user='******')
        repo._job_runner = SimpleJobRunner(repo)
        original = repo_objects.RawData(
            np.zeros([10, 1]), ['test_coord'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'})  # pylint: disable=E0602
        repo.add(original, 'test commit', MLObjectType.RAW_DATA)

        restored = repo.get('RawData_Test')
        self.assertEqual(len(original.x_coord_names),
                         len(restored.x_coord_names))
        self.assertEqual(original.x_coord_names[0],
                         restored.x_coord_names[0])

        # exactly one commit containing exactly one object is expected
        history = repo.get_commits()
        self.assertEqual(len(history), 1)
        self.assertEqual(len(history[0].objects), 1)

    def test_add_model_defaults(self):
        """Add a model by name only and check which functions and parameters it ends up referring to
        """

        def make_param(name, category):
            return TestClass(3,
                             4,
                             repo_info={
                                 RepoInfoKey.NAME.value: name,
                                 RepoInfoKey.CATEGORY: category.value
                             })  # pylint: disable=E1123

        self.repository.add(make_param('model_param',
                                       MLObjectType.MODEL_PARAM))
        self.repository.add(
            make_param('training_param', MLObjectType.TRAINING_PARAM))
        self.repository.add_model('model1')

        model = self.repository.get('model1')
        expected = (
            ('eval_function', 'eval_func'),
            ('training_function', 'train_func'),
            ('training_param', 'training_param'),
            ('model_param', 'model_param'),
        )
        for attribute, value in expected:
            self.assertEqual(getattr(model, attribute), value)

    def test_get_history(self):
        """Check that re-adding an object extends its history from one to two entries."""
        name = 'training_data_1'
        self.assertEqual(len(self.repository.get_history(name)), 1)
        current = self.repository.get(name)
        self.repository.add(current)
        self.assertEqual(len(self.repository.get_history(name)), 2)

    def test_run_eval_defaults(self):
        '''Run an evaluation with default arguments; passes if no exception is raised
        '''
        repo = self.repository
        repo.run_evaluation()

    def test_run_train_defaults(self):
        '''Run a training with default arguments; passes if no exception is raised
        '''
        repo = self.repository
        repo.run_training()

    def test_run_measure_defaults(self):
        '''Run the measures with default arguments; passes if no exception is raised
        '''
        repo = self.repository
        # run the evaluation first so that there is at least one evaluation
        repo.run_evaluation()
        repo.run_measures()

    def test_repo_training_test_data(self):
        """Add training and test RawData to a fresh in-memory repo and read them back."""
        name_key = repo_objects.RepoInfoKey.NAME
        version_key = repo_objects.RepoInfoKey.VERSION

        def make_raw_data(name):
            # all objects share the same zero-valued data, only the name differs
            return RawData(
                np.zeros([10, 1]), ['x_values'],
                np.zeros([10, 1]), ['y_values'],
                repo_info={name_key.value: name})

        repo = MLRepo(user='******')
        repo._job_runner = SimpleJobRunner(repo)

        train = make_raw_data('training_data')
        repo.add(train, category=MLObjectType.TRAINING_DATA)
        train_loaded = repo.get_training_data()
        self.assertEqual(train_loaded.repo_info[name_key],
                         train.repo_info[name_key])

        first_test = make_raw_data('test_data')
        repo.add(first_test, category=MLObjectType.TEST_DATA)
        first_loaded = repo.get('test_data')
        self.assertEqual(first_loaded.repo_info[name_key],
                         first_test.repo_info[name_key])
        self.assertEqual(first_loaded.repo_info[version_key],
                         first_test.repo_info[version_key])

        second_test = make_raw_data('test_data_2')
        repo.add(second_test, category=MLObjectType.TEST_DATA)
        second_loaded = repo.get('test_data_2')
        self.assertEqual(second_test.repo_info[name_key],
                         second_loaded.repo_info[name_key])

        # one commit per add; each commit records the version of its object
        # (the 'repo_mapping' entries of the commits are not checked here)
        history = repo.get_commits()
        self.assertEqual(len(history), 3)
        self.assertEqual(history[1].objects['test_data'],
                         first_test.repo_info.version)
        self.assertEqual(history[2].objects['test_data_2'],
                         second_test.repo_info.version)

    def test_repo_RegressionTest(self):
        """Create tests from a regression test definition, add it and run evaluation, measures and tests."""
        info = {
            RepoInfoKey.NAME: 'regression_test',
            RepoInfoKey.CATEGORY: MLObjectType.TEST_DEFINITION.name
        }
        definition = ml_tests.RegressionTestDefinition(repo_info=info)
        created_tests = definition.create(self.repository)
        self.assertEqual(len(created_tests), 3)

        repo = self.repository
        repo.add(definition)
        repo.run_evaluation()
        repo.run_measures()
        repo.run_tests()

    def test_add_multiple(self):
        """Test adding multiple objects at once
        """
        repo = self.repository
        category = MLObjectType.CALIBRATED_MODEL

        first = TestClass(5, 4, repo_info={})
        first.repo_info.name = 'obj1'
        repo.add(first, category=category)

        second = TestClass(2, 3, repo_info={})
        second.repo_info.name = 'obj2'
        # obj1 has already been added once; it is added again in one call
        # together with obj2
        repo.add([first, second], category=category)

        # both objects must be retrievable under their own names
        for name in ('obj1', 'obj2'):
            self.assertEqual(repo.get(name).repo_info.name, name)

    def test_delete(self):
        """Test if deletion works and if it considers if there are dependencies to other objects

        obj2 lists obj1 (with its version) in its modification_info, so
        deleting obj1 is expected to raise until obj2 has been deleted.
        """
        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        obj2.repo_info.modification_info = {'obj1': v1}
        v2 = self.repository.add(obj2, category=MLObjectType.CALIBRATED_MODEL)

        # The "must raise" checks use assertRaises rather than
        # try / assertEqual(0, 1) / bare except: a bare except also swallows
        # the AssertionError that signals the failure, so such a check can
        # never fail.
        # NOTE(review): the concrete exception type raised by the repository
        # is not visible here; narrow Exception once it is confirmed.

        # check if an exception is thrown if one tries to delete obj1 although
        # obj2 has a dependency on obj1
        with self.assertRaises(Exception):
            self.repository.delete('obj1', v1)

        # now first delete obj2
        self.repository.delete('obj2', v2)
        # check if obj2 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj2')

        # now, deletion of obj1 should work; an exception here fails the test
        # with its original traceback
        self.repository.delete('obj1', v1)
        # check if obj1 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj1')
Ejemplo n.º 15
0
    def test_tutorial(self):
        """Run the tutorial workflow end to end on a disk-backed repository.

        The repository lives in ./tmp_tutorial, which is removed before and
        after the run. The workflow adds raw data, training and test data
        sets and a DecisionTreeRegressor model, runs training, evaluation and
        measures, sets a label, adds and runs a regression test, and runs the
        consistency checker at several points.

        NOTE(review): the paired ``# <name>`` / ``# end <name>`` comments look
        like snippet delimiters, presumably extracted into the documentation;
        confirm before renaming or moving them.
        """
        # cleanup disk before running
        repo_path = './tmp_tutorial'
        try:
            shutil.rmtree(repo_path)
            # os.path.
        except OSError:
            pass

        # creating in memory storage
        ml_repo = MLRepo(user='******')
        # end creating in memory storage

        # creating new repository
        config = {'user': '******',
                  'workspace': repo_path,
                  'repo_store':
                  {
                      'type': 'disk_handler',
                      'config': {
                          'folder': repo_path,
                          'file_format': 'pickle'
                      }
                  },
                  'numpy_store':
                  {
                      'type': 'hdf_handler',
                      'config': {
                          'folder': repo_path,
                          'version_files': True
                      }
                  },
                  'job_runner':
                  {
                      'type': 'simple',
                      'config': {
                          'throw_job_error': True
                      }
                  }
                  }
        ml_repo = MLRepo(user='******', config=config)
        # end creating new repository
        # specifying job runner
        job_runner = SimpleJobRunner(None)
        job_runner.set_repo(ml_repo)
        ml_repo._job_runner = job_runner
        # end specifying job runner
        job_runner._throw_job_error = True

        from pailab.tools.tree import MLTree
        MLTree.add_tree(ml_repo)

        # A convenient way to add RawData is simply to use the method add on the raw_data collection.
        # This method just takes a pandas dataframe and the specification, which columns belong to the input
        # and which to the targets.

        try:
            # read pandas
            import pandas as pd
            data = pd.read_csv('./examples/boston_housing/housing.csv')
            # end read pandas
        except OSError:
            # Only a missing/unreadable file triggers the fallback path (the
            # test may be started from a subdirectory). A bare except here
            # would also catch a failed pandas import and then fail with a
            # NameError on pd, hiding the real error.
            data = pd.read_csv('../examples/boston_housing/housing.csv')

        # extract data
        input_variables = ['RM', 'LSTAT', 'PTRATIO']
        target_variables = ['MEDV']
        x = data.loc[:, input_variables].values
        y = data.loc[:, target_variables].values
        # end extract data

        # add RawData snippet
        from pailab import RawData, RepoInfoKey

        raw_data = RawData(x, input_variables, y, target_variables, repo_info={
                           RepoInfoKey.NAME: 'raw_data/boston_housing'})
        ml_repo.add(raw_data)

        # end adding RawData snippet
        # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
        #    'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])

        # add DataSet
        # create DataSet objects for training and test data
        training_data = DataSet('raw_data/boston_housing', 0, 300,
                                repo_info={RepoInfoKey.NAME: 'training_data', RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
        test_data = DataSet('raw_data/boston_housing', 301, None,
                            repo_info={RepoInfoKey.NAME: 'test_data',  RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
        # add the objects to the repository
        version_list = ml_repo.add(
            [training_data, test_data], message='add training and test data')
        # end adding DataSet

        # add model
        import pailab.externals.sklearn_interface as sklearn_interface
        from sklearn.tree import DecisionTreeRegressor
        sklearn_interface.add_model(
            ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
        # end adding model

        # run training
        job_id = ml_repo.run_training()
        # end running training

        # run evaluation
        job_id = ml_repo.run_evaluation()
        # end running evaluation

        # add measures snippet
        ml_repo.add_measure(MeasureConfiguration.MAX)
        ml_repo.add_measure(MeasureConfiguration.R2)
        # end add measure snippet

        # run measures snippet
        job_ids = ml_repo.run_measures()
        # end run measures snippet

        print(ml_repo.get_names(MLObjectType.MEASURE))

        # get measures
        max_measure = ml_repo.get(
            'DecisionTreeRegressor/measure/training_data/max')
        print(str(max_measure.value))
        # end getting measures

        # label snippet
        from pailab import LAST_VERSION
        ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                          model_version=LAST_VERSION, message='we found our first production model')
        # end label snippet

        # test definition snippet
        import pailab.tools.tests
        reg_test = pailab.tools.tests.RegressionTestDefinition(
            reference='prod', models=None, data=None, labels=None,
            measures=[MeasureConfiguration.MAX],  tol=1000)
        reg_test.repo_info.name = 'reg_test'
        ml_repo.add(reg_test, message='regression test definition')
        # end test definition snippet

        # add test snippet
        tests = ml_repo.run_tests()
        # end add test snippet
        print(tests)

        # run check snippet
        import pailab.tools.checker as checker
        inconsistencies = checker.run(ml_repo)
        # end run check snippet

        print(inconsistencies)

        # add inconsistency snippet
        param = ml_repo.get('DecisionTreeRegressor/model_param')
        param.sklearn_params['max_depth'] = 2
        version = ml_repo.add(param)
        # end add inconsistency snippet

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_training()

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_evaluation(run_descendants=True)

        print(checker.run(ml_repo))

        # add second test data snippet
        test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                              repo_info={RepoInfoKey.NAME: 'test_data_2',
                                         RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA}
                              )
        ml_repo.add(test_data_2)
        ml_repo.run_evaluation(run_descendants=True)
        # end add second test data snippet

        print(checker.Tests.run(ml_repo))

        ml_repo.run_tests()

        # check tests
        print(checker.Tests.run(ml_repo))
        # end check tests

        # cleanup after running
        # job_runner.close_connection()
        ml_repo._ml_repo.close_connection()
        try:
            shutil.rmtree(repo_path)
            # os.path.
        except OSError:
            pass