Example #1
    def test_repo_training_test_data(self):
        # init repository with sample in memory handler
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        training_data = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'training_data'})
        repository.add(training_data, category=MLObjectType.TRAINING_DATA)

        training_data_2 = repository.get_training_data()
        self.assertEqual(
            training_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            training_data.repo_info[repo_objects.RepoInfoKey.NAME])

        test_data = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data'})
        repository.add(test_data, category=MLObjectType.TEST_DATA)
        test_data_ref = repository.get('test_data')
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data.repo_info[repo_objects.RepoInfoKey.NAME])
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.VERSION],
            test_data.repo_info[repo_objects.RepoInfoKey.VERSION])

        test_data_2 = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data_2'})
        repository.add(test_data_2, category=MLObjectType.TEST_DATA)
        test_data_2_ref = repository.get('test_data_2')
        self.assertEqual(
            test_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data_2_ref.repo_info[repo_objects.RepoInfoKey.NAME])

        commits = repository.get_commits()
        self.assertEqual(len(commits), 3)
        self.assertEqual(commits[1].objects['test_data'],
                         test_data.repo_info.version)
        #self.assertEqual(commits[1].objects['repo_mapping'], 1)
        self.assertEqual(commits[2].objects['test_data_2'],
                         test_data_2.repo_info.version)
Example #2
    def test_repo_RawData(self):
        """Test RawData within repo
        """
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]),
            ['test_coord'],
            repo_info={  # pylint: disable=E0602
                repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'
            })
        repository.add(raw_data, 'test commit', MLObjectType.RAW_DATA)
        raw_data_2 = repository.get('RawData_Test')
        self.assertEqual(len(raw_data.x_coord_names),
                         len(raw_data_2.x_coord_names))
        self.assertEqual(raw_data.x_coord_names[0],
                         raw_data_2.x_coord_names[0])
        commits = repository.get_commits()
        self.assertEqual(len(commits), 1)
        self.assertEqual(len(commits[0].objects), 1)
Example #3
    def test_tutorial(self):
        # cleanup disk before running
        repo_path = './tmp_tutorial'
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass

        # creating in memory storage
        ml_repo = MLRepo(user='******')
        # end creating in memory storage

        # creating new repository
        config = {
            'user': '******',
            'workspace': repo_path,
            'repo_store': {
                'type': 'disk_handler',
                'config': {
                    'folder': repo_path,
                    'file_format': 'pickle'
                }
            },
            'numpy_store': {
                'type': 'hdf_handler',
                'config': {
                    'folder': repo_path,
                    'version_files': True
                }
            },
            'job_runner': {
                'type': 'simple',
                'config': {
                    'throw_job_error': True
                }
            }
        }
        ml_repo = MLRepo(user='******', config=config)
        # end creating new repository
        # specifying job runner
        job_runner = SimpleJobRunner(None)
        job_runner.set_repo(ml_repo)
        ml_repo._job_runner = job_runner
        # end specifying job runner
        job_runner._throw_job_error = True

        from pailab.tools.tree import MLTree
        MLTree.add_tree(ml_repo)

        # A convenient way to add RawData is to use the add method on the raw_data collection
        # of the repository tree. It takes a pandas DataFrame together with a specification of
        # which columns belong to the inputs and which to the targets (see the commented-out
        # call and the hedged sketch further below).

        try:
            # read pandas
            import pandas as pd
            data = pd.read_csv('./examples/boston_housing/housing.csv')
            # end read pandas
        except FileNotFoundError:
            # fallback relative path used when the test is run from the tests folder
            data = pd.read_csv('../examples/boston_housing/housing.csv')

        # extract data
        input_variables = ['RM', 'LSTAT', 'PTRATIO']
        target_variables = ['MEDV']
        x = data.loc[:, input_variables].values
        y = data.loc[:, target_variables].values
        # end extract data

        # add RawData snippet
        from pailab import RawData, RepoInfoKey

        raw_data = RawData(x, input_variables, y, target_variables, repo_info={
                           RepoInfoKey.NAME: 'raw_data/boston_housing'})
        ml_repo.add(raw_data)

        # end adding RawData snippet
        # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
        #    'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])
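        # Hedged sketch of the shortcut described earlier (kept commented out so the data is not
        # added twice): after MLTree.add_tree(ml_repo), the raw_data collection is assumed to
        # accept the DataFrame directly and to create the same 'raw_data/boston_housing' entry
        # as the manual RawData route above; check pailab.tools.tree before relying on it.
        #
        # ml_repo.tree.raw_data.add(
        #     'boston_housing', data,
        #     input_variables=['RM', 'LSTAT', 'PTRATIO'],
        #     target_variables=['MEDV'])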

        # add DataSet
        # create DataSet objects for training and test data
        training_data = DataSet('raw_data/boston_housing', 0, 300,
                                repo_info={RepoInfoKey.NAME: 'training_data',
                                           RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
        test_data = DataSet('raw_data/boston_housing', 301, None,
                            repo_info={RepoInfoKey.NAME: 'test_data',
                                       RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
        # add the objects to the repository
        version_list = ml_repo.add(
            [training_data, test_data], message='add training and test data')
        # end adding DataSet

        # add model
        import pailab.externals.sklearn_interface as sklearn_interface
        from sklearn.tree import DecisionTreeRegressor
        sklearn_interface.add_model(
            ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
        # end adding model

        # run training
        job_id = ml_repo.run_training()
        # end running training

        # run evaluation
        job_id = ml_repo.run_evaluation()
        # end running evaluation

        # add measures snippet
        ml_repo.add_measure(MeasureConfiguration.MAX)
        ml_repo.add_measure(MeasureConfiguration.R2)
        # end add measure snippet

        # run measures snippet
        job_ids = ml_repo.run_measures()
        # end run measures snippet

        print(ml_repo.get_names(MLObjectType.MEASURE))

        # get measures
        max_measure = ml_repo.get(
            'DecisionTreeRegressor/measure/training_data/max')
        print(str(max_measure.value))
        # end getting measures

        # label snippet
        from pailab import LAST_VERSION
        ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                          model_version=LAST_VERSION, message='we found our first production model')
        # end label snippet

        # test definition snippet
        import pailab.tools.tests
        reg_test = pailab.tools.tests.RegressionTestDefinition(
            reference='prod', models=None, data=None, labels=None,
            measures=[MeasureConfiguration.MAX],  tol=1000)
        reg_test.repo_info.name = 'reg_test'
        ml_repo.add(reg_test, message='regression test definition')
        # end test definition snippet

        # add test snippet
        tests = ml_repo.run_tests()
        # end add test snippet
        print(tests)

        # run check snippet
        import pailab.tools.checker as checker
        inconsistencies = checker.run(ml_repo)
        # end run check snippet

        print(inconsistencies)

        # add inconsistency snippet
        param = ml_repo.get('DecisionTreeRegressor/model_param')
        param.sklearn_params['max_depth'] = 2
        version = ml_repo.add(param)
        # end add inconsistency snippet

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_training()

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_evaluation(run_descendants=True)

        print(checker.run(ml_repo))

        # add second test data snippet
        test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                              repo_info={RepoInfoKey.NAME: 'test_data_2',
                                         RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA}
                              )
        ml_repo.add(test_data_2)
        ml_repo.run_evaluation(run_descendants=True)
        # end add second test data snippet

        print(checker.Tests.run(ml_repo))

        ml_repo.run_tests()

        # check tests
        print(checker.Tests.run(ml_repo))
        # end check tests

        # cleanup after running
        # job_runner.close_connection()
        ml_repo._ml_repo.close_connection()
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass