Esempio n. 1
0
    def add(self, name, data, input_variables = None, target_variables = None):
        """Add raw data to the repository

        The data is stored under the repository name 'raw_data/<name>' and an accessor item for
        the stored object is set as attribute ``name`` on this object.

        Arguments:
            name {str} -- the name of the data added
            data {pandas DataFrame} -- the data as pandas DataFrame

        Keyword Arguments:
            input_variables {list of strings} -- list of column names defining the input variables for the machine learning (default: {None}). If None, all variables (except the target variables) are used as input
            target_variables {list of strings} -- list of column names defining the target variables for the machine learning (default: {None}). If None, no target data is added from the table.

        Raises:
            Exception -- if data does not contain a column listed in input_variables or target_variables
        """
        path = 'raw_data/' + name
        columns = list(data)
        known_columns = set(columns)

        # Validate the targets first: when the inputs are derived from the remaining columns, a
        # missing target must be reported with the descriptive message below and not as an
        # opaque ValueError coming from a failed list removal.
        if target_variables is not None:
            if any(column not in known_columns for column in target_variables):
                raise Exception('RawData does not include at least one column included in target_variables')

        if input_variables is None:
            # all columns that are not used as targets serve as inputs
            excluded = set(target_variables) if target_variables is not None else set()
            input_variables = [column for column in columns if column not in excluded]
        elif any(column not in known_columns for column in input_variables):
            raise Exception('RawData does not include at least one column included in input_variables')

        repo_info = {RepoInfoKey.NAME: path}
        x_values = data.loc[:, input_variables].values
        if target_variables is not None:
            raw_data = repo_objects.RawData(x_values, input_variables, data.loc[:, target_variables].values,
                target_variables, repo_info = repo_info)
        else:
            raw_data = repo_objects.RawData(x_values, input_variables, repo_info = repo_info)
        v = self._repo.add(raw_data, 'data ' + path + ' added to repository' , category = MLObjectType.RAW_DATA)
        # fetch only the lightweight object (full_object = False) to back the accessor attribute
        obj = self._repo.get(path, version=v, full_object = False)
        setattr(self, name, _RawDataItem(path, self._repo, obj))
Esempio n. 2
0
 def test_constructor(self):
     """Check that RawData can be constructed from a 2d array, a 1d array and a plain list."""
     # two-dimensional array: number of rows is taken over as number of data points
     matrix = np.zeros([100, 4])
     from_matrix = repo_objects.RawData(matrix, ['x1', 'x2', 'x3', 'x4'])
     self.assertEqual(from_matrix.x_data.shape[0], matrix.shape[0])
     self.assertEqual(from_matrix.n_data, 100)

     # one-dimensional array: x_data must end up two-dimensional with a single column
     from_vector = repo_objects.RawData(np.zeros([100]), ['x1'])
     vector_shape = from_vector.x_data.shape
     self.assertEqual(len(vector_shape), 2)
     self.assertEqual(vector_shape[1], 1)

     # plain python list: one row per list entry, a single column
     from_list = repo_objects.RawData([100, 200, 300], ['x1'])
     list_shape = from_list.x_data.shape
     self.assertEqual(list_shape[0], 3)
     self.assertEqual(list_shape[1], 1)
Esempio n. 3
0
 def add_from_numpy_file(self, name, filename_X, x_names, filename_Y=None, y_names = None):
     """Add raw data read from file(s) to the repository

     The x-data (and, if filename_Y is given, the y-data) is read with ``load``, wrapped into a
     RawData object with repository name ``name`` and added to the repository. An accessor item
     for the stored object is then set as attribute ``name`` on this object.

     Arguments:
         name -- name of the data, used as repository name and as attribute name
         filename_X -- file handed to ``load`` to obtain the x-data
         x_names -- names of the x-coordinates handed to RawData

     Keyword Arguments:
         filename_Y -- file handed to ``load`` to obtain the y-data (default: {None}). If None, no y-data is loaded
         y_names -- names of the y-coordinates handed to RawData (default: {None})
     """
     x_values = load(filename_X)
     y_values = load(filename_Y) if filename_Y is not None else None
     raw_data = repo_objects.RawData(x_values, x_names, y_values, y_names, repo_info={RepoInfoKey.NAME: name})
     commit_message = 'data ' + name + ' added to repository'
     version = self._repo.add(raw_data, commit_message, category=MLObjectType.RAW_DATA)
     # fetch only the lightweight object (full_object=False) to back the accessor attribute
     stored = self._repo.get(name, version=version, full_object=False)
     setattr(self, name, _RawDataItem(name, self._repo, stored))
Esempio n. 4
0
    def test_validation(self):
        """Check that RawData raises an exception when constructed from inconsistent input."""
        x_values = np.zeros([100, 2])

        # the number of coordinate names does not match the number of x-coordinates
        with self.assertRaises(Exception):
            repo_objects.RawData(x_values, x_coord_names=[])

        inconsistent_targets = [
            # number of y-rows differs from the number of x-rows
            (np.zeros([99, 2]), ['y1', 'y2']),
            # number of y-coordinate names differs from the number of y-coordinates
            (np.zeros([100, 2]), ['y1']),
        ]
        for y_values, y_names in inconsistent_targets:
            with self.assertRaises(Exception):
                repo_objects.RawData(
                    x_values,
                    x_coord_names=['x1', 'x2'],
                    y_data=y_values,
                    y_coord_names=y_names)
Esempio n. 5
0
 def test_repo_RawData(self):
     """Add a RawData object to a fresh repository, read it back and check the commit list."""
     repo = MLRepo(user='******')
     repo._job_runner = SimpleJobRunner(repo)

     name_key = repo_objects.RepoInfoKey.NAME.value
     original = repo_objects.RawData(
         np.zeros([10, 1]), ['test_coord'],
         repo_info={name_key: 'RawData_Test'})  # pylint: disable=E0602
     repo.add(original, 'test commit', MLObjectType.RAW_DATA)

     # the object read back must carry the same coordinate names
     restored = repo.get('RawData_Test')
     self.assertEqual(len(original.x_coord_names),
                      len(restored.x_coord_names))
     self.assertEqual(original.x_coord_names[0],
                      restored.x_coord_names[0])

     # exactly one commit containing exactly one object is expected
     commit_list = repo.get_commits()
     self.assertEqual(len(commit_list), 1)
     self.assertEqual(len(commit_list[0].objects), 1)
Esempio n. 6
0
    def setUp(self):
        '''Set up a complete ML repo with raw data, training and test data sets, a preprocessor,
        evaluation and training functions, training parameters, a model definition, a measure
        configuration and a calibrated model.
        '''
        repo = MLRepo(user='******')
        self.repository = repo
        repo._job_runner = SimpleJobRunner(repo)

        name_key = repo_objects.RepoInfoKey.NAME.value
        category_key = repo_objects.RepoInfoKey.CATEGORY

        # dummy RawData: three identically shaped objects that differ only in their name
        for raw_name in ('raw_1', 'raw_2', 'raw_3'):
            raw_data = repo_objects.RawData(
                np.zeros([10, 1]), ['x0'],
                np.zeros([10, 1]), ['y0'],
                repo_info={name_key: raw_name})
            repo.add(raw_data, category=MLObjectType.RAW_DATA)

        # dummy training and test DataSets on top of the RawData:
        # (raw data name, third DataSet argument, data set name, category)
        data_set_specs = [
            ('raw_1', None, 'training_data_1', MLObjectType.TRAINING_DATA),
            ('raw_2', None, 'test_data_1', MLObjectType.TEST_DATA),
            ('raw_3', 2, 'test_data_2', MLObjectType.TEST_DATA),
        ]
        data_sets = [
            DataSet(raw_name, 0, third_arg,
                    repo_info={name_key: set_name, category_key: category})
            for raw_name, third_arg, set_name, category in data_set_specs
        ]
        repo.add(data_sets)

        # dummy preprocessor consisting of a transforming and a fitting function
        repo.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test, repo_name='transform_func')
        repo.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        repo.add_preprocessor('test_preprocessor_with_fitting',
                              'transform_func',
                              'fit_func',
                              preprocessor_param=None)

        # evaluation and training functions plus the training parameters
        repo.add_eval_function(eval_func_test, 'eval_func')
        repo.add_training_function(train_func_test, 'train_func')
        training_param = TestClass(
            1,
            2,
            repo_info={  # pylint: disable=E1123
                name_key: 'training_param',
                category_key: MLObjectType.TRAINING_PARAM
            })
        repo.add(training_param)

        # dummy model definition tying the pieces above together
        repo.add_model('model',
                       'eval_func',
                       'train_func',
                       preprocessors=['test_preprocessor_with_fitting'])
        # measure configuration
        self._setup_measure_config()
        # dummy calibrated model
        self._add_calibrated_model()