def test_constructor_build(self):
    data_builder = DataBuilder(data_source='C:/python/practice2/AIBuilder/tests/data/test_data.csv',
                               target_column='target_1',
                               data_columns=['feature_1', 'feature_2', 'feature_3'])

    arti = AI(project_name='name', log_dir='path/to/dir')
    data_builder.validate()
    data_builder.build(ml_model=arti)

    column_names = ['feature_1', 'feature_2', 'feature_3', 'target_1']
    self.validate_data_frame(arti.training_data, column_names)
def create_AI(self, builders: list, ai_name: str = None) -> AbstractAI:
    self.validate_builders(builders)

    ml_model = AI(self.project_name, self.log_dir, ai_name)
    self.dispatcher.dispatch(ModelEvent(KernelDispatcher.PRE_RUN_BUILDERS, ml_model))

    name = ml_model.get_name()
    description = {}
    for builder in builders:
        self.console_printer.line('running: ' + builder.__class__.__name__)

        if isinstance(builder, EstimatorBuilder):
            # Restore the original name, which may have been overwritten when the model was loaded from cache.
            ml_model.set_name(name)

        result = builder.build(ml_model)
        if result is not None:
            # Result is None when the cache prevents execution; in that case keep using the old model, as it
            # carries the description that is required for determining the function cache key.
            # todo: fix this, we should not be passing a model around just because the cache needs the description.
            ml_model = result

        description[builder.builder_type] = builder.describe()

    ml_model.description = description

    return ml_model
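# The contract between create_AI and its builders is implicit in the loop above: build() returns a model,
# or None when a cache hit prevents execution. A minimal, self-contained sketch of that assumed contract;
# SimpleModel, CachedBuilder, and the dict-based cache are illustrative stand-ins, not part of AIBuilder:

class SimpleModel:
    def __init__(self, name: str):
        self.name = name
        self.description = {}


class CachedBuilder:
    builder_type = 'cached_example'

    def __init__(self, cache: dict):
        self.cache = cache

    def build(self, ml_model):
        if ml_model.name in self.cache:
            return None  # cache hit: signal the caller to keep the model it already has
        self.cache[ml_model.name] = True
        return ml_model  # cache miss: return the (possibly replaced) model

    def describe(self):
        return {'type': self.builder_type}


# Usage: the first build executes and populates the cache; the second returns None, so the caller
# keeps the model it already holds, exactly as the `if result is not None` branch above does.
cache = {}
builder = CachedBuilder(cache)
model = SimpleModel('demo')
assert builder.build(model) is model
assert builder.build(model) is None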
def run(self):
    self.model = None

    pre_run_event = TesterEvent(event_name=KernelDispatcher.PRE_RUN, session=self.session, tester=self.tester)
    self.dispatcher.dispatch(pre_run_event)

    while self.factory.has_next_ai():
        # todo: move the uniqueness check somewhere else; changing the sequence of operations seems too
        #  difficult from here, which may be a design smell.
        final_description = self.preview_model_description()
        final_description_hash = AITester.stable_hash_description(final_description)
        log_dir_path = self.factory.log_dir
        if not AITester.is_hash_unique_to_report(log_dir_path=log_dir_path,
                                                 description_hash=final_description_hash):
            self.factory.builder_permutations.pop()

            # todo: use some kind of null model here, as it is only used to convey the name and log dir?
            model = AI(self.factory.project_name, self.factory.log_dir, self.factory.project_name + '_X')
            model.description = final_description
            self.tester.ml_model = model

            self.ModelNotUnique()
            continue

        self.doCreateModel()

        if self.train:
            self.doTrainModel()

        self.all_models.append(self.model)

        if self.evaluate:
            self.doEvaluateModel()

        if self.predict:
            self.prediction_results = self.doPredict()

    post_run_event = TesterEvent(event_name=KernelDispatcher.POST_RUN, session=self.session, tester=self.tester)
    self.dispatcher.dispatch(post_run_event)
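# run() depends on AITester.stable_hash_description producing identical hashes for equal descriptions across
# processes; Python's built-in hash() is randomized per process, so it cannot be used for the report lookup.
# A plausible implementation (a sketch under that assumption, not the actual AIBuilder code) serializes the
# description deterministically and hashes the bytes:

import hashlib
import json


def stable_hash_description(description: dict) -> str:
    # sort_keys makes the JSON rendering independent of dict insertion order;
    # default=str covers values that are not natively JSON-serializable.
    canonical = json.dumps(description, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()


# Two dicts with the same contents hash identically, regardless of key order:
assert stable_hash_description({'a': 1, 'b': 2}) == stable_hash_description({'b': 2, 'a': 1})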
def setUp(self):
    strategy_class = self.get_strategy_class_name()
    self.ml_model = AI(project_name='test', log_dir='test/dir', name='test')
    self.ml_model.optimizer = mock.Mock()
    self.ml_model.training_data = mock.Mock()
    self.ml_model.training_data.get_tf_feature_columns = mock.Mock()

    kwargs = {'ml_model': self.ml_model}
    kwargs.update(self.additional_parameters())
    self.strategy = strategy_class(**kwargs)
def setUp(self):
    data_array = {'feat_A': [1, 2, 3], 'feat_B': [8, 6, 4], 'target': [9, 8, 7]}
    df = pd.DataFrame(data_array)
    # Give the evaluation model its own copy so mutations on one frame cannot leak into the other.
    df2 = df.copy()

    train_model = DataModel(df)
    train_model.set_target_column('target')
    train_model.set_feature_columns(['feat_A', 'feat_B'])

    eval_model = DataModel(df2)
    eval_model.set_target_column('target')
    eval_model.set_feature_columns(['feat_A', 'feat_B'])

    self.arti = AI('test', 'test/test')
    self.arti.training_data = train_model
    self.arti.evaluation_data = eval_model
def setUp(self):
    self.ml_model = AI(project_name='test', log_dir='test/dir', name='test')
class TestCategoricalDataSplitter(unittest.TestCase):
    def setUp(self) -> None:
        self.arti = AI('test', 'test2')
        data = {'col1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0],
                'col2': ['train', 'eval', 'train', 'eval', 'eval', 'train1', 'eval', 'train', 'train', 'train1']}
        self.dataframe = pd.DataFrame(data=data)

    def assert_data_split(self):
        # Every test expects the same outcome: the six 'train'/'train1' rows go to training
        # and the four 'eval' rows go to evaluation.
        split_training_data = self.arti.get_training_data().get_dataframe()
        split_evaluation_data = self.arti.get_evaluation_data().get_dataframe()

        self.assertEqual(6, len(split_training_data))
        for item in split_training_data.values:
            self.assertTrue('train' in item[1])
            self.assertTrue(item[0] in [1, 3, 6, 8, 9, 0], f'item {item[0]} not found.')

        self.assertEqual(4, len(split_evaluation_data))
        for item in split_evaluation_data.values:
            self.assertEqual('eval', item[1])
            self.assertTrue(item[0] in [2, 4, 5, 7], f'item {item[0]} not found.')

    def test_build_training_data(self):
        builder = CategoricalDataSplitter(data_source='training', column_name='col2',
                                          training_categories=['train', 'train1'])

        training_model = DataModel(self.dataframe)
        self.arti.set_training_data(training_model)

        self.arti = builder.build(ml_model=self.arti)
        self.assert_data_split()

    def test_build_evaluation_data(self):
        builder = CategoricalDataSplitter(data_source='evaluation', column_name='col2',
                                          training_categories=['train', 'train1'])

        evaluation_model = DataModel(self.dataframe)
        self.arti.set_evaluation_data(evaluation_model)

        self.arti = builder.build(ml_model=self.arti)
        self.assert_data_split()

    def test_evaluation_categories(self):
        builder = CategoricalDataSplitter(data_source='training', column_name='col2', eval_categories=['eval'])

        training_model = DataModel(self.dataframe)
        self.arti.set_training_data(training_model)

        self.arti = builder.build(ml_model=self.arti)
        self.assert_data_split()

    def test_both_categories(self):
        builder = CategoricalDataSplitter(data_source='training', column_name='col2', eval_categories=['eval'],
                                          training_categories=['train', 'train1'])

        training_model = DataModel(self.dataframe)
        self.arti.set_training_data(training_model)

        self.arti = builder.build(ml_model=self.arti)
        self.assert_data_split()

    def test_missing_categories(self):
        # 'train1' appears in the data but is covered by neither category list, so the build should fail.
        builder = CategoricalDataSplitter(data_source='training', column_name='col2', eval_categories=['eval'],
                                          training_categories=['train'])

        training_model = DataModel(self.dataframe)
        self.arti.set_training_data(training_model)

        with self.assertRaises(AssertionError):
            self.arti = builder.build(ml_model=self.arti)

    def test_no_categories(self):
        builder = CategoricalDataSplitter(data_source='training', column_name='col2')

        training_model = DataModel(self.dataframe)
        self.arti.set_training_data(training_model)

        with self.assertRaises(AssertionError):
            self.arti = builder.build(ml_model=self.arti)
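# The tests above pin down the splitter's behavior without showing it: given categories must cover the
# column when both lists are passed, a single list implies its complement, and no list is an error.
# A standalone pandas sketch of those semantics; split_by_category is a hypothetical helper, not the
# actual CategoricalDataSplitter implementation:

import pandas as pd


def split_by_category(df: pd.DataFrame, column: str, training_categories=None, eval_categories=None):
    training_categories = set(training_categories or [])
    eval_categories = set(eval_categories or [])
    assert training_categories or eval_categories, 'at least one category list is required'

    found = set(df[column].unique())
    if training_categories and eval_categories:
        # Both lists given: together they must account for every category present in the column.
        unmapped = found - (training_categories | eval_categories)
        assert not unmapped, f'unmapped categories: {unmapped}'
    elif training_categories:
        eval_categories = found - training_categories  # everything else evaluates
    else:
        training_categories = found - eval_categories  # everything else trains

    train_df = df[df[column].isin(training_categories)]
    eval_df = df[df[column].isin(eval_categories)]
    return train_df, eval_df


# Usage, mirroring the fixture above: six rows land in training, four in evaluation.
data = {'col1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0],
        'col2': ['train', 'eval', 'train', 'eval', 'eval', 'train1', 'eval', 'train', 'train', 'train1']}
train_df, eval_df = split_by_category(pd.DataFrame(data), 'col2', training_categories=['train', 'train1'])
assert len(train_df) == 6 and len(eval_df) == 4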