def test_pipeline_optimizer_missing_values_categorical_attributes_run_works_fine(self):
    """Optimization over a dataset containing a categorical feature and
    missing values should succeed when a categorical encoder and an
    imputer are configured on the PipelineOptimizer."""
    # Dataset with a header row, class labels, a categorical column and
    # missing entries.
    data_reader = CSVDataReader(
        src=os.path.dirname(os.path.abspath(__file__)) +
        '/tests_files/dataset_header_classes_cat_miss.csv',
        has_header=True,
        contains_classes=True)
    # Bug fix: previously this passed data=self.__data_reader, silently
    # discarding the categorical/missing-value reader built above — the
    # scenario this test is named for was never actually exercised.
    ppo = PipelineOptimizer(
        data=data_reader,
        feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
        feature_transform_algorithms=['Normalizer', 'StandardScaler'],
        classifiers=['AdaBoost', 'Bagging'],
        categorical_features_encoder='OneHotEncoder',
        imputer='SimpleImputer',
        log=False)
    pipeline = ppo.run('Accuracy', 10, 10, 20, 20, 'ParticleSwarmAlgorithm')
    self.assertIsInstance(pipeline, Pipeline)
    # The chosen components must come from the candidate lists above.
    self.assertTrue(
        isinstance(pipeline.get_classifier(), AdaBoost)
        or isinstance(pipeline.get_classifier(), Bagging))
    self.assertTrue(
        isinstance(pipeline.get_feature_selection_algorithm(), SelectKBest)
        or isinstance(pipeline.get_feature_selection_algorithm(), SelectPercentile))
    # None is a legal outcome: the optimizer may select "no transform".
    self.assertTrue(
        pipeline.get_feature_transform_algorithm() is None
        or isinstance(pipeline.get_feature_transform_algorithm(), Normalizer)
        or isinstance(pipeline.get_feature_transform_algorithm(), StandardScaler))
def analyze_data(
    self,
    src: str,
    fitness_name: str,
    population_size: uint,
    number_of_evaluations: uint,
    optimization_algorithm: str,
    classifiers: Iterable,
    feature_selection_algorithms: Iterable = None,
    feature_transform_algorithms: Iterable = None,
    imputer: str = None,
) -> Pipeline:
    """
    Run the AutoML process via a NiaAML PipelineOptimizer instance.\n
    Args:
        src (str): path to a CSV file
        fitness_name (str): name of the fitness class to use as a function
        population_size (uint): number of individuals in the optimization process
        number_of_evaluations (uint): number of maximum evaluations
        optimization_algorithm (str): name of the optimization algorithm to use
        classifiers (Iterable[Classifier]): array of names of possible classifiers
        feature_selection_algorithms (Optional[Iterable[str]]): array of names of possible feature selection algorithms
        feature_transform_algorithms (Optional[Iterable[str]]): array of names of possible feature transform algorithms
        imputer (Optional[str]): name of the imputer used for features that contain missing values

    Returns:
        Pipeline: instance of Pipeline object from the NiaAML framework

    Note:
        See NiaAML's documentation for more details on possible input
        parameters' values and further usage of the returned Pipeline object.
    """
    # The CSV is expected to carry a header row and class labels.
    reader = CSVDataReader(src=src, contains_classes=True, has_header=True)
    optimizer = PipelineOptimizer(
        data=reader,
        classifiers=classifiers,
        feature_selection_algorithms=feature_selection_algorithms,
        feature_transform_algorithms=feature_transform_algorithms,
        imputer=imputer,
    )
    # run_v1 performs the single-level (non-nested) optimization.
    return optimizer.run_v1(
        fitness_name,
        population_size,
        number_of_evaluations,
        optimization_algorithm,
    )
def run(self):
    """Worker entry point: build a PipelineOptimizer from the GUI form
    data, run the selected optimization flavor, export the resulting
    pipeline (binary + text) and emit its string representation."""
    data_reader = CSVDataReader(src=self.__data.csvSrc,
                                has_header=self.__data.csvHasHeader)
    optimizer = PipelineOptimizer(
        data=data_reader,
        feature_selection_algorithms=self.__data.fsas,
        # Bug fix: the keyword was 'feature_transform_algorithm' (singular),
        # inconsistent with the plural keyword used by every other
        # PipelineOptimizer construction in this codebase.
        feature_transform_algorithms=self.__data.ftas,
        classifiers=self.__data.classifiers,
        categorical_features_encoder=self.__data.encoder,
        imputer=self.__data.imputer
    )
    # HACK: reach into the optimizer's name-mangled private logger so its
    # progress messages are forwarded to the GUI through the progress signal.
    optimizer._PipelineOptimizer__logger = HackyLogger(self.progress.emit)
    if self.__data.isOptimization is True:
        # Nested optimization: outer + inner population sizes, evaluation
        # budgets and algorithms.
        pipeline = optimizer.run(
            self.__data.fitnessFunctionName,
            self.__data.popSize,
            self.__data.popSizeInner,
            self.__data.numEvals,
            self.__data.numEvalsInner,
            self.__data.optAlgName,
            self.__data.optAlgInnerName)
    else:
        # v1 flavor: single-level optimization.
        pipeline = optimizer.run_v1(
            self.__data.fitnessFunctionName,
            self.__data.popSize,
            self.__data.numEvals,
            self.__data.optAlgName)
    pipeline.export(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
    pipeline.export_text(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
    self.optimized.emit(pipeline.to_string())
def test_pipeline_optimizeer_run_v1_works_fine(self):
    """run_v1 should produce a Pipeline whose components were all drawn
    from the candidate lists given to the optimizer."""
    optimizer = PipelineOptimizer(
        data=self.__data_reader,
        feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
        feature_transform_algorithms=['Normalizer', 'StandardScaler'],
        classifiers=['AdaBoost', 'Bagging'],
        log=False)
    pipeline = optimizer.run_v1('Accuracy', 10, 20, 'ParticleSwarmAlgorithm')
    self.assertIsInstance(pipeline, Pipeline)
    self.assertIsInstance(pipeline.get_classifier(), (AdaBoost, Bagging))
    self.assertIsInstance(
        pipeline.get_feature_selection_algorithm(),
        (SelectKBest, SelectPercentile))
    # The transform slot may legitimately be empty (None).
    transform = pipeline.get_feature_transform_algorithm()
    self.assertTrue(
        transform is None
        or isinstance(transform, (Normalizer, StandardScaler)))
def test_pipeline_optimizer_getters_work_fine(self):
    """The optimizer's getters should echo back the components given to
    the constructor (feature transform algorithms get None prepended to
    represent the 'no transform' option)."""
    ppo = PipelineOptimizer(
        data=self.__data_reader,
        feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
        feature_transform_algorithms=['Normalizer', 'StandardScaler'],
        classifiers=['AdaBoost', 'Bagging'],
        log=False)
    fsas = ppo.get_feature_selection_algorithms()
    ftas = ppo.get_feature_transform_algorithms()
    classifiers = ppo.get_classifiers()
    self.assertEqual(ppo.get_data(), self.__data_reader)
    self.assertTrue(
        (numpy.array(['AdaBoost', 'Bagging'])
         == numpy.array(classifiers)).all())
    self.assertTrue(
        (numpy.array(['SelectKBest', 'SelectPercentile'])
         == numpy.array(fsas)).all())
    # Bug fix: the original parenthesization was
    # numpy.array([...] == numpy.array(ftas)).all(), which wrapped the
    # comparison *result* in a fresh array instead of comparing two arrays
    # element-wise like the sibling assertions above.
    self.assertTrue(
        (numpy.array([None, 'Normalizer', 'StandardScaler'])
         == numpy.array(ftas)).all())
We use a dataset with 1 categorical feature and missing values to demonstrate a use of PipelineOptimizer instance with automatic feature encoding and imputation. """
# prepare data reader using csv file
# (no header row here, unlike the tests; the file does carry class labels)
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv',
    has_header=False,
    contains_classes=True)
# instantiate PipelineOptimizer that chooses among specified classifiers, feature selection algorithms and feature transform algorithms
# OneHotEncoder is used for encoding categorical features in this example
# SimpleImputer is used for imputing missing values in this example
# log is True by default, log_verbose means more information if True, log_output_file is the destination of a log file
# if log_output_file is not provided there is no file created
# if log is False, logging is turned off
pipeline_optimizer = PipelineOptimizer(
    data=data_reader,
    classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'],
    feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'],
    feature_transform_algorithms=['Normalizer', 'StandardScaler'],
    categorical_features_encoder='OneHotEncoder',
    imputer='SimpleImputer',
    log=True,
    log_verbose=True,
    log_output_file='output.log'
)
# runs the optimization process
# one of the possible pipelines in this case is: SelectPercentile -> Normalizer -> RandomForest
# returns the best found pipeline
# the chosen fitness function and optimization algorithm are Accuracy and Particle Swarm Algorithm
# arguments: fitness name, outer/inner population sizes (10, 10), outer/inner evaluation budgets (30, 30), outer/inner algorithms
pipeline = pipeline_optimizer.run('Accuracy', 10, 10, 30, 30, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm')
# pipeline variable contains Pipeline object that can be used for further classification, exported as an object (that can be later loaded and used) or exported as text file