Example 1
    def test_pipeline_optimizer_missing_values_categorical_attributes_run_works_fine(
            self):
        data_reader = CSVDataReader(
            src=os.path.dirname(os.path.abspath(__file__)) +
            '/tests_files/dataset_header_classes_cat_miss.csv',
            has_header=True,
            contains_classes=True)
        ppo = PipelineOptimizer(
            data=data_reader,
            feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
            feature_transform_algorithms=['Normalizer', 'StandardScaler'],
            classifiers=['AdaBoost', 'Bagging'],
            categorical_features_encoder='OneHotEncoder',
            imputer='SimpleImputer',
            log=False)

        pipeline = ppo.run('Accuracy', 10, 10, 20, 20,
                           'ParticleSwarmAlgorithm')
        self.assertIsInstance(pipeline, Pipeline)
        self.assertTrue(
            isinstance(pipeline.get_classifier(), AdaBoost)
            or isinstance(pipeline.get_classifier(), Bagging))
        self.assertTrue(
            isinstance(pipeline.get_feature_selection_algorithm(), SelectKBest)
            or isinstance(pipeline.get_feature_selection_algorithm(),
                          SelectPercentile))
        self.assertTrue(
            pipeline.get_feature_transform_algorithm() is None or isinstance(
                pipeline.get_feature_transform_algorithm(), Normalizer)
            or isinstance(pipeline.get_feature_transform_algorithm(),
                          StandardScaler))
Example 2
 def analyze_data(
     self,
     src: str,
     fitness_name: str,
     population_size: int,
     number_of_evaluations: int,
     optimization_algorithm: str,
     classifiers: Iterable[str],
     feature_selection_algorithms: Optional[Iterable[str]] = None,
     feature_transform_algorithms: Optional[Iterable[str]] = None,
     imputer: Optional[str] = None,
 ) -> Pipeline:
     """
     Method for running AutoML process using NiaAML
     PipelineOptimizer class instance.\n
     Args:
         src (str):
             path to a CSV file
         fitness_name (str):
             name of the fitness class to use as a function
         population_size (uint):
             number of individuals in the optimization process
         number_of_evaluations (uint):
             number of maximum evaluations
         optimization_algorithm (str):
             name of the optimization algorithm to use
         classifiers (Iterable[Classifier]):
             array of names of possible classifiers
         feature_selection_algorithms (Optional[Iterable[str]]):
             array of names of possible feature selection algorithms
         feature_transform_algorithms (Optional[Iterable[str]]):
             array of names of possible feature transform algorithms
         imputer (Optional[str]):
             name of the imputer used for features
             that contain missing values
     Returns:
         Pipeline: instance of Pipeline object from the NiaAML framework
     Note:
         See NiaAML's documentation for more details on possible
         input parameters' values and further usage of the
         returned Pipeline object.
     """
     data = CSVDataReader(src=src, contains_classes=True, has_header=True)
     pipeline_optimizer = PipelineOptimizer(
         data=data,
         classifiers=classifiers,
         feature_selection_algorithms=feature_selection_algorithms,
         feature_transform_algorithms=feature_transform_algorithms,
         imputer=imputer,
     )
     pipeline = pipeline_optimizer.run_v1(
         fitness_name,
         population_size,
         number_of_evaluations,
         optimization_algorithm
     )
     return pipeline
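
For context, a call to this method might look like the sketch below. Everything here is hypothetical: 'analyzer' stands in for an instance of the enclosing class, and the CSV path and parameter values are placeholders.

# hypothetical usage of analyze_data; 'analyzer' is an instance of the class that defines it
pipeline = analyzer.analyze_data(
    src='datasets/my_dataset.csv',  # placeholder path
    fitness_name='Accuracy',
    population_size=15,
    number_of_evaluations=40,
    optimization_algorithm='ParticleSwarmAlgorithm',
    classifiers=['AdaBoost', 'Bagging'],
    feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
    feature_transform_algorithms=['Normalizer'],
    imputer='SimpleImputer',
)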
Example 3
    def run(self):
        dataReader = CSVDataReader(src=self.__data.csvSrc, has_header=self.__data.csvHasHeader)
        optimizer = PipelineOptimizer(
            data=dataReader,
            feature_selection_algorithms=self.__data.fsas,
            feature_transform_algorithms=self.__data.ftas,
            classifiers=self.__data.classifiers,
            categorical_features_encoder=self.__data.encoder,
            imputer=self.__data.imputer
        )
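        # swap in a logger that forwards messages through the GUI's progress signal;
        # the optimizer's private __logger attribute is reached via its name-mangled form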
        optimizer._PipelineOptimizer__logger = HackyLogger(self.progress.emit)

        if self.__data.isOptimization:
            pipeline = optimizer.run(
                self.__data.fitnessFunctionName,
                self.__data.popSize,
                self.__data.popSizeInner,
                self.__data.numEvals,
                self.__data.numEvalsInner,
                self.__data.optAlgName,
                self.__data.optAlgInnerName)
        else:
            pipeline = optimizer.run_v1(
                self.__data.fitnessFunctionName,
                self.__data.popSize,
                self.__data.numEvals,
                self.__data.optAlgName)

        pipeline.export(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
        pipeline.export_text(os.path.join(self.__data.outputFolder, 'niaamlGUIoutput'))
        self.optimized.emit(pipeline.to_string())
Example 4
 def test_pipeline_optimizer_run_v1_works_fine(self):
     ppo = PipelineOptimizer(
         data=self.__data_reader,
         feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
         feature_transform_algorithms=['Normalizer', 'StandardScaler'],
         classifiers=['AdaBoost', 'Bagging'],
         log=False)
     pipeline = ppo.run_v1('Accuracy', 10, 20, 'ParticleSwarmAlgorithm')
     self.assertIsInstance(pipeline, Pipeline)
     self.assertTrue(
         isinstance(pipeline.get_classifier(), AdaBoost)
         or isinstance(pipeline.get_classifier(), Bagging))
     self.assertTrue(
         isinstance(pipeline.get_feature_selection_algorithm(), SelectKBest)
         or isinstance(pipeline.get_feature_selection_algorithm(),
                       SelectPercentile))
     self.assertTrue(
         pipeline.get_feature_transform_algorithm() is None or isinstance(
             pipeline.get_feature_transform_algorithm(), Normalizer)
         or isinstance(pipeline.get_feature_transform_algorithm(),
                       StandardScaler))
Example 5
    def test_pipeline_optimizer_getters_work_fine(self):
        ppo = PipelineOptimizer(
            data=self.__data_reader,
            feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
            feature_transform_algorithms=['Normalizer', 'StandardScaler'],
            classifiers=['AdaBoost', 'Bagging'],
            log=False)

        fsas = ppo.get_feature_selection_algorithms()
        ftas = ppo.get_feature_transform_algorithms()
        classifiers = ppo.get_classifiers()

        self.assertEqual(ppo.get_data(), self.__data_reader)
        self.assertTrue(
            (numpy.array(['AdaBoost',
                          'Bagging']) == numpy.array(classifiers)).all())
        self.assertTrue(
            (numpy.array(['SelectKBest',
                          'SelectPercentile']) == numpy.array(fsas)).all())

        self.assertTrue(
            (numpy.array([None, 'Normalizer', 'StandardScaler'])
             == numpy.array(ftas)).all())
Example 6
"""
We use a dataset with one categorical feature and missing values to
demonstrate the use of a PipelineOptimizer instance with automatic
feature encoding and imputation.
"""

# prepare the data reader using a CSV file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv',
    has_header=False,
    contains_classes=True)

# instantiate a PipelineOptimizer that chooses among the specified classifiers, feature selection algorithms and feature transform algorithms
# OneHotEncoder encodes the categorical features and SimpleImputer imputes the missing values in this example
# log is True by default; log_verbose=True produces more detailed output and log_output_file is the destination of the log file
# if log_output_file is not provided, no file is created; if log is False, logging is turned off
pipeline_optimizer = PipelineOptimizer(
    data=data_reader,
    classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'],
    feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'],
    feature_transform_algorithms=['Normalizer', 'StandardScaler'],
    categorical_features_encoder='OneHotEncoder',
    imputer='SimpleImputer',
    log=True,
    log_verbose=True,
    log_output_file='output.log'
)

# run the optimization process and get the best pipeline found
# the fitness function is Accuracy and Particle Swarm Algorithm is used for both the outer and the inner optimization
# one possible resulting pipeline is: SelectPercentile -> Normalizer -> RandomForest
pipeline = pipeline_optimizer.run('Accuracy', 10, 10, 30, 30, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm')

# the pipeline variable now holds a Pipeline object that can be used for further classification, exported as an object (to be loaded and reused later) or exported as a text file
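
A minimal sketch of that further usage follows. The file names and the sample feature values are placeholders; see NiaAML's documentation for the exact export and load API.

import pandas
from niaaml import Pipeline

# export the pipeline as a reloadable object and as a human-readable text file
pipeline.export('pipeline.ppln')
pipeline.export_text('pipeline.txt')

# load the exported pipeline later and classify a new sample
# the feature values here are hypothetical placeholders matching the dataset's columns
loaded_pipeline = Pipeline.load('pipeline.ppln')
predictions = loaded_pipeline.run(pandas.DataFrame([[10.32, 3.9, 1.5, 'a']]))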