Example #1
def setup_multi_ruleset_classifier(rule_selection_method):
    ruleset_pos = utilities.create_numerical_test_ruleset_pos()
    ruleset_neg = utilities.create_numerical_test_ruleset_neg()
    classifier = RuleSetClassifier([ruleset_pos, ruleset_neg],
                                   rule_selection_method=rule_selection_method,
                                   confidence_metric=ConfidenceMetric.LAPLACE,
                                   weight_metric=WeightMetric.CONFIDENCE)
    X, y = utilities.create_numerical_test_data(20)
    classifier.update_rules_with_metrics(X, y)
    return classifier
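
A minimal usage sketch (not part of the original tests), assuming only the RuleSelectionMethod members that already appear in these examples:

# Hypothetical illustration: build one classifier per rule selection strategy
# using the helper above, e.g. to back parametrized tests.
first_hit_clf = setup_multi_ruleset_classifier(RuleSelectionMethod.FIRST_HIT)
weighted_max_clf = setup_multi_ruleset_classifier(RuleSelectionMethod.WEIGHTED_MAX)
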
    def test_ripper_iris(self):
        data = pd.read_csv(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
            header=None,
            names=[
                'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
                'class_name'
            ],
            dtype={
                'sepal_length': float,
                'sepal_width': float,
                'petal_length': float,
                'petal_width': float,
                'class_name': str
            })

        col_names = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
            'class_name'
        ]

        x_train, x_test, y_train, y_test = train_test_split(
            data.loc[:, col_names[:-1]],
            data.loc[:, col_names[-1]],
            random_state=0)
        estimator = RipperExplainer(d=2, k=2, pruning_threshold=50)
        estimator.fit(x_train, y_train)
        dnf_rule_set_list = estimator.explain_multiclass()
        classifier = RuleSetClassifier(
            dnf_rule_set_list[:2],
            rule_selection_method=RuleSelectionMethod.WEIGHTED_MAX,
            confidence_metric=ConfidenceMetric.LAPLACE,
            weight_metric=WeightMetric.CONFIDENCE,
            default_label='Iris-virginica')
        classifier.update_rules_with_metrics(x_test, y_test)
        reader = TrxfReader()
        reader.load_data_dictionary(x_train)
        serializer = NyokaSerializer(TIMESTAMP)
        exporter = PmmlExporter(reader, serializer)
        with open(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             './resources/iris.pmml')) as file:
            expected = file.read()
        actual = exporter.export(classifier)
        self.assertEqual(expected, actual)

    def test_ripper_wifi(self):
        data = pd.read_csv(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00422/wifi_localization.txt',
            header=None,
            delimiter='\t',
            names=['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'Y'],
            dtype={
                'X1': float,
                'X2': float,
                'X3': float,
                'X4': float,
                'X5': float,
                'X6': float,
                'X7': float,
                'Y': str
            })

        x = data.loc[:, ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']]
        y = data.loc[:, 'Y']

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            random_state=0)

        estimator = RipperExplainer(d=64, k=2, pruning_threshold=20)

        estimator.fit(x_train, y_train)
        dnf_rule_set_list = estimator.explain_multiclass()
        classifier = RuleSetClassifier(
            dnf_rule_set_list[:-1],
            rule_selection_method=RuleSelectionMethod.WEIGHTED_MAX,
            confidence_metric=ConfidenceMetric.LAPLACE,
            weight_metric=WeightMetric.CONFIDENCE,
            default_label='4')
        classifier.update_rules_with_metrics(x_test, y_test)
        reader = TrxfReader()
        reader.load_data_dictionary(x_train)
        serializer = NyokaSerializer(TIMESTAMP)
        exporter = PmmlExporter(reader, serializer)
        with open(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             './resources/wifi.pmml')) as file:
            expected = file.read()
        actual = exporter.export(classifier)
        self.assertEqual(expected, actual)

    def test_export_with_missing_data_dict_should_raise(self):
        reader = TrxfReader()
        serializer = NyokaSerializer()
        exporter = PmmlExporter(reader, serializer)

        test_classifier = RuleSetClassifier([create_test_ruleset()],
                                            RuleSelectionMethod.FIRST_HIT,
                                            default_label=0)
        self.assertRaises(AssertionError, exporter.export, test_classifier)
Example #5
    def test_predict_first_hit(self):
        ruleset = utilities.create_test_ruleset()
        classifier = RuleSetClassifier(
            [ruleset],
            rule_selection_method=RuleSelectionMethod.FIRST_HIT,
            default_label=0)
        assignment1 = {
            'toto0': 0.05,
            'toto1': 'bar',
            'toto2': False,
            'toto3': -2
        }
        actual = classifier.predict(assignment1)
        expected = 1
        self.assertEqual(expected, actual)

        assignment2 = {'toto0': -5, 'toto1': 5, 'toto2': 2, 'toto3': -2}
        actual = classifier.predict(assignment2)
        expected = 0
        self.assertEqual(expected, actual)
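
With FIRST_HIT, predict returns the label of the first ruleset whose rules the given assignment satisfies; assignment2 above presumably matches no rule, so the classifier falls back to default_label=0.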
Example #6
    def test_read(self):
        reader = TrxfReader(DATA_DICTIONARY)
        test_ruleset = create_test_ruleset()
        test_classifier = RuleSetClassifier(
            [test_ruleset],
            classifier.RuleSelectionMethod.FIRST_HIT,
            default_label=0)
        ruleset_model = RuleSetModel(miningSchema=TEST_MINING_SCHEMA,
                                     ruleSet=TEST_PMML_RULESET)
        expected = SimplePMMLRuleSetModel(dataDictionary=DATA_DICTIONARY,
                                          ruleSetModel=ruleset_model)
        self.assertEqual(reader.read(test_classifier), expected)

    def test_export(self):
        reader = TrxfReader()
        reader.load_data_dictionary(DATA_FRAME)
        serializer = NyokaSerializer(TIMESTAMP)
        exporter = PmmlExporter(reader, serializer)

        with open(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             './resources/toto.pmml')) as file:
            expected = file.read()
        test_classifier = RuleSetClassifier([create_test_ruleset()],
                                            RuleSelectionMethod.FIRST_HIT,
                                            default_label=0)
        actual = exporter.export(test_classifier)
        self.assertEqual(expected, actual)

    def test_ripper_adult(self):
        data_type = {
            'age': float,
            'workclass': str,
            'fnlwgt': float,
            'education': str,
            'education-num': float,
            'marital-status': str,
            'occupation': str,
            'relationship': str,
            'race': str,
            'sex': str,
            'capital-gain': float,
            'capital-loss': float,
            'native-country': str,
            'hours-per-week': float,
            'label': str
        }

        col_names = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'label'
        ]

        df = pd.read_csv(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
            header=None,
            delimiter=', ',
            engine='python',
            names=col_names,
            dtype=data_type)
        df.columns = df.columns.str.replace('-', '_')

        # keep only the rows where no cell contains the missing-value marker '?'
        df = df[(df.astype(str) != '?').all(axis=1)]
        data_train, data_test = train_test_split(df,
                                                 test_size=0.5,
                                                 random_state=1)

        train_labels = data_train.columns[:-1]
        test_label = data_test.columns[-1]

        x_train = data_train.loc[:, train_labels]
        y_train = data_train.loc[:, test_label]

        x_test = data_test.loc[:, train_labels]
        y_test = data_test.loc[:, test_label]

        generator = RipperRuleSetGenerator()
        ruleset = generator.generate(x_train,
                                     y_train,
                                     '>50K',
                                     d=64,
                                     k=2,
                                     pruning_threshold=100)
        classifier = RuleSetClassifier(
            [ruleset],
            rule_selection_method=RuleSelectionMethod.WEIGHTED_MAX,
            confidence_metric=ConfidenceMetric.LAPLACE,
            weight_metric=WeightMetric.CONFIDENCE,
            default_label='<=50K')
        classifier.update_rules_with_metrics(x_test, y_test)
        reader = TrxfReader()
        reader.load_data_dictionary(x_test)
        serializer = NyokaSerializer(TIMESTAMP)
        exporter = PmmlExporter(reader, serializer)
        with open(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'resources/adult.pmml')) as file:
            expected = file.read()
        actual = exporter.export(classifier)
        self.assertEqual(expected, actual)
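
Outside of a fixture comparison like the tests above, the exported PMML string can simply be written to disk. A minimal sketch (the file name is illustrative):

# Hypothetical: persist the PMML document produced by the exporter.
pmml_string = exporter.export(classifier)
with open('adult_ruleset.pmml', 'w') as f:
    f.write(pmml_string)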