Example 1
 def test_predict_unfitted(self):
     rules = TrainableRules(
         base_model_name="sklearn.tree.DecisionTreeClassifier",
         prune_branches_algorithms=[],
         prune_attributes=False,
         confidence_threshold=0)
     with self.assertRaises(NotFittedError):
         rules.predict(self.test_x)
Example 2
 def test_top_down_greedy_prune(self):
     def test_budget(budget):
         rules = TrainableRules(
             "sklearn.tree.DecisionTreeClassifier",
             prune_branches_algorithms=["top-down-greedy"],
             prune_attributes=False,
             top_down_greedy_budget=(False, budget),
             random_state=1989)
         rules.fit(self.train_x, self.train_y)
         return rules.score(self.train_x, self.train_y)
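
     # hedged driver sketch (not part of the original snippet): exercise a
     # few budget fractions; the .5 floor mirrors the other tests here
     for budget in (.5, .75, 1.):
         self.assertGreater(test_budget(budget), .5)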
Example 3
 def test_predict_winner_indices(self):
     rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                            prune_branches_algorithms=[],
                            prune_attributes=False,
                            min_samples_leaf=26,
                            random_state=1989)
     rules.fit(self.train_x, self.train_y)
     pred_y, winners = rules.rules.predict(self.train_x,
                                           return_winner_indices=True)
     for ycls, w in zip(pred_y, winners):
         self.assertEqual(ycls, rules.rules.rules[w].stats.cls)
Example 4
 def test_tree_attr_pruning(self):
     model = tree.DecisionTreeClassifier(min_samples_leaf=26,
                                         random_state=1989)
     model = model.fit(self.train_x, self.train_y)
     rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                            prune_branches_algorithms=[],
                            prune_attributes=True,
                            min_samples_leaf=26,
                            random_state=1989)
     rules.fit(self.train_x, self.train_y)
     tree_score = model.score(self.test_x, self.test_y)
     rules_score = rules.score(self.test_x, self.test_y)
     self.assertGreater(rules_score * 1.1, tree_score)
Example 5
 def test_tree_no_pruning(self):
     model = tree.DecisionTreeClassifier(min_samples_leaf=26,
                                         random_state=1989)
     model = model.fit(self.train_x, self.train_y)
     rules = TrainableRules(
         base_model_name="sklearn.tree.DecisionTreeClassifier",
         prune_branches_algorithms=[],
         confidence_threshold=0,
         prune_attributes=False,
         min_samples_leaf=26,
         random_state=1989)
     rules.fit(self.train_x, self.train_y)
     tree_score = model.score(self.train_x, self.train_y)
     rules_score = rules.score(self.train_x, self.train_y)
     self.assertGreater(rules_score * 1.1, tree_score)
Example 6
 def test_forest_no_pruning(self):
     model = ensemble.RandomForestClassifier(n_estimators=50,
                                             min_samples_leaf=26,
                                             random_state=1989)
     model = model.fit(self.train_x, self.train_y)
     rules = TrainableRules("sklearn.ensemble.RandomForestClassifier",
                            prune_branches_algorithms=[],
                            prune_attributes=False,
                            n_estimators=50,
                            min_samples_leaf=26,
                            random_state=1989)
     rules.fit(self.train_x, self.train_y)
     forest_score = model.score(self.train_x, self.train_y)
     rules_score = rules.score(self.train_x, self.train_y)
     self.assertGreater(rules_score * 1.1, forest_score)
Example 7
 def test_rules_estimator(self):
     estimator = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                                prune_branches_algorithms=[],
                                prune_attributes=False)
     scores = model_selection.cross_val_score(estimator, self.x, self.y)
     score = sum(scores) / len(scores)
     self.assertGreater(score, .5)
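
Example 7 passes because TrainableRules implements the scikit-learn estimator protocol (keyword constructor parameters plus fit/predict/score), so it composes with the rest of sklearn.model_selection as well. A hedged sketch under the same fixtures (self.x, self.y); the searched grid and the .5 floor are illustrative, not taken from the original suite:

 def test_rules_estimator_grid_search(self):
     estimator = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                                prune_branches_algorithms=[],
                                prune_attributes=False)
     # hypothetical grid: any constructor keyword is a searchable parameter
     search = model_selection.GridSearchCV(
         estimator, {"min_samples_leaf": [13, 26, 52]}, cv=3)
     search.fit(self.x, self.y)
     self.assertGreater(search.best_score_, .5)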
Example 8
 def setUp(self):
     (self.train_x, self.test_x, self.train_y,
      self.test_y), _, _ = load_abalone_data()
     self.config = {
         "trainable_rules": {
             "base_model_name": "sklearn.tree.DecisionTreeClassifier",
             "prune_branches_algorithms": [],
             "prune_attributes": False,
             "min_samples_leaf": 26,
             "random_state": 1989,
         },
     }
     trainer = TrainableRules(**self.config["trainable_rules"],
                              origin_config=self.config)
     trainer.fit(self.test_x, self.test_y)
     self.rules = trainer.rules
     self.fm = FormatModel().load(
         os.path.join(os.path.dirname(__file__), "model_jquery.asdf"))
     self.maxDiff = None
Example 9
    def test_integration(self):
        res = self.extractor.extract_features(self.files)
        self.assertIsNotNone(res, "Failed to parse files.")
        X, y, _ = res
        train_X, test_X, train_y, test_y = \
            model_selection.train_test_split(X, y, random_state=1989)

        model = tree.DecisionTreeClassifier(min_samples_leaf=26,
                                            random_state=1989,
                                            max_depth=None,
                                            max_features="auto",
                                            min_samples_split=2)
        model.fit(train_X, train_y)
        rules = TrainableRules(
            base_model_name="sklearn.tree.DecisionTreeClassifier",
            prune_branches_algorithms=[],
            prune_attributes=False,
            min_samples_leaf=26,
            random_state=1989,
            max_depth=None,
            max_features="auto",
            min_samples_split=2,
            confidence_threshold=0)
        rules.fit(train_X, train_y)
        model_score_train = model.score(train_X, train_y)
        model_score_test = model.score(test_X, test_y)
        rules_score_train = rules.score(train_X, train_y)
        rules_score_test = rules.score(test_X, test_y)
        self.assertEqual(rules_score_train, model_score_train)
        self.assertEqual(rules_score_test, model_score_test)
Example 10
    def test_integration(self):
        X, y, _ = self.extractor.extract_features(self.files)
        train_X, test_X, train_y, test_y = \
            model_selection.train_test_split(X, y, random_state=1989)

        model = tree.DecisionTreeClassifier(min_samples_leaf=26,
                                            random_state=1989,
                                            max_depth=None,
                                            max_features="auto",
                                            min_samples_split=2)
        model.fit(train_X, train_y)
        rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                               prune_branches_algorithms=[],
                               prune_attributes=False,
                               min_samples_leaf=26,
                               random_state=1989,
                               max_depth=None,
                               max_features="auto",
                               min_samples_split=2)
        rules.fit(train_X, train_y)
        model_score_train = model.score(train_X, train_y)
        model_score_test = model.score(test_X, test_y)
        rules_score_train = rules.score(train_X, train_y)
        rules_score_test = rules.score(test_X, test_y)
        self.assertEqual(rules_score_train, model_score_train)
        self.assertEqual(rules_score_test, model_score_test)
Example 11
    def test_reduced_error_prune(self):
        LEAF = _tree.TREE_LEAF
        UNDEFINED = _tree.TREE_UNDEFINED

        class FakeFeature:
            # write-ignoring stand-in for the flat tree_.feature array
            def __getitem__(self, item):
                pass

            def __setitem__(self, key, value):
                pass

        class FakeTree:
            def __init__(self, *args):
                self.children_left = numpy.array(args[0])
                self.children_right = numpy.array(args[1])
                self.feature = FakeFeature()
                # per-node class counts shaped like sklearn's tree_.value:
                # (n_nodes, 1, n_classes), with classes_ == [0, 1]
                self.value = numpy.array([
                    [[50, 100]],
                    [[45, 50]],
                    [[5, 40]],
                    [[40, 10]],
                    [[2, 20]],
                    [[3, 20]],
                    [[20, 5]],
                    [[20, 5]],
                    [[1, 19]],
                    [[2, 1]],
                    [[5, 50]],
                ])

        class FakeModel:
            def __init__(self):
                # flat sklearn-style child arrays; _tree.TREE_LEAF marks leaves
                self.tree_ = FakeTree(
                    [1, 2, 4, 6, 8, LEAF, LEAF, LEAF, LEAF, LEAF, LEAF],
                    [10, 3, 5, 7, 9, LEAF, LEAF, LEAF, LEAF, LEAF, LEAF],
                )
                self.classes_ = [0, 1]

            def decision_path(self, X):
                # one row per sample, 1 at every node the sample traverses,
                # mirroring sklearn's decision_path()
                return sparse.csr_matrix([
                    [1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
                    [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
                    [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
                    [1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
                    [1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                ])

            def predict(self, X):
                return numpy.array([1, 0, 1, 0, 0, 1])

        class FakeX:
            shape = [6]

        model = FakeModel()
        pruned_model = TrainableRules._prune_reduced_error(
            model, FakeX, numpy.array([1, 1, 1, 0, 0, 1]))
        self.assertEqual(list(pruned_model.tree_.children_left), [
            1, 2, LEAF, LEAF, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED,
            UNDEFINED, UNDEFINED, LEAF
        ])
        self.assertEqual(list(pruned_model.tree_.children_right), [
            10, 3, LEAF, LEAF, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED,
            UNDEFINED, UNDEFINED, LEAF
        ])
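
For orientation, this is the algorithm the fakes above exercise: reduced-error pruning walks the tree bottom-up and collapses an internal node into a leaf whenever that does not lower accuracy on a held-out pruning set. A self-contained sketch on a toy node type, assuming each node's majority class is precomputed; it mirrors the algorithm only, not the flat sklearn arrays that TrainableRules._prune_reduced_error manipulates:

    class Node:
        def __init__(self, cls, feature=None, threshold=None,
                     left=None, right=None):
            # cls is the majority class here, used if this becomes a leaf
            self.cls, self.feature, self.threshold = cls, feature, threshold
            self.left, self.right = left, right

        def predict(self, x):
            if self.left is None:  # leaf
                return self.cls
            child = (self.left if x[self.feature] <= self.threshold
                     else self.right)
            return child.predict(x)

    def accuracy(root, X, y):
        return sum(root.predict(x) == cls for x, cls in zip(X, y)) / len(y)

    def reduced_error_prune(root, node, X, y):
        # post-order: prune the children first, then try collapsing this node
        if node.left is None:
            return
        reduced_error_prune(root, node.left, X, y)
        reduced_error_prune(root, node.right, X, y)
        before = accuracy(root, X, y)
        left, right = node.left, node.right
        node.left = node.right = None       # tentatively make it a leaf
        if accuracy(root, X, y) < before:   # the collapse hurt: restore
            node.left, node.right = left, right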
Example 12
    def train(cls, ptr: ReferencePointer, config: Dict[str, Any],
              data_request_stub: DataStub, **data) -> AnalyzerModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data_request_stub: connection to the Lookout data retrieval service, not used.
        :param data: contains "files" - the list of files in the pointed state.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        config = cls._load_train_config(config)
        cls.log.info("train %s %s %s", ptr.url, ptr.commit,
                     pformat(config, width=4096, compact=True))
        files_by_language = cls._files_by_language(data["files"])
        model = FormatModel().construct(cls, ptr)
        for language, files in files_by_language.items():
            language = language.lower()
            try:
                fe = FeatureExtractor(
                    language=language,
                    siblings_window=config["siblings_window"],
                    parents_depth=config["parents_depth"])
            except ImportError:
                cls.log.warning("skipped %d %s files - not supported",
                                len(files), language)
                continue
            else:
                cls.log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(f[1] for f in sorted(files.items()))
            lower_bound_instances = config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                cls.log.warning("skipped %d %s files: too few samples (%d/%d)",
                                len(files), language, X.shape[0],
                                lower_bound_instances)
                continue
            cls.log.debug("training the rules model")
            bscv = BayesSearchCV(
                TrainableRules(
                    prune_branches_algorithms=config["prune_branches_algorithms"],
                    prune_attributes=config["prune_attributes"],
                    top_down_greedy_budget=config["top_down_greedy_budget"],
                    uncertain_attributes=config["uncertain_attributes"],
                    prune_dataset_ratio=config["prune_dataset_ratio"],
                    n_estimators=config["n_estimators"],
                    random_state=config["random_state"]),
                {"base_model_name": Categorical([
                     "sklearn.ensemble.RandomForestClassifier",
                     "sklearn.tree.DecisionTreeClassifier"]),
                 "max_depth": Categorical([None, 5, 10]),
                 "max_features": Categorical([None, "auto"]),
                 "min_samples_split": Integer(2, 20),
                 "min_samples_leaf": Integer(1, 20)},
                n_jobs=-1,
                n_iter=config["n_iter"],
                random_state=config["random_state"])
            bscv.fit(X, y)
            cls.log.debug("score of the best estimator found: %.3f",
                          bscv.best_score_)
            cls.log.debug("params of the best estimator found: %s",
                          str(bscv.best_params_))
            cls.log.debug("training the model with complete data")
            trainable_rules = TrainableRules(
                prune_branches_algorithms=["reduced-error"],
                prune_attributes=True,
                random_state=42,
                uncertain_attributes=True,
                **bscv.best_params_)
            trainable_rules.fit(X, y)
            model[language] = trainable_rules.rules
        cls.log.info("trained %s", model)
        return model
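
The configuration keys this train dereferences, gathered into one illustrative dict; the values are placeholders, not the project's defaults:

    train_config = {
        "siblings_window": 5,          # FeatureExtractor
        "parents_depth": 2,            # FeatureExtractor
        "lower_bound_instances": 100,  # minimum samples to train at all
        # seed values for the TrainableRules passed to BayesSearchCV
        "prune_branches_algorithms": ["reduced-error"],
        "prune_attributes": True,
        "top_down_greedy_budget": (False, .5),
        "uncertain_attributes": True,
        "prune_dataset_ratio": .2,
        "n_estimators": 10,
        "random_state": 42,
        "n_iter": 32,                  # BayesSearchCV iterations
    }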
Example 13
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
              files: Iterator[File], **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data_service: connection to the Lookout data retrieval service.
        :param files: iterator of File records from the data service.
        :param data: any other data passed by the framework, unused here.
        :return: FormatModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)["train"]
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = FormatModel().generate(cls, ptr)
        for language, files in files_by_language(files).items():
            try:
                lang_config = train_config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            _log.info("effective train config for %s:\n%s", language,
                      pformat(lang_config, width=120, compact=True))
            random_state = lang_config["random_state"]
            files = filter_files(
                files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
                random_state, _log)
            submit_event("%s.train.%s.files" % (cls.name, language), len(files))
            if len(files) == 0:
                _log.info("zero files after filtering, language %s is skipped.", language)
                continue
            try:
                fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files), language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            train_files, test_files = FormatAnalyzer.split_train_test(
                files, lang_config["test_dataset_ratio"], random_state=random_state)
            # ensure that the features are reproducible
            train_files = sorted(train_files, key=lambda x: x.path)
            test_files = sorted(test_files, key=lambda x: x.path)
            X_train, y_train, _ = fe.extract_features(train_files)
            X_train, selected_features = fe.select_features(X_train, y_train)
            if test_files:
                X_test, y_test, _ = fe.extract_features(test_files)
            if lang_config["test_dataset_ratio"]:
                _log.debug("Real test ratio is %.3f",
                           X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                           if test_files else 0)
            lang_config["feature_extractor"]["selected_features"] = selected_features
            lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X_train.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X_train.shape[0], lower_bound_instances)
                continue
            _log.info("extracted %d samples to train, searching for the best hyperparameters",
                      X_train.shape[0])
            optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
            best_score, best_params = optimizer.optimize(X_train, y_train)
            if _log.isEnabledFor(logging.DEBUG):
                _log.debug("score of the best estimator found: %.6f", best_score)
                _log.debug("params of the best estimator found: %s", str(best_params))
                _log.debug("training the model with complete data")
            else:
                _log.info("finished hyperopt at %.6f, training the full model", -best_score)
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             random_state=random_state,
                                             origin_config=lang_config)
            trainable_rules.fit(X_train, y_train)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"],
                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
            trainable_rules.prune_categorical_attributes(fe)
            _log.info("obtained %d rules, generating the classification report",
                      len(trainable_rules.rules))
            trainable_rules.rules.generate_classification_report(
                X_train, y_train, "train", fe.composite_class_representations)
            if test_files:
                trainable_rules.rules.generate_classification_report(
                    X_test, y_test, "test", fe.composite_class_representations)
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipped.", language)
        _log.info("trained %s", model)
        return model
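
Unlike the previous example's flat dict, the configuration here is nested per language. An illustrative shape covering every key this train dereferences; the language, the values, and the feature-extractor contents are placeholders:

    train_config = {
        "javascript": {                     # hypothetical language section
            "random_state": 42,
            "line_length_limit": 500,
            "overall_size_limit": 5 << 20,
            "test_dataset_ratio": .2,
            "lower_bound_instances": 100,
            "feature_extractor": {
                # passed to FeatureExtractor(); train() later stores
                # "selected_features" and "label_composites" here
            },
            "optimizer": {},                # forwarded to Optimizer()
            "trainable_rules": {            # updated with the best params
                "base_model_name": "sklearn.tree.DecisionTreeClassifier",
            },
        },
    }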
Example 14
 def test_predict_unfitted(self):
     rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                            prune_branches_algorithms=[],
                            prune_attributes=False)
     with self.assertRaises(NotFittedError):
         rules.predict(self.test_x)
Example 15
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data_service: connection to the Lookout data retrieval service.
        :param data: contains "files" - the list of files in the pointed state.
        :return: FormatModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        _log.info("train %s %s %s", ptr.url, ptr.commit,
                  pformat(config, width=4096, compact=True))
        model = FormatModel().construct(cls, ptr)
        config = cls._load_train_config(config)
        for language, files in files_by_language(data["files"]).items():
            try:
                lang_config = config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            files = filter_files(files, lang_config["line_length_limit"], _log)
            submit_event("%s.train.%s.files" % (cls.name, language),
                         len(files))
            if len(files) == 0:
                _log.info(
                    "zero files after filtering, language %s is skipped.",
                    language)
                continue
            try:
                fe = FeatureExtractor(language=language,
                                      **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files),
                             language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
            X, selected_features = fe.select_features(X, y)
            lang_config["feature_extractor"][
                "selected_features"] = selected_features
            lang_config["feature_extractor"][
                "label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X.shape[0],
                             lower_bound_instances)
                continue
            _log.debug("training the rules model")
            optimizer = Optimizer(
                n_jobs=lang_config["n_jobs"],
                n_iter=lang_config["n_iter"],
                cv=lang_config["cv"],
                random_state=lang_config["trainable_rules"]["random_state"])
            best_score, best_params = optimizer.optimize(X, y)
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s",
                       str(best_params))
            _log.debug("training the model with complete data")
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             origin_config=lang_config)
            trainable_rules.fit(X, y)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"], "\n\t".join(
                    "%-55s %.5E" % (fe.feature_names[i], importances[i])
                    for i in numpy.argsort(-importances)[:25]
                    if importances[i] > 1e-5))
            submit_event("%s.train.%s.rules" % (cls.name, language),
                         len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipping.", language)
        _log.info("trained %s", model)
        return model