Example #4
def analyze_files(
    analyzer_type: Type[FormatAnalyzer],
    config: dict,
    model_path: str,
    language: str,
    bblfsh_addr: str,
    input_pattern: str,
    log: logging.Logger,
) -> List[Comment]:
    """Run the model, record the fixes for each file and return them."""
    class FakePointer:
        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = bblfsh.BblfshClient(bblfsh_addr)
    files = parse_files(
        filepaths=glob.glob(input_pattern, recursive=True),
        line_length_limit=rules.origin_config["line_length_limit"],
        overall_size_limit=rules.origin_config["overall_size_limit"],
        client=client,
        language=language,
        log=log)
    log.info("Model parameters: %s" % rules.origin_config)
    log.info("Rules stats: %s" % rules)
    log.info("Number of files: %s" % (len(files)))
    return analyzer_type(model, input_pattern,
                         config).analyze(FakePointer(),
                                         None,
                                         data_service=FakeDataService(
                                             client, files, []))
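A minimal sketch of how this helper might be driven from a script; the model path, Babelfish address and glob pattern below are placeholders rather than values taken from the project.

import logging

# Hypothetical invocation of analyze_files(); all paths and addresses are assumptions.
log = logging.getLogger("analyze_files_demo")
comments = analyze_files(
    analyzer_type=FormatAnalyzer,       # the analyzer class used above
    config={},                          # empty config, rely on the defaults
    model_path="model.asdf",            # assumed path to a trained FormatModel
    language="javascript",
    bblfsh_addr="0.0.0.0:9432",         # assumed Babelfish endpoint
    input_pattern="project/**/*.js",    # assumed glob; the function expands it recursively
    log=log)
print("%d comments produced" % len(comments))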
class FormatModelTests(unittest.TestCase):
    def setUp(self):
        (self.train_x, self.test_x, self.train_y,
         self.test_y), _, _ = load_abalone_data()
        self.config = {
            "trainable_rules": {
                "base_model_name": "sklearn.tree.DecisionTreeClassifier",
                "prune_branches_algorithms": [],
                "prune_attributes": False,
                "min_samples_leaf": 26,
                "random_state": 1989,
            },
        }
        trainer = TrainableRules(**self.config["trainable_rules"],
                                 origin_config=self.config)
        trainer.fit(self.test_x, self.test_y)
        self.rules = trainer.rules
        self.fm = FormatModel().load(
            os.path.join(os.path.dirname(__file__), "model_jquery.asdf"))
        self.maxDiff = None

    def test_save_and_load(self):
        fm1 = FormatModel()
        fm1["js"] = self.rules
        fm1["js2"] = self.rules
        fm1["js3"] = self.rules
        with tempfile.NamedTemporaryFile(prefix="lookout-") as f:
            fm1.save(f.name)
            fm2 = FormatModel().load(f.name)
            compare_models(self, fm1, fm2)

    def test_dump(self):
        fm = FormatModel()
        self.assertEqual(
            fm.dump(),
            "<unknown name>/[1, 0, 0] <unknown url> <unknown commit>")

        DUMP = """style.format.analyzer.FormatAnalyzer/[3] https://github.com/jquery/jquery c2026b117d1ca5b2e42a52c7e2a8ae8988cf0d4b

# javascript
1269 rules, avg.len. 19.1"""  # noqa
        self.assertEqual(self.fm.dump(), DUMP)

    def test_len(self):
        fm = FormatModel()
        self.assertEqual(len(fm), 0)
        fm["js"] = self.rules
        self.assertEqual(len(fm), 1)
        fm["js2"] = self.rules
        self.assertEqual(len(fm), 2)

    def test_iter(self):
        langs = set(self.fm.languages)
        for item in self.fm:
            self.assertIn(item, langs)
            langs.remove(item)
        self.assertEqual(len(langs), 0)
Example #7
def return_features() -> Response:
    """Featurize the given code."""
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    file = UnicodeFile(content=code, uast=res.uast, language="javascript", path="path")
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file], stub=client._stub,
        vnode_parents=vnode_parents, node_parents=node_parents, rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices), vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config)})
Example #8
    def test_train_cutoff_labels(self):
        self.data_service = FakeDataService(
            self.bblfsh_client, files=self.base_files.values(), changes=[])
        model1 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
        self.assertIsInstance(model1, FormatModel)
        self.assertIn("javascript", model1, str(model1))
        model2 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
        self.assertEqual(model1["javascript"].rules, model2["javascript"].rules)
        self.assertGreater(len(model1["javascript"]), 5)
        # Check that the model can be saved without problems and then loaded back
        with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as f:
            model2.save(f)
            f.seek(0)
            model3 = FormatModel().load(f)
            compare_models(self, model2, model3)
Example #9
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Train a model given the files available or load the existing model.

        If config["model"] is set to a path in the file system, the model is loaded from
        that path; otherwise a model is trained in the regular way.

        :param ptr: Git repository state pointer.
        :param config: Configuration dict.
        :param data: Contains "files" - the list of files in the pointed state.
        :param data_service: Connection to the Lookout data retrieval service.
        :return: FormatModel containing the learned rules, per language.
        """
        return FormatModel().load(config["model"]) if "model" in config else \
            super().train(ptr, config, data_service)
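The docstring's shortcut can be illustrated with a small sketch: when the configuration carries a "model" key, train() simply loads that file instead of training; the path below is hypothetical and ptr/data_service come from the Lookout runtime.

# Reuse a previously trained model instead of retraining (the path is hypothetical).
config_load = {"model": "/tmp/format_model.asdf"}
model = FormatAnalyzer.train(ptr, config_load, data_service)   # just loads the file

# Without the "model" key the regular training path of the parent class is taken.
model = FormatAnalyzer.train(ptr, {}, data_service)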
Example #10
def analyze_files(analyzer_type: Type[FormatAnalyzer], config: dict, model_path: str,
                  language: str, bblfsh: str, input_pattern: str, log: logging.Logger,
                  ) -> List[FileFix]:
    """Run the model, record the fixes for each file and return them."""
    class FakePointer:
        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = BblfshClient(bblfsh)
    files = prepare_files(glob.glob(input_pattern, recursive=True), client, language)
    log.info("Model parameters: %s" % rules.origin_config)
    log.info("Rules stats: %s" % rules)
    log.info("Number of files: %s" % (len(files)))
    return analyzer_type(model, input_pattern, config).analyze(
        FakePointer(), None, data_service=FakeDataService(client, files, []))
Example #11
    def train(cls, ptr: ReferencePointer, config: Dict[str, Any],
              data_request_stub: DataStub, **data) -> AnalyzerModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_request_stub: connection to the Lookout data retrieval service, not used.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        config = cls._load_train_config(config)
        cls.log.info("train %s %s %s", ptr.url, ptr.commit,
                     pformat(config, width=4096, compact=True))
        files_by_language = cls._files_by_language(data["files"])
        model = FormatModel().construct(cls, ptr)
        for language, files in files_by_language.items():
            language = language.lower()
            try:
                fe = FeatureExtractor(
                    language=language,
                    siblings_window=config["siblings_window"],
                    parents_depth=config["parents_depth"])
            except ImportError:
                cls.log.warning("skipped %d %s files - not supported",
                                len(files), language)
                continue
            else:
                cls.log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(f[1] for f in sorted(files.items()))
            lower_bound_instances = config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                cls.log.warning("skipped %d %s files: too few samples (%d/%d)",
                                len(files), language, X.shape[0],
                                lower_bound_instances)
                continue
            cls.log.debug("training the rules model")
            bscv = BayesSearchCV(
                TrainableRules(
                    prune_branches_algorithms=config["prune_branches_algorithms"],
                    prune_attributes=config["prune_attributes"],
                    top_down_greedy_budget=config["top_down_greedy_budget"],
                    uncertain_attributes=config["uncertain_attributes"],
                    prune_dataset_ratio=config["prune_dataset_ratio"],
                    n_estimators=config["n_estimators"],
                    random_state=config["random_state"]),
                {"base_model_name": Categorical(["sklearn.ensemble.RandomForestClassifier",
                                                 "sklearn.tree.DecisionTreeClassifier"]),
                 "max_depth": Categorical([None, 5, 10]),
                 "max_features": Categorical([None, "auto"]),
                 "min_samples_split": Integer(2, 20),
                 "min_samples_leaf": Integer(1, 20)},
                n_jobs=-1,
                n_iter=config["n_iter"],
                random_state=config["random_state"])
            bscv.fit(X, y)
            cls.log.debug("score of the best estimator found: %.3f",
                          bscv.best_score_)
            cls.log.debug("params of the best estimator found: %s",
                          str(bscv.best_params_))
            cls.log.debug("training the model with complete data")
            trainable_rules = TrainableRules(
                prune_branches_algorithms=["reduced-error"],
                prune_attributes=True,
                random_state=42,
                uncertain_attributes=True,
                **bscv.best_params_)
            trainable_rules.fit(X, y)
            model[language] = trainable_rules.rules
        cls.log.info("trained %s", model)
        return model
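The FormatModel returned by this train() behaves like a mapping from language to rules and can be serialized to ASDF, as the unit tests in the other examples show; the file name here is only illustrative.

# model is the FormatModel returned by FormatAnalyzer.train(...) above.
for language in model:                  # iterating yields the trained languages
    print(language, model[language])    # one Rules object per language
print(model.dump())                     # human-readable summary of the model
model.save("format_model.asdf")         # illustrative file name
reloaded = FormatModel().load("format_model.asdf")
assert set(reloaded.languages) == set(model.languages)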
Example #12
def dump_rule_entry(model, hash):
    """Command-line entry for "tool rule"."""
    model = FormatModel().load(model)
    dump_rule(model, hash)
Example #13
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
              files: Iterator[File], **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :param files: iterator of File records from the data service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)["train"]
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = FormatModel().generate(cls, ptr)
        for language, files in files_by_language(files).items():
            try:
                lang_config = train_config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            _log.info("effective train config for %s:\n%s", language,
                      pformat(lang_config, width=120, compact=True))
            random_state = lang_config["random_state"]
            files = filter_files(
                files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
                random_state, _log)
            submit_event("%s.train.%s.files" % (cls.name, language), len(files))
            if len(files) == 0:
                _log.info("zero files after filtering, language %s is skipped.", language)
                continue
            try:
                fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files), language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            train_files, test_files = FormatAnalyzer.split_train_test(
                files, lang_config["test_dataset_ratio"], random_state=random_state)
            # ensure that the features are reproducible
            train_files = sorted(train_files, key=lambda x: x.path)
            test_files = sorted(test_files, key=lambda x: x.path)
            X_train, y_train, _ = fe.extract_features(train_files)
            X_train, selected_features = fe.select_features(X_train, y_train)
            if test_files:
                X_test, y_test, _ = fe.extract_features(test_files)
            if lang_config["test_dataset_ratio"]:
                _log.debug("Real test ratio is %.3f",
                           X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                           if test_files else 0)
            lang_config["feature_extractor"]["selected_features"] = selected_features
            lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X_train.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X_train.shape[0], lower_bound_instances)
                continue
            _log.info("extracted %d samples to train, searching for the best hyperparameters",
                      X_train.shape[0])
            optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
            best_score, best_params = optimizer.optimize(X_train, y_train)
            if _log.isEnabledFor(logging.DEBUG):
                _log.debug("score of the best estimator found: %.6f", best_score)
                _log.debug("params of the best estimator found: %s", str(best_params))
                _log.debug("training the model with complete data")
            else:
                _log.info("finished hyperopt at %.6f, training the full model", -best_score)
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             random_state=random_state,
                                             origin_config=lang_config)
            trainable_rules.fit(X_train, y_train)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"],
                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
            trainable_rules.prune_categorical_attributes(fe)
            _log.info("obtained %d rules, generating the classification report",
                      len(trainable_rules.rules))
            trainable_rules.rules.generate_classification_report(
                X_train, y_train, "train", fe.composite_class_representations)
            if test_files:
                trainable_rules.rules.generate_classification_report(
                    X_test, y_test, "test", fe.composite_class_representations)
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipped.", language)
        _log.info("trained %s", model)
        return model
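A model produced by this pipeline can later be inspected per language, for example to recover the feature-extractor settings that were stored in origin_config during training; the file name below is a placeholder.

# Inspect a trained model (the path is a placeholder).
model = FormatModel().load("model_jquery.asdf")
for language in model.languages:
    rules = model[language]
    print(language, len(rules.rules), "rules")
    fe_config = rules.origin_config["feature_extractor"]    # stored by train() above
    fe = FeatureExtractor(language=language, **fe_config)   # rebuild the same extractor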
Example #14
class FormatModelTests(unittest.TestCase):
    def setUp(self):
        (self.train_x, self.test_x, self.train_y,
         self.test_y), _, _ = load_abalone_data()
        self.config = {
            "trainable_rules": {
                "base_model_name": "sklearn.tree.DecisionTreeClassifier",
                "prune_branches_algorithms": [],
                "prune_attributes": False,
                "min_samples_leaf": 26,
                "random_state": 1989,
            },
        }
        trainer = TrainableRules(**self.config["trainable_rules"],
                                 origin_config=self.config)
        trainer.fit(self.test_x, self.test_y)
        self.rules = trainer.rules
        self.fm = FormatModel().load(
            os.path.join(os.path.dirname(__file__), "model_jquery.asdf"))
        self.maxDiff = None

    def test_save_and_load(self):
        fm1 = FormatModel()
        fm1["js"] = self.rules
        fm1["js2"] = self.rules
        fm1["js3"] = self.rules
        with tempfile.NamedTemporaryFile(prefix="lookout-") as f:
            fm1.save(f.name)
            fm2 = FormatModel().load(f.name)
            compare_models(self, fm1, fm2)

    def test_dump(self):
        fm = FormatModel()
        self.assertEqual(fm.dump(),
                         "generic/[1, 0, 0] <unknown url> <unknown commit>")

        DUMP = """code-format/[1] file:///var/folders/kw/93jybvs16_954hytgsq6ld7r0000gn/T/top-repos-quality-repos-jigt1n8g/jquery dae5f3ce3d2df27873d01f0d9682f6a91ad66b87

# javascript
1159 rules, avg.len. 12.7
## train
PPCR: 0.993413
### report
macro
{'f1-score': 0.7270769669476458,
 'precision': 0.8106858458605273,
 'recall': 0.7061608014058862,
 'support': 163931}
micro
{'f1-score': 0.9704570825530253,
 'precision': 0.9704570825530253,
 'recall': 0.9704570825530253,
 'support': 163931}
weighted
{'f1-score': 0.9682573644719648,
 'precision': 0.9688067324990776,
 'recall': 0.9704570825530253,
 'support': 163931}
### report_full
macro
{'f1-score': 0.7207757082876136,
 'precision': 0.8106858458605273,
 'recall': 0.6958571203075536,
 'support': 165018}
micro
{'f1-score': 0.9672502424387974,
 'precision': 0.9704570825530253,
 'recall': 0.9640645262941012,
 'support': 165018}
weighted
{'f1-score': 0.964254372281313,
 'precision': 0.967999533892513,
 'recall': 0.9640645262941012,
 'support': 165018}
## test
PPCR: 0.992673
### report
macro
{'f1-score': 0.670106403195044,
 'precision': 0.7675483060510728,
 'recall': 0.667193540547618,
 'support': 39563}
micro
{'f1-score': 0.9646134014104087,
 'precision': 0.9646134014104087,
 'recall': 0.9646134014104087,
 'support': 39563}
weighted
{'f1-score': 0.9623528642977015,
 'precision': 0.964064574202937,
 'recall': 0.9646134014104087,
 'support': 39563}
### report_full
macro
{'f1-score': 0.6645299678762785,
 'precision': 0.7675483060510728,
 'recall': 0.6569592424077594,
 'support': 39855}
micro
{'f1-score': 0.961066760683976,
 'precision': 0.9646134014104087,
 'recall': 0.9575461046292811,
 'support': 39855}
weighted
{'f1-score': 0.9579239894541836,
 'precision': 0.9627543953777487,
 'recall': 0.9575461046292811,
 'support': 39855}"""  # noqa
        self.assertEqual(self.fm.dump(), DUMP)

    def test_len(self):
        fm = FormatModel()
        self.assertEqual(len(fm), 0)
        fm["js"] = self.rules
        self.assertEqual(len(fm), 1)
        fm["js2"] = self.rules
        self.assertEqual(len(fm), 2)

    def test_iter(self):
        langs = set(self.fm.languages)
        for item in self.fm:
            self.assertIn(item, langs)
            langs.remove(item)
        self.assertEqual(len(langs), 0)
Example #15
    def test_dump(self):
        fm = FormatModel()
        self.assertEqual(fm.dump(),
                         "generic/[1, 0, 0] <unknown url> <unknown commit>")

        DUMP = """code-format/[1] file:///var/folders/kw/93jybvs16_954hytgsq6ld7r0000gn/T/top-repos-quality-repos-jigt1n8g/jquery dae5f3ce3d2df27873d01f0d9682f6a91ad66b87

# javascript
1159 rules, avg.len. 12.7
## train
PPCR: 0.993413
### report
macro
{'f1-score': 0.7270769669476458,
 'precision': 0.8106858458605273,
 'recall': 0.7061608014058862,
 'support': 163931}
micro
{'f1-score': 0.9704570825530253,
 'precision': 0.9704570825530253,
 'recall': 0.9704570825530253,
 'support': 163931}
weighted
{'f1-score': 0.9682573644719648,
 'precision': 0.9688067324990776,
 'recall': 0.9704570825530253,
 'support': 163931}
### report_full
macro
{'f1-score': 0.7207757082876136,
 'precision': 0.8106858458605273,
 'recall': 0.6958571203075536,
 'support': 165018}
micro
{'f1-score': 0.9672502424387974,
 'precision': 0.9704570825530253,
 'recall': 0.9640645262941012,
 'support': 165018}
weighted
{'f1-score': 0.964254372281313,
 'precision': 0.967999533892513,
 'recall': 0.9640645262941012,
 'support': 165018}
## test
PPCR: 0.992673
### report
macro
{'f1-score': 0.670106403195044,
 'precision': 0.7675483060510728,
 'recall': 0.667193540547618,
 'support': 39563}
micro
{'f1-score': 0.9646134014104087,
 'precision': 0.9646134014104087,
 'recall': 0.9646134014104087,
 'support': 39563}
weighted
{'f1-score': 0.9623528642977015,
 'precision': 0.964064574202937,
 'recall': 0.9646134014104087,
 'support': 39563}
### report_full
macro
{'f1-score': 0.6645299678762785,
 'precision': 0.7675483060510728,
 'recall': 0.6569592424077594,
 'support': 39855}
micro
{'f1-score': 0.961066760683976,
 'precision': 0.9646134014104087,
 'recall': 0.9575461046292811,
 'support': 39855}
weighted
{'f1-score': 0.9579239894541836,
 'precision': 0.9627543953777487,
 'recall': 0.9575461046292811,
 'support': 39855}"""  # noqa
        self.assertEqual(self.fm.dump(), DUMP)
Example #16
def quality_report_noisy(bblfsh: str,
                         language: str,
                         confidence_threshold: float,
                         support_threshold: int,
                         precision_threshold: float,
                         dir_output: str,
                         repos: Optional[str] = None) -> None:
    """
    Generate a quality report on the artificial noisy dataset including a precision-recall curve.

    :param bblfsh: Babelfish server address; the server should already be running.
    :param language: Language to consider, others will be discarded.
    :param confidence_threshold: Confidence threshold to filter relevant rules.
    :param support_threshold: Support threshold to filter relevant rules.
    :param precision_threshold: Precision threshold tolerated by the model. \
           Limit drawn as a red horizontal line on the figure.
    :param dir_output: Path to the output directory where to store the quality report in Markdown \
           and the precision-recall curve in png format.
    :param repos: Input list of urls or paths to the repositories to analyze. \
           Should be strings separated by newlines.
    """
    log = logging.getLogger("quality_report_noisy")
    repo_names = []
    last_accepted_rule = {}
    precisions, recalls, accepted_rules = (defaultdict(list) for _ in range(3))
    n_mistakes, prec_max_rec, confidence_threshold_exp, max_rec, \
        n_rules, n_rules_filtered = ({} for _ in range(6))
    if repos is None:
        repos = REPOSITORIES
    try:
        client = BblfshClient(bblfsh)
        log.info("Repositories: %s", repos)
        with tempfile.TemporaryDirectory() as tmpdirname:
            for repo_path in repos.splitlines():
                repo = repo_path.split("/")[-1]
                if repo_path.startswith("https://github.com"):
                    log.info("Fetching %s", repo_path)
                    git_dir = os.path.join(tmpdirname, repo)
                    git_dir_noisy = os.path.join(tmpdirname, repo + "_noisy")
                    cmd1 = "git clone --single-branch --branch master %s %s" % (
                        repo_path, git_dir)
                    cmd2 = "git clone --single-branch --branch style-noise-1-per-file %s %s" \
                        % (repo_path, git_dir_noisy)
                    try:
                        for cmd in (cmd1, cmd2):
                            log.debug("Running: %s", cmd)
                            subprocess.check_call(cmd.split())
                    except subprocess.CalledProcessError as e:
                        raise ConnectionError("Unable to fetch repository %s" %
                                              repo_path) from e
                    input_pattern = os.path.join(git_dir, "**", "*.js")
                    input_pattern_noisy = os.path.join(git_dir_noisy, "**",
                                                       "*.js")
                    model_path = os.path.join(git_dir_noisy,
                                              "style-analyzer-model",
                                              "model.asdf")
                else:
                    input_pattern = os.path.join(repo_path, "**", "*.js")
                    input_pattern_noisy = os.path.join(repo_path + "_noisy",
                                                       "**", "*.js")
                    model_path = os.path.join(repo_path, "model.asdf")
                true_content = get_content_from_repo(input_pattern)
                noisy_content = get_content_from_repo(input_pattern_noisy)
                true_files, noisy_files, start_changes = get_difflib_changes(
                    true_content, noisy_content)
                if not true_files:
                    raise ValueError(
                        "Noisy repo should count at least one artificial mistake"
                    )
                log.info(
                    "Number of files modified by adding style noise: %d / %d",
                    len(true_files), len(true_content))
                del true_content, noisy_content

                analyzer = FormatModel().load(model_path)
                rules = analyzer[language]
                feature_extractor = FeatureExtractor(
                    language=language,
                    **rules.origin_config["feature_extractor"])
                vnodes_y_true = files2vnodes(true_files, feature_extractor,
                                             client)
                mispreds_noise = files2mispreds(noisy_files, feature_extractor,
                                                rules, client, log)
                diff_mispreds = get_diff_mispreds(mispreds_noise,
                                                  start_changes)
                changes_count = len(start_changes)
                n_rules[repo] = len(rules.rules)
                rules_id = [(i, r.stats.conf)
                            for i, r in enumerate(rules.rules)
                            if r.stats.conf > confidence_threshold
                            and r.stats.support > support_threshold]
                rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
                for i in range(len(rules_id)):
                    filtered_mispreds = {
                        k: m
                        for k, m in diff_mispreds.items()
                        if any(r[0] == m.rule for r in rules_id[:i + 1])
                    }
                    style_fixes = get_style_fixes(filtered_mispreds,
                                                  vnodes_y_true, true_files,
                                                  noisy_files,
                                                  feature_extractor)
                    precision, recall, f1_score = compute_metrics(
                        changes_count=changes_count,
                        predictions_count=len(filtered_mispreds),
                        true_positive=len(style_fixes))
                    precisions[repo].append(round(precision, 3))
                    recalls[repo].append(round(recall, 3))
                log.info([m.node.path for m in style_fixes])
                print("recall x:", recalls[repo])
                print("precision y:", precisions[repo])

                # compute some stats and quality metrics for the model's evaluation
                repo_names.append(repo)
                n_mistakes[repo] = len(true_files)
                prec_max_rec[repo] = precisions[repo][-1]
                max_rec[repo] = max(recalls[repo])
                n_rules_filtered[repo] = len(rules_id)
                # compute the confidence and recall limit for the given precision threshold
                for i, (prec,
                        rec) in enumerate(zip(precisions[repo],
                                              recalls[repo])):
                    if prec >= precision_threshold:
                        accepted_rules[repo].append((i, rules_id[i][1], rec))
                last_accepted_rule[repo] = min(accepted_rules[repo],
                                               key=itemgetter(1))
                confidence_threshold_exp[repo] = (last_accepted_rule[repo][0],
                                                  last_accepted_rule[repo][1])
    finally:
        client._channel.close()

    # compute the index of the last accepted rule according to the maximum confidence threshold
    limit_conf_id = {}
    max_confidence_threshold_exp = max(confidence_threshold_exp.values(),
                                       key=itemgetter(1))
    for repo, rules in accepted_rules.items():
        for rule in rules:
            if rule[1] < max_confidence_threshold_exp[1]:
                break
            limit_conf_id[repo] = rule[0]

    # compile the precision-recall curves
    path_to_figure = os.path.join(dir_output, "pr_curves.png")
    plot_curve(repo_names, recalls, precisions, precision_threshold,
               limit_conf_id, path_to_figure)

    # compile the markdown template for the report through jinja2
    loader = jinja2.FileSystemLoader(
        (os.path.join(os.path.dirname(__file__), "..", "templates"), ),
        followlinks=True)
    env = jinja2.Environment(
        trim_blocks=True,
        lstrip_blocks=True,
        keep_trailing_newline=True,
    )
    env.globals.update(range=range)
    template = loader.load(env, "noisy_quality_report.md.jinja2")
    report = template.render(repos=repo_names,
                             n_mistakes=n_mistakes,
                             prec_max_rec=prec_max_rec,
                             confidence_threshold_exp=round(
                                 max_confidence_threshold_exp[1], 2),
                             max_rec=max_rec,
                             confidence_threshold=confidence_threshold,
                             support_threshold=support_threshold,
                             n_rules=n_rules,
                             n_rules_filtered=n_rules_filtered,
                             path_to_figure=path_to_figure)

    # write the quality report
    repo_pathrt = os.path.join(dir_output, "report_noise.md")
    with open(repo_pathrt, "w", encoding="utf-8") as f:
        f.write(report)
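A possible invocation of the report generator; the Babelfish address, thresholds and output directory are illustrative, and passing repos=None falls back to the built-in REPOSITORIES list.

quality_report_noisy(
    bblfsh="0.0.0.0:9432",           # assumed Babelfish server address
    language="javascript",
    confidence_threshold=0.95,       # illustrative thresholds
    support_threshold=80,
    precision_threshold=0.95,
    dir_output="/tmp/noise-report",  # placeholder output directory
    repos=None)                      # None -> the default REPOSITORIES constant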
Example #17
def visualize(input_filename: str, bblfsh: str, language: str,
              model_path: str) -> None:
    """Visualize the errors made on a single file."""
    model = FormatModel().load(model_path)
    rules = model[language]
    print("Model parameters: %s" % rules.origin)
    print("Stats about rules: %s" % rules)

    client = BblfshClient(bblfsh)
    file = prepare_file(input_filename, client, language)

    fe = FeatureExtractor(language=language,
                          **rules.origin_config["feature_extractor"])
    X, y, vnodes_y, vnodes = fe.extract_features([file])

    y_pred, _, _ = rules.predict(X, vnodes_y, vnodes, fe)

    # collect lines with mispredictions - could be removed
    mispred_lines = set()
    lines = set()
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        lines.add((node.path, node.start.line))
        if gt != pred:
            mispred_lines.add((node.path, node.start.line))
    print("Number of lines with mispredictions: %s out of %s mispredicted" %
          (len(mispred_lines), len(lines)))

    # collect mispredictions and all other predictions for each line with mistake
    mispred = defaultdict(list)
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        if (node.path, node.start.line) in mispred_lines:
            mispred[(node.path,
                     node.start.line)].append(Misprediction(gt, pred, node))

    # sort each line
    for value in mispred.values():
        value.sort(key=lambda k: k.node.start.offset)

    # final mispredictions
    final_mispred = []
    for line in sorted(mispred):
        gt = [m.y for m in mispred[line]]
        pred = [m.pred for m in mispred[line]]
        s = SequenceMatcher(None, gt, pred)
        blocks = s.get_matching_blocks()

        if blocks[0].a != 0:
            # mispredictions before the first matching block
            final_mispred.extend(mispred[line][:blocks[0].a])
        for i in range(len(blocks) - 1):
            final_mispred.extend(mispred[line][blocks[i].a:blocks[i + 1].a])
        if blocks[-1].a != len(mispred[line]):
            # mispredictions after the last matching block
            final_mispred.extend(mispred[line][blocks[-1].a:])

    mispred = sorted([misp for misp in final_mispred if misp.y != misp.pred],
                     key=lambda r: r.node.start.offset)

    new_content = ENDC
    old_content = file.content.decode("utf-8")
    for i in range(len(mispred)):
        wrong = mispred[i]
        start = wrong.node.start.offset
        end = wrong.node.end.offset
        if end == start:
            end = start + len(wrong.node.value)

        if i == 0 and start != 0:
            new_content += old_content[:start]

        new_content += GREEN + CLASSES[wrong.y] + RED + CLASSES[
            wrong.pred] + ENDC

        if i == len(mispred) - 1:
            if end != len(old_content):
                new_content += old_content[end:]
        else:
            new_content += old_content[end:mispred[i + 1].node.start.offset]
    print("Visualization:\n" + new_content)
Example #18
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        _log.info("train %s %s %s", ptr.url, ptr.commit,
                  pformat(config, width=4096, compact=True))
        model = FormatModel().construct(cls, ptr)
        config = cls._load_train_config(config)
        for language, files in files_by_language(data["files"]).items():
            try:
                lang_config = config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            files = filter_files(files, lang_config["line_length_limit"], _log)
            submit_event("%s.train.%s.files" % (cls.name, language),
                         len(files))
            if len(files) == 0:
                _log.info(
                    "zero files after filtering, language %s is skipped.",
                    language)
                continue
            try:
                fe = FeatureExtractor(language=language,
                                      **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files),
                             language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
            X, selected_features = fe.select_features(X, y)
            lang_config["feature_extractor"][
                "selected_features"] = selected_features
            lang_config["feature_extractor"][
                "label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X.shape[0],
                             lower_bound_instances)
                continue
            _log.debug("training the rules model")
            optimizer = Optimizer(
                n_jobs=lang_config["n_jobs"],
                n_iter=lang_config["n_iter"],
                cv=lang_config["cv"],
                random_state=lang_config["trainable_rules"]["random_state"])
            best_score, best_params = optimizer.optimize(X, y)
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s",
                       str(best_params))
            _log.debug("training the model with complete data")
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             origin_config=lang_config)
            trainable_rules.fit(X, y)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"], "\n\t".join(
                    "%-55s %.5E" % (fe.feature_names[i], importances[i])
                    for i in numpy.argsort(-importances)[:25]
                    if importances[i] > 1e-5))
            submit_event("%s.train.%s.rules" % (cls.name, language),
                         len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipping.", language)
        _log.info("trained %s", model)
        return model