Example 1
 def test_file_filtering(self):
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     config = get_config()
     config["train"]["language_defaults"]["line_length_limit"] = 0
     model_trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
     self.assertEqual(len(model_trained._rules_by_lang), 0)
     config["train"]["language_defaults"]["line_length_limit"] = 500
     model_trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
     self.assertGreater(len(model_trained._rules_by_lang), 0)
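
All of these examples build a FakeDataService test double around a Babelfish client plus in-memory files and changes. The real helper lives in the project's test utilities; the sketch below is only an illustration of what such a stub could look like, and its method names and signatures are assumptions rather than the project's actual API.

# Hypothetical sketch of a FakeDataService-style test double (not the real helper).
class FakeDataService:
    def __init__(self, bblfsh_client, files, changes):
        self.bblfsh_client = bblfsh_client
        self.files = list(files) if files is not None else []
        self.changes = list(changes) if changes is not None else []

    def get_bblfsh(self):
        # Analyzers can ask the data service for the Babelfish client/channel.
        return self.bblfsh_client

    def get_files(self, request=None):
        # Return the in-memory files instead of querying a lookout server.
        return self.files

    def get_changes(self, request=None):
        # Return the in-memory changes instead of querying a lookout server.
        return self.changes
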
Example 2
 def test_train(self):
     dataservice = FakeDataService(self.bblfsh_client,
                                   files=self.base_files,
                                   changes=[])
     model = IdTyposAnalyzer.train(ptr=self.ptr,
                                   config={},
                                   data_service=dataservice)
     self.assertSetEqual(
         model.identifiers,
         {"name", "print_type", "get_length", "customidentifiertostore"})
Example 3
 def test_train_check(self):
     common = self.base_files.keys() & self.head_files.keys()
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=self.base_files[k], head=self.head_files[k])
                  for k in common])
     model = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     # Freshly trained on the same data: no retraining should be required
     required = FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service)
     self.assertFalse(required)
     # Removing the UASTs from the base files should make retraining required
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=remove_uast(self.base_files[k]), head=self.head_files[k])
                  for k in common])
     required = FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service)
     self.assertTrue(required)
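
Examples 3 and 4 strip the UAST from the base files with a remove_uast helper to simulate inputs whose parse trees are missing. A plausible sketch is shown below, assuming File is the lookout protobuf message carrying content, path, language and uast fields; the import path and the field name are assumptions.

from lookout.core.api.service_data_pb2 import File  # import path is an assumption

# Hypothetical sketch of remove_uast(): copy the file and drop its parsed UAST.
def remove_uast(file: File) -> File:
    stripped = File()
    stripped.CopyFrom(file)       # protobuf deep copy of content, path, language, uast
    stripped.ClearField("uast")   # remove only the parsed UAST
    return stripped
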
Example 4
 def test_analyze(self):
     common = self.base_files.keys() & self.head_files.keys()
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=remove_uast(self.base_files[k]), head=self.head_files[k])
                  for k in common])
     config = get_config()
     # Enable uast_break_check only for this test
     config["analyze"]["language_defaults"]["uast_break_check"] = True
     model = FormatAnalyzer.train(self.ptr, config, self.data_service)
     analyzer = FormatAnalyzer(model, self.ptr.url, config)
     comments = analyzer.analyze(self.ptr, self.ptr, self.data_service)
     self.assertGreater(len(comments), 0)
Example 5
    def test_analyze(self):
        dataservice = FakeDataService(
            self.bblfsh_client,
            files=self.base_files,
            changes=[Change(base=self.base_files[0], head=self.head_files[0])])
        model = IdTyposAnalyzer.train(ptr=self.ptr,
                                      config={},
                                      data_service=dataservice)
        analyzer = IdTyposAnalyzer(model=model,
                                   url=self.ptr.url,
                                   config=dict(model=MODEL_PATH,
                                               confidence_threshold=0.0,
                                               n_candidates=3,
                                               check_all_identifiers=False))
        comments = analyzer.analyze(ptr_from=self.ptr,
                                    ptr_to=self.ptr,
                                    data_service=dataservice)
        self.assertGreater(len(comments), 0)
        bad_names = ["nam", "print_tipe", "gett_lenght"]
        good_names = [
            "name", "print_type", "get_length", "customidentifiertostore"
        ]
        for c in comments:
            self.assertFalse(
                any(name in c.text.split(", fixes:")[0]
                    for name in good_names))
            self.assertTrue(
                any(name in c.text.split(", fixes:")[0] for name in bad_names))

        # With check_all_identifiers=True, identifiers known from training are checked
        # too, so "customidentifiertostore" is now reported among the bad names
        analyzer = IdTyposAnalyzer(model=model,
                                   url=self.ptr.url,
                                   config=dict(model=MODEL_PATH,
                                               confidence_threshold=0.0,
                                               n_candidates=3,
                                               check_all_identifiers=True))
        comments = analyzer.analyze(ptr_from=self.ptr,
                                    ptr_to=self.ptr,
                                    data_service=dataservice)
        self.assertGreater(len(comments), 0)
        bad_names = [
            "nam", "print_tipe", "gett_lenght", "customidentifiertostore"
        ]
        good_names = ["name", "print_type", "get_length"]
        for c in comments:
            self.assertFalse(
                any(name in c.text.split(", fixes:")[0]
                    for name in good_names))
            self.assertTrue(
                any(name in c.text.split(", fixes:")[0] for name in bad_names))
Example 6
 def test_run(self):
     dataservice = FakeDataService(
         self.bblfsh_client,
         files=self.head_files,
         changes=[Change(base=self.base_files[0], head=self.head_files[0])])
     model = IdTyposAnalyzerSpy.train(ptr=self.ptr,
                                      config={},
                                      data_service=dataservice)
     analyzer = IdTyposAnalyzerSpy(model=model,
                                   url=self.ptr.url,
                                   config=self.config)
     typo_fixes = list(analyzer.run(ptr=self.ptr, data_service=dataservice))
     self.assertGreater(len(typo_fixes), 0)
     for typo_fix in typo_fixes:
         self.check_typo_fix(typo_fix)
Example 7
 def test_train_cutoff_labels(self):
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     model1 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     self.assertIsInstance(model1, FormatModel)
     self.assertIn("javascript", model1, str(model1))
     model2 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     self.assertEqual(model1["javascript"].rules, model2["javascript"].rules)
     self.assertGreater(len(model1["javascript"]), 5)
     # Check that the model can be saved without problems and then loaded back
     with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as f:
         model2.save(f)
         f.seek(0)
         model3 = FormatModel().load(f)
         compare_models(self, model2, model3)
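
compare_models is another project test helper. A minimal sketch of the kind of equality check it could perform, using only the attributes that are visible in the test above:

# Hypothetical sketch of a compare_models()-style helper.
def compare_models(test_case, model_a, model_b):
    # The test above only indexes the models by "javascript" and compares .rules;
    # iterating over every trained language would be an assumed extension.
    test_case.assertEqual(model_a["javascript"].rules, model_b["javascript"].rules)
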
Example 8
def train(training_dir: str,
          ref: ReferencePointer,
          output_path: str,
          language: str,
          bblfsh: str,
          config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to the repository used for training.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training or \
                   a JSON-like object with the config.
    :param log: Logger used to report progress during the training.
    :return: Trained FormatModel.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is not None:
        if isinstance(config, str):
            with open(config) as fh:
                config = safe_load(fh)
    else:
        config = {}
    config = FormatAnalyzer._load_config(config)
    filepaths = glob.glob(os.path.join(training_dir, "**", "*.js"),
                          recursive=True)
    model = FormatAnalyzer.train(
        ref, config,
        FakeDataService(bblfsh_client=bblfsh_client,
                        files=parse_files(filepaths=filepaths,
                                          line_length_limit=config["train"]
                                          [language]["line_length_limit"],
                                          overall_size_limit=config["train"]
                                          [language]["overall_size_limit"],
                                          client=bblfsh_client,
                                          language=language,
                                          log=log),
                        changes=None))
    model.save(output_path)
    return model
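
A hedged example of how this debugging entry point might be invoked; the repository URL, commit hash, paths, and Babelfish address below are placeholders, and the ReferencePointer fields are assumed to be url, ref and commit.

# Illustrative invocation only; every value below is a placeholder.
ref = ReferencePointer(url="https://github.com/example/repo",   # fields assumed to be
                       ref="refs/heads/master",                 # url, ref and commit
                       commit="0" * 40)
model = train(training_dir="/path/to/checked-out/repo",
              ref=ref,
              output_path="/tmp/format-model.asdf",
              language="javascript",
              bblfsh="0.0.0.0:9432",
              config=None)                                      # fall back to the defaults
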
Example 9
 def test_analyze(self):
     dataservice = FakeDataService(
         self.bblfsh_client,
         files=self.head_files,
         changes=[Change(base=self.base_files[0], head=self.head_files[0])])
     model = IdTyposAnalyzerSpy.train(ptr=self.ptr,
                                      config={},
                                      data_service=dataservice)
     analyzer = IdTyposAnalyzerSpy(model=model,
                                   url=self.ptr.url,
                                   config=self.config)
     comments = analyzer.analyze(ptr_from=self.ptr,
                                 ptr_to=self.ptr,
                                 data_service=dataservice)
     self.assertGreater(len(comments), 0)
     for comment in comments:
         self.assertIsInstance(comment, Comment)
         typo_fix_dict = json.loads(comment.text)
         typo_fix_dict["candidates"] = [
             Candidate(identifier, confidence)
             for identifier, confidence in typo_fix_dict["candidates"]
         ]
         typo_fix = TypoFix(**typo_fix_dict)
         self.check_typo_fix(typo_fix)
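
check_typo_fix is a helper defined on the test class in Examples 6 and 9. A hedged sketch of the assertions it might make, using only the TypoFix and Candidate fields that are visible above; any further fields are unknown here.

# Hypothetical sketch of a check_typo_fix()-style assertion helper.
def check_typo_fix(self, typo_fix):
    self.assertIsInstance(typo_fix, TypoFix)
    # Every fix should carry at least one ranked candidate correction.
    self.assertGreater(len(typo_fix.candidates), 0)
    for candidate in typo_fix.candidates:
        self.assertIsInstance(candidate, Candidate)
        self.assertGreaterEqual(candidate.confidence, 0.0)
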