Exemple #1
0
 def test_file_filtering(self):
     """Train with a line length limit of 0 and then 500 and compare the learned rules.

     With the limit set to 0 the trained model must hold no per-language rules;
     with the limit set to 500 it must hold at least one.
     """
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     config = get_config()

     def count_rules_by_lang(limit):
         # Re-index the config on every call so the limit is always written
         # into the dictionary that the next training run will read.
         config["train"]["language_defaults"]["line_length_limit"] = limit
         trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
         return len(trained._rules_by_lang)

     self.assertEqual(count_rules_by_lang(0), 0)
     self.assertGreater(count_rules_by_lang(500), 0)
 def test_train(self):
     """Train twice on the same base files and check both runs yield the same rules.

     Also checks that the result is a FormatModel containing "javascript"
     and that the "javascript" entry has a length greater than 10.
     """
     def fit():
         # Each run gets its own data stub and its own config dictionary.
         stub = FakeDataStub(files=self.base_files.values(), changes=None)
         return FormatAnalyzer.train(self.ptr, {"n_iter": 1}, stub)

     first = fit()
     self.assertIsInstance(first, FormatModel)
     self.assertIn("javascript", first, str(first))
     second = fit()
     self.assertEqual(first["javascript"].rules, second["javascript"].rules)
     self.assertGreater(len(first["javascript"]), 10)
Exemple #3
0
 def test_train_cutoff_labels(self):
     """Train twice with the default config, then round-trip the model through a file.

     The two trained models must carry equal "javascript" rules, and a model
     saved to a temporary file must compare equal to the one loaded back.
     """
     service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     self.data_service = service
     first = FormatAnalyzer.train(self.ptr, get_config(), service)
     self.assertIsInstance(first, FormatModel)
     self.assertIn("javascript", first, str(first))
     second = FormatAnalyzer.train(self.ptr, get_config(), service)
     self.assertEqual(first["javascript"].rules, second["javascript"].rules)
     self.assertGreater(len(first["javascript"]), 5)
     # Save the model, rewind the file and load it back to verify serialization.
     with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as tmp:
         second.save(tmp)
         tmp.seek(0)
         restored = FormatModel().load(tmp)
         compare_models(self, second, restored)
 def test_analyze(self):
     """Train on the base files and check that analysis yields at least one comment.

     The changes passed to the stub pair the base and head versions of every
     file key present in both ``self.base_files`` and ``self.head_files``.
     """
     shared = self.base_files.keys() & self.head_files.keys()
     changes = [
         Change(base=self.base_files[name], head=self.head_files[name])
         for name in shared
     ]
     stub = FakeDataStub(files=self.base_files.values(), changes=changes)
     model = FormatAnalyzer.train(self.ptr, {"n_iter": 1}, stub)
     comments = FormatAnalyzer(model, self.ptr.url, {}).analyze(
         self.ptr, self.ptr, stub)
     self.assertGreater(len(comments), 0)
Exemple #5
0
 def test_analyze(self):
     """Train and analyze with ``uast_break_check`` enabled; expect at least one comment.

     Each change pairs a base file passed through ``remove_uast`` with the
     head file stored under the same key.
     """
     shared = self.base_files.keys() & self.head_files.keys()
     changes = [
         Change(base=remove_uast(self.base_files[name]), head=self.head_files[name])
         for name in shared
     ]
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=changes)
     config = get_config()
     # The UAST break check is switched on for this test only.
     config["analyze"]["language_defaults"]["uast_break_check"] = True
     model = FormatAnalyzer.train(self.ptr, config, self.data_service)
     comments = FormatAnalyzer(model, self.ptr.url, config).analyze(
         self.ptr, self.ptr, self.data_service)
     self.assertGreater(len(comments), 0)
Exemple #6
0
def train(training_dir: str,
          ref: ReferencePointer,
          output_path: str,
          language: str,
          bblfsh: str,
          config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes and save it to disk.

    Files are collected recursively from `training_dir` using the `*.js` pattern, parsed \
    with the limits found in `config["train"][language]`, and handed to \
    `FormatAnalyzer.train` through a `FakeDataService` that carries no changes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to the repository used for training.
    :param output_path: Path the trained model is written to.
    :param language: Language to filter on; also selects the per-language train settings.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training, a json-like object \
                   with a config, or None to start from an empty config.
    :param log: Logger forwarded to the file parsing step.
    :return: Trained FormatModel.
    """
    client = BblfshClient(bblfsh)
    if config is None:
        config = {}
    elif isinstance(config, str):
        # A string is a path to a YAML file; any other object is used as is.
        with open(config) as fh:
            config = safe_load(fh)
    config = FormatAnalyzer._load_config(config)
    pattern = os.path.join(training_dir, "**", "*.js")
    filepaths = glob.glob(pattern, recursive=True)
    language_settings = config["train"][language]
    parsed_files = parse_files(
        filepaths=filepaths,
        line_length_limit=language_settings["line_length_limit"],
        overall_size_limit=language_settings["overall_size_limit"],
        client=client,
        language=language,
        log=log)
    data_service = FakeDataService(
        bblfsh_client=client, files=parsed_files, changes=None)
    model = FormatAnalyzer.train(ref, config, data_service)
    model.save(output_path)
    return model
Exemple #7
0
 def test_train_check(self):
     """Check ``check_training_required`` against two sets of changes.

     With unmodified base files in the changes the check must return a falsy
     value; with base files passed through ``remove_uast`` it must return a
     truthy one.
     """
     shared = self.base_files.keys() & self.head_files.keys()

     def make_service(prepare_base):
         # Both services differ only in how the base file of a change is prepared.
         return FakeDataService(
             self.bblfsh_client,
             files=self.base_files.values(),
             changes=[
                 Change(base=prepare_base(self.base_files[name]),
                        head=self.head_files[name])
                 for name in shared
             ])

     self.data_service = make_service(lambda base_file: base_file)
     model = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     self.assertFalse(FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service))

     self.data_service = make_service(remove_uast)
     self.assertTrue(FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service))
def train(training_dir: str, output_path: str, language: str, bblfsh: str, config: str
          ) -> None:
    """
    Train a FormatModel for debugging purposes and save it to `output_path`.

    Every path found recursively under `training_dir` is passed to `prepare_files`, and \
    the training runs against a placeholder reference pointer.

    :param training_dir: Path to the directory containing the files to train from.
    :param output_path: Path the trained model is written to.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training; None means an \
                   empty config.
    """
    client = BblfshClient(bblfsh)
    if config is None:
        config = {}
    else:
        with open(config) as fh:
            config = safe_load(fh)
    pattern = join(training_dir, "**", "*")
    filenames = glob.glob(pattern, recursive=True)
    # The pointer values are placeholders; no real repository is referenced here.
    pointer = ReferencePointer("someurl", "someref", "somecommit")
    data_service = FakeDataService(
        client, prepare_files(filenames, client, language), None)
    model = FormatAnalyzer.train(pointer, config, data_service)
    model.save(output_path)