def test_file_filtering(self):
    """A zero line-length limit must filter out every file; a generous one must not."""
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=[])
    train_config = get_config()
    # With limit 0 every file is rejected, so no rules can be mined.
    train_config["train"]["language_defaults"]["line_length_limit"] = 0
    model = FormatAnalyzer.train(self.ptr, train_config, self.data_service)
    self.assertEqual(len(model._rules_by_lang), 0)
    # With a large limit the files pass the filter and training produces rules.
    train_config["train"]["language_defaults"]["line_length_limit"] = 500
    model = FormatAnalyzer.train(self.ptr, train_config, self.data_service)
    self.assertGreater(len(model._rules_by_lang), 0)
def test_train(self):
    """Training twice on identical inputs must yield identical JavaScript rules."""
    stub = FakeDataStub(files=self.base_files.values(), changes=None)
    first = FormatAnalyzer.train(self.ptr, {"n_iter": 1}, stub)
    self.assertIsInstance(first, FormatModel)
    self.assertIn("javascript", first, str(first))
    # Re-train from a fresh stub with the same files; the result must be stable.
    stub = FakeDataStub(files=self.base_files.values(), changes=None)
    second = FormatAnalyzer.train(self.ptr, {"n_iter": 1}, stub)
    self.assertEqual(first["javascript"].rules, second["javascript"].rules)
    self.assertGreater(len(first["javascript"]), 10)
def test_train_cutoff_labels(self):
    """Training must be deterministic and the model must round-trip through a file."""
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=[])
    trained = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertIsInstance(trained, FormatModel)
    self.assertIn("javascript", trained, str(trained))
    retrained = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertEqual(trained["javascript"].rules, retrained["javascript"].rules)
    self.assertGreater(len(trained["javascript"]), 5)
    # The model must survive a save/load round trip unchanged.
    with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as fileobj:
        retrained.save(fileobj)
        fileobj.seek(0)
        reloaded = FormatModel().load(fileobj)
        compare_models(self, retrained, reloaded)
def test_analyze(self):
    """Analyzing the training revision against itself must still emit comments."""
    shared = self.base_files.keys() & self.head_files.keys()
    changes = [Change(base=self.base_files[name], head=self.head_files[name])
               for name in shared]
    stub = FakeDataStub(files=self.base_files.values(), changes=changes)
    model = FormatAnalyzer.train(self.ptr, {"n_iter": 1}, stub)
    analyzer = FormatAnalyzer(model, self.ptr.url, {})
    comments = analyzer.analyze(self.ptr, self.ptr, stub)
    self.assertGreater(len(comments), 0)
def test_analyze(self):
    """With uast_break_check enabled, analysis must still produce comments."""
    shared = self.base_files.keys() & self.head_files.keys()
    # Base files are stripped of their UASTs to exercise the breakage check path.
    changes = [Change(base=remove_uast(self.base_files[name]),
                      head=self.head_files[name])
               for name in shared]
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=changes)
    analyze_config = get_config()
    # Enable uast_break_check only for this test.
    analyze_config["analyze"]["language_defaults"]["uast_break_check"] = True
    model = FormatAnalyzer.train(self.ptr, analyze_config, self.data_service)
    analyzer = FormatAnalyzer(model, self.ptr.url, analyze_config)
    comments = analyzer.analyze(self.ptr, self.ptr, self.data_service)
    self.assertGreater(len(comments), 0)
def train(training_dir: str, ref: ReferencePointer, output_path: str, language: str,
          bblfsh: str, config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to repository for training.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training or \
                   json-like object with a config.
    :param log: Logger used to report during training.
    :return: Trained FormatModel.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is not None:
        if isinstance(config, str):
            # A string config is a path to a YAML file; load it.
            with open(config) as fh:
                config = safe_load(fh)
    else:
        config = {}
    # Merge the user-provided config over the analyzer defaults.
    config = FormatAnalyzer._load_config(config)
    # NOTE(review): the glob hard-codes "*.js" even though `language` is a
    # parameter — TODO confirm whether other languages should be supported here.
    filepaths = glob.glob(os.path.join(training_dir, "**", "*.js"), recursive=True)
    model = FormatAnalyzer.train(
        ref,
        config,
        FakeDataService(
            bblfsh_client=bblfsh_client,
            files=parse_files(
                filepaths=filepaths,
                line_length_limit=config["train"][language]["line_length_limit"],
                overall_size_limit=config["train"][language]["overall_size_limit"],
                client=bblfsh_client,
                language=language,
                log=log),
            changes=None))
    model.save(output_path)
    return model
def test_train_check(self):
    """check_training_required must flag retraining only when base UASTs changed."""
    shared = self.base_files.keys() & self.head_files.keys()
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(),
        changes=[Change(base=self.base_files[name], head=self.head_files[name])
                 for name in shared])
    model = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    # Nothing changed since training: no retraining needed.
    self.assertFalse(FormatAnalyzer.check_training_required(
        model, self.ptr, get_config(), self.data_service))
    # Strip the base UASTs: retraining must now be required.
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(),
        changes=[Change(base=remove_uast(self.base_files[name]),
                        head=self.head_files[name])
                 for name in shared])
    self.assertTrue(FormatAnalyzer.check_training_required(
        model, self.ptr, get_config(), self.data_service))
def train(training_dir: str, output_path: str, language: str, bblfsh: str, config: str
          ) -> None:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is None:
        config = {}
    else:
        # The config argument is a path to a YAML file; parse it.
        with open(config) as fh:
            config = safe_load(fh)
    filenames = glob.glob(join(training_dir, "**", "*"), recursive=True)
    data_service = FakeDataService(
        bblfsh_client, prepare_files(filenames, bblfsh_client, language), None)
    model = FormatAnalyzer.train(
        ReferencePointer("someurl", "someref", "somecommit"), config, data_service)
    model.save(output_path)