def test_with_changed_uasts_rpc_error(self):
    called = False

    def func(imposter, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
             data_service: DataService, **data):
        nonlocal called
        called = True

    def fail(f):
        def wrapped():
            f()
            self.assertIsNotNone(self.data_service._data_request_local.channel)
            raise grpc.RpcError()
        return wrapped

    self.data_service._get_channel = fail(self.data_service._get_channel)
    func = with_changed_uasts(unicode=False)(func)
    self.assertRaises(
        grpc.RpcError, func, self,
        ReferencePointer(self.url, self.ref, self.COMMIT_FROM),
        ReferencePointer(self.url, self.ref, self.COMMIT_TO),
        self.data_service)
    self.assertFalse(called)
    self.assertIsNone(self.data_service._data_request_local.channel)
def process_review_event(self, request: ReviewEvent) -> EventResponse:
    base_ptr = ReferencePointer.from_pb(request.commit_revision.base)
    head_ptr = ReferencePointer.from_pb(request.commit_revision.head)
    response = EventResponse()
    response.analyzer_version = self.version
    comments = []
    for analyzer in self._analyzers:
        try:
            mycfg = dict(request.configuration[analyzer.__name__])
        except (KeyError, ValueError):
            mycfg = {}
        model, cache_miss = self._model_repository.get(
            self._model_id(analyzer), analyzer.model_type, base_ptr.url)
        if cache_miss:
            self._log.info("cache miss: %s", analyzer.__name__)
        if model is None:
            self._log.info("training: %s", analyzer.__name__)
            model = analyzer.train(base_ptr, mycfg, self._data_service.get())
            self._model_repository.set(self._model_id(analyzer), base_ptr.url, model)
        self._log.debug("running %s", analyzer.__name__)
        results = analyzer(model, head_ptr.url, mycfg).analyze(
            base_ptr, head_ptr, self._data_service.get())
        self._log.info("%s: %d comments", analyzer.__name__, len(results))
        comments.extend(results)
    response.comments.extend(comments)
    return response
def test_with_changed_uasts_and_contents(self):
    def func(imposter, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
             data_request_stub: DataStub, **data):
        changes = list(data["changes"])
        self.assertEqual(len(changes), 1)
        change = changes[0]
        self.assertEqual(len(change.base.content), 5548)
        self.assertEqual(len(change.head.content), 5542)
        self.assertEqual(type(change.base.uast).__module__, bblfsh.Node.__module__)
        self.assertEqual(type(change.head.uast).__module__, bblfsh.Node.__module__)
        self.assertEqual(change.base.path, change.head.path)
        self.assertEqual(change.base.path, "lookout/core/manager.py")
        self.assertEqual(change.base.language, "Python")
        self.assertEqual(change.head.language, "Python")

    func = with_changed_uasts_and_contents(func)
    func(self,
         ReferencePointer(self.url, self.ref, "4984b98b0e2375e9372fbab4eb4c9cd8f0c289c6"),
         ReferencePointer(self.url, self.ref, "5833b4ba94154cf1ed07f37c32928c7b4411b36b"),
         self.data_service.get())
@classmethod
def setUpClass(cls):
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
    with lzma.open(str(base / "test_base_file.js.xz")) as fin:
        contents = fin.read()
    uast = cls.bblfsh_client.parse("test_base_file.js", contents=contents).uast
    cls.base_files = [FakeFile(path="test_base_file.js", content=contents, uast=uast,
                               language="Javascript")]
    with lzma.open(str(base / "test_head_file.js.xz")) as fin:
        contents = fin.read()
    uast = cls.bblfsh_client.parse("test_head_file.js", contents=contents).uast
    cls.head_files = [FakeFile(path="test_head_file.js", content=contents, uast=uast,
                               language="Javascript")]
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
def process_push_event(self, request: PushEvent) -> EventResponse:  # noqa: D401
    """
    Callback for push events invoked by EventListener.
    """
    ptr = ReferencePointer.from_pb(request.commit_revision.head)
    data_service = self._data_service
    for analyzer in self._analyzers:
        if analyzer.model_type == DummyAnalyzerModel:
            continue
        try:
            mycfg = self._protobuf_struct_to_dict(request.configuration[analyzer.name])
        except (KeyError, ValueError):
            mycfg = {}
        model = self._get_model(analyzer, ptr.url)
        if model is not None:
            must_train = analyzer.check_training_required(model, ptr, mycfg, data_service)
            if not must_train:
                self._log.info("skipped training %s", analyzer.name)
                continue
        self._log.debug("training %s", analyzer.name)
        record_event("%s.train" % analyzer.name, 1)
        model = analyzer.train(ptr, mycfg, data_service)
        self._model_repository.set(self._model_id(analyzer), ptr.url, model)
    response = EventResponse()
    response.analyzer_version = self.version
    return response
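For context, a minimal sketch of the configuration helper used above, assuming `_protobuf_struct_to_dict` simply unwraps a protobuf `Struct` into a plain dict; the actual implementation may differ:

from google.protobuf.json_format import MessageToDict
from google.protobuf.struct_pb2 import Struct


def protobuf_struct_to_dict(struct: Struct) -> dict:
    """Convert a protobuf Struct into a plain Python dict (sketch, not the real helper)."""
    return MessageToDict(struct)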
def request_changes(stub: DataStub, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                    contents: bool, uast: bool, unicode: bool) -> Iterator[Change]:
    """
    Invoke the gRPC API and get the changes. Used by `with_changed_uasts()` and Review events.

    :return: The stream of the gRPC invocation results. In theory, `.result()` would turn this \
             into a synchronous call, but in practice, that function call hangs for some reason.
    """
    request = ChangesRequest(base=ptr_from.to_pb(), head=ptr_to.to_pb())
    request.exclude_pattern = GARBAGE_PATTERN
    request.exclude_vendored = True
    request.want_contents = contents
    request.want_language = contents or uast
    request.want_uast = uast
    changes = stub.GetChanges(request)
    if unicode:
        changes = map(BytesToUnicodeConverter.convert_change, changes)
    return changes
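A hedged usage sketch of `request_changes`: it assumes an already-connected `DataStub` named `stub`; the repository URL and commit placeholders below are illustrative, not real values.

ptr_from = ReferencePointer("github.com/src-d/lookout", "refs/heads/master", "<base commit>")
ptr_to = ReferencePointer("github.com/src-d/lookout", "refs/heads/master", "<head commit>")
# The result is a lazy gRPC stream; iterating pulls the changes one by one.
for change in request_changes(stub, ptr_from, ptr_to, contents=True, uast=True, unicode=False):
    print(change.head.path, change.head.language)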
def test_with_changed_contents(self):
    def func(imposter, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
             data_service: DataService, **data):
        self.assertIsInstance(data_service, DataService)
        changes = list(data["changes"])
        self.assertEqual(len(changes), 1)
        change = changes[0]
        self.assertEqual(len(change.base.content), 5548)
        self.assertEqual(len(change.head.content), 5542)
        self.assertFalse(change.base.uast.children)
        self.assertFalse(change.head.uast.children)
        self.assertEqual(change.base.path, change.head.path)
        self.assertEqual(change.base.path, "lookout/core/manager.py")
        self.assertEqual(change.base.language, "Python")
        self.assertEqual(change.head.language, "Python")

    func = with_changed_contents(unicode=False)(func)
    func(self,
         ReferencePointer(self.url, self.ref, self.COMMIT_FROM),
         ReferencePointer(self.url, self.ref, self.COMMIT_TO),
         self.data_service)
@classmethod
def setUpClass(cls):
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("FormatAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark.uast.xz")) as fin:
        cls.uast = bblfsh.Node.FromString(fin.read())
    cls.base_files = cls.get_files_from_tar(str(base / "freecodecamp-base.tar.xz"))
    cls.head_files = cls.get_files_from_tar(str(base / "freecodecamp-head.tar.xz"))
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
    FeatureExtractor._log.level = logging.DEBUG
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
def test_with_changed_uasts_unicode(self):
    def func(imposter, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
             data_service: DataService, **data):
        self.assertIsInstance(data_service, DataService)
        changes = list(data["changes"])
        self.assertEqual(len(changes), 1)
        change = changes[0]
        self.assertEqual(change.base.content, "")
        self.assertEqual(change.head.content, "")
        self.assertEqual(type(change.base.uast).__module__, bblfsh.Node.__module__)
        self.assertEqual(type(change.head.uast).__module__, bblfsh.Node.__module__)
        self.assertEqual(change.base.path, change.head.path)
        self.assertEqual(change.base.path, "lookout/core/manager.py")
        self.assertEqual(change.base.language, "Python")
        self.assertEqual(change.head.language, "Python")

    func = with_changed_uasts(unicode=True)(func)
    func(self,
         ReferencePointer(self.url, self.ref, self.COMMIT_FROM),
         ReferencePointer(self.url, self.ref, self.COMMIT_TO),
         self.data_service)
def process_push_event(self, request: PushEvent) -> EventResponse:
    ptr = ReferencePointer.from_pb(request.commit_revision.head)
    for analyzer in self._analyzers:
        self._log.debug("training %s", analyzer.__name__)
        try:
            mycfg = dict(request.configuration[analyzer.__name__])
        except (KeyError, ValueError):
            mycfg = {}
        model = analyzer.train(ptr, mycfg, self._data_service.get())
        self._model_repository.set(self._model_id(analyzer), ptr.url, model)
    response = EventResponse()
    response.analyzer_version = self.version
    return response
def process_review_event(self, request: ReviewEvent) -> EventResponse:  # noqa: D401
    """
    Callback for review events invoked by EventListener.
    """
    base_ptr = ReferencePointer.from_pb(request.commit_revision.base)
    head_ptr = ReferencePointer.from_pb(request.commit_revision.head)
    response = EventResponse()
    response.analyzer_version = self.version
    comments = []
    for analyzer in self._analyzers:
        try:
            mycfg = self._protobuf_struct_to_dict(request.configuration[analyzer.name])
            self._log.info("%s config: %s", analyzer.name, mycfg)
        except (KeyError, ValueError):
            mycfg = {}
            self._log.debug("no config was provided for %s", analyzer.name)
        if analyzer.model_type != DummyAnalyzerModel:
            model = self._get_model(analyzer, base_ptr.url)
            if model is None:
                self._log.info("training: %s", analyzer.name)
                record_event("%s.train" % analyzer.name, 1)
                model = analyzer.train(base_ptr, mycfg, self._data_service)
                self._model_repository.set(self._model_id(analyzer), base_ptr.url, model)
        else:
            model = DummyAnalyzerModel()
        self._log.debug("running %s", analyzer.name)
        record_event("%s.analyze" % analyzer.name, 1)
        results = analyzer(model, head_ptr.url, mycfg).analyze(
            base_ptr, head_ptr, self._data_service)
        self._log.info("%s: %d comments", analyzer.name, len(results))
        record_event("%s.comments" % analyzer.name, len(results))
        comments.extend(results)
    response.comments.extend(comments)
    return response
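The `_model_id` used above is plausibly just a name/version key for the model repository; a sketch under that assumption, not necessarily the actual implementation:

def _model_id(analyzer) -> str:
    # Sketch: key models by analyzer identity so retraining replaces the right entry.
    return "%s/%s" % (analyzer.name, analyzer.version)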
def request_files(stub: DataStub, ptr: ReferencePointer, contents: bool, uast: bool,
                  unicode: bool) -> Iterator[File]:
    """
    Invoke the gRPC API and get the files. Used by `with_uasts()` and Push events.

    :return: The stream of the gRPC invocation results.
    """
    request = FilesRequest(revision=ptr.to_pb())
    request.exclude_pattern = GARBAGE_PATTERN
    request.exclude_vendored = True
    request.want_contents = contents
    request.want_language = contents or uast
    request.want_uast = uast
    files = stub.GetFiles(request)
    if unicode:
        files = map(BytesToUnicodeConverter.convert_file, files)
    return files
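Similarly, a hedged sketch of driving `request_files` directly, again assuming a connected `DataStub` named `stub` and a placeholder commit:

ptr = ReferencePointer("github.com/src-d/lookout", "refs/heads/master", "<commit>")
# Fetch UASTs only: contents are skipped, so `file.content` stays empty.
for file in request_files(stub, ptr, contents=False, uast=True, unicode=False):
    print(file.path, file.language)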
def test_dummy_model(self):
    ptr = ReferencePointer("1", "2", "3")
    model = DummyAnalyzerModel.generate(FakeAnalyzer, ptr)
    self.assertEqual(model.name, FakeAnalyzer.name)
    self.assertEqual(model.version, [FakeAnalyzer.version])
    self.assertEqual(model.ptr, ptr)
    self.assertEqual(model.vendor, "source{d}")
    self.assertEqual(model.description, "Model bound to fake Lookout analyzer.")
    buffer = io.BytesIO()
    model.save(buffer)
    buffer.seek(0)
    model2 = model.load(buffer)
    self.assertEqual(model.ptr, model2.ptr)
    self.assertEqual(model.name, model2.name)
    self.assertEqual(model.description, model2.description)
    self.assertEqual(model.vendor, model2.vendor)
def test_with_uasts(self):
    def func(imposter, ptr: ReferencePointer, config: dict,
             data_service: DataService, **data):
        self.assertIsInstance(data_service, DataService)
        files = list(data["files"])
        self.assertEqual(len(files), 18)
        for file in files:
            self.assertEqual(file.content, b"")
            self.assertEqual(type(file.uast).__module__, bblfsh.Node.__module__)
            self.assertTrue(file.path)
            self.assertIn(file.language, ("Python", "YAML", "Dockerfile", "Markdown",
                                          "Jupyter Notebook", "Shell", "Text", ""))

    func = with_uasts(unicode=False)(func)
    func(self, ReferencePointer(self.url, self.ref, self.COMMIT_TO), None, self.data_service)
def test_with_uasts(self):
    def func(imposter, ptr: ReferencePointer, config: dict,
             data_request_stub: DataStub, **data):
        files = list(data["files"])
        self.assertEqual(len(files), 61)
        for file in files:
            self.assertEqual(file.content, b"")
            self.assertEqual(type(file.uast).__module__, bblfsh.Node.__module__)
            self.assertTrue(file.path)
            self.assertIn(file.language, ("Python", "YAML", "Dockerfile", "Markdown",
                                          "Jupyter Notebook", "Shell", "Text", ""))

    func = with_uasts(func)
    func(self,
         ReferencePointer(self.url, self.ref, "5833b4ba94154cf1ed07f37c32928c7b4411b36b"),
         None,
         self.data_service.get())
def test_with_contents(self):
    def func(imposter, ptr: ReferencePointer, config: dict,
             data_service: DataService, **data):
        self.assertIsInstance(data_service, DataService)
        files = list(data["files"])
        self.assertEqual(len(files), 18)
        non_empty_langs = 0
        for file in files:
            if not file.path.endswith("__init__.py"):
                self.assertGreater(len(file.content), 0, file.path)
            self.assertFalse(file.uast.children)
            self.assertTrue(file.path)
            if file.language:
                non_empty_langs += 1
                self.assertIn(file.language, ("Python", "YAML", "Dockerfile", "Markdown",
                                              "Jupyter Notebook", "Shell", "Text"))
        self.assertGreater(non_empty_langs, 0)

    func = with_contents(unicode=False)(func)
    func(self, ReferencePointer(self.url, self.ref, self.COMMIT_TO), None, self.data_service)
def main():
    setup("DEBUG", False)
    parser = ArgumentParser()
    parser.add_argument("training_dir",
                        help="Path to the directory containing the files to train from.")
    parser.add_argument("output_path", help="Path to the model to write.")
    parser.add_argument("--bblfsh", default="0.0.0.0:9432",
                        help="Address of the Babelfish server.")
    parser.add_argument("--language", default="javascript", help="Language to filter on.")
    parser.add_argument("--config",
                        help="Path to a YAML file containing the config to apply during "
                             "training.")
    args = parser.parse_args()
    kwargs = vars(args)
    kwargs["ref"] = ReferencePointer(kwargs["training_dir"], "HEAD", "<unknown>")
    train(**kwargs)
@classmethod
def setUpClass(cls):
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
    with lzma.open(str(base / "test_base_file.js.xz")) as fin:
        contents = fin.read()
    uast = cls.bblfsh_client.parse("test_base_file.js", contents=contents).uast
    cls.base_files = [File(path="test_file.js", content=contents, uast=uast,
                           language="Javascript")]
    with lzma.open(str(base / "test_head_file.js.xz")) as fin:
        contents = b"var print_tipe = 0;\n" + fin.read()
    uast = cls.bblfsh_client.parse("test_head_file.js", contents=contents).uast
    cls.head_files = [File(path="test_file.js", content=contents, uast=uast,
                           language="Javascript")]
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
    cls.config = {"model": MODEL_PATH,
                  "confidence_threshold": 0.0,
                  "n_candidates": 3,
                  "check_all_identifiers": True,
                  "analyze": {"filepath": cls.base_files[0].path,
                              "wrong_id": "print_tipe",
                              "line": 0}}
def train(training_dir: str, output_path: str, language: str, bblfsh: str,
          config: str) -> None:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is not None:
        with open(config) as fh:
            config = safe_load(fh)
    else:
        config = {}
    filenames = glob.glob(join(training_dir, "**", "*"), recursive=True)
    model = FormatAnalyzer.train(
        ReferencePointer("someurl", "someref", "somecommit"),
        config,
        FakeDataService(bblfsh_client, prepare_files(filenames, bblfsh_client, language), None))
    model.save(output_path)
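A hedged invocation example, assuming a Babelfish server on the default port and a repository checkout in `./training-repo`; both paths are placeholders:

# Train with the default config (config=None) and write the model next to the checkout.
train(training_dir="./training-repo", output_path="./format-model.asdf",
      language="javascript", bblfsh="0.0.0.0:9432", config=None)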
def quality_report_noisy(bblfsh: str, language: str, confidence_threshold: float,
                         support_threshold: int, precision_threshold: float, dir_output: str,
                         config: Optional[dict] = None, repos: Optional[str] = None) -> None:
    """
    Generate a quality report on the artificial noisy dataset including evaluation curves.

    :param bblfsh: Address of the Babelfish server. The server should be started beforehand.
    :param language: Language to consider, others will be discarded.
    :param confidence_threshold: Confidence threshold to filter relevant rules.
    :param support_threshold: Support threshold to filter relevant rules.
    :param precision_threshold: Precision threshold tolerated by the model. \
           Limit drawn as a red horizontal line on the figure.
    :param dir_output: Path to the output directory where to store the quality report \
           in Markdown and the precision-recall curve in png format.
    :param config: FormatAnalyzer config to use. Default one is used if not set.
    :param repos: Input list of URLs to the repositories to analyze. \
           Should be strings separated by newlines. If it is None, \
           we use the string defined at the beginning of the file.
    """
    log = logging.getLogger("quality_report_noisy")

    # initialization
    repo_names = []
    last_accepted_rule = {}
    prediction_rates, precisions, accepted_rules = (defaultdict(list) for _ in range(3))
    n_mistakes, prec_max_prediction_rate, confidence_threshold_exp, max_prediction_rate, \
        n_rules, n_rules_filtered = ({} for _ in range(6))
    if repos is None:
        repos = REPOSITORIES
    try:
        # fetch the original and noisy repositories
        client = BblfshClient(bblfsh)
        log.info("Repositories: %s", repos)
        with tempfile.TemporaryDirectory() as tmpdirname:
            for raw in repos.splitlines():
                repo_path, clean_commit, noisy_commit = raw.split(",")
                repo = repo_path.split("/")[-1]
                log.info("Fetching %s", repo_path)
                git_dir = os.path.join(tmpdirname, repo)
                git_dir_noisy = os.path.join(tmpdirname, repo + "_noisy")
                cmd1 = "git clone --single-branch --branch master %s %s" % (
                    repo_path, git_dir)
                cmd2 = "git clone --single-branch --branch style-noise-1-per-file %s %s" \
                    % (repo_path, git_dir_noisy)
                try:
                    for cmd in (cmd1, cmd2):
                        log.debug("Running: %s", cmd)
                        subprocess.check_call(cmd.split())
                except subprocess.CalledProcessError as e:
                    raise ConnectionError("Unable to fetch repository %s" % repo_path) from e

                # train the model on the original repository
                ref = ReferencePointer(repo_path, "HEAD", clean_commit)
                model_path = os.path.join(git_dir, "model.asdf")
                format_model = train(training_dir=git_dir, ref=ref, output_path=model_path,
                                     language=language, bblfsh=bblfsh, config=config, log=log)
                rules = format_model[language]

                # extract the raw data and the diff from the repositories
                input_pattern = os.path.join(git_dir, "**", "*.js")
                input_pattern_noisy = os.path.join(git_dir_noisy, "**", "*.js")
                true_content = get_content_from_repo(input_pattern)
                noisy_content = get_content_from_repo(input_pattern_noisy)
                true_files, noisy_files, start_changes = get_difflib_changes(
                    true_content, noisy_content)
                if not true_files:
                    raise ValueError(
                        "Noisy repo should count at least one artificial mistake")
                log.info("Number of files modified by adding style noise: %d / %d",
                         len(true_files), len(true_content))
                del true_content, noisy_content

                # extract the features
                feature_extractor = FeatureExtractor(
                    language=language, **rules.origin_config["feature_extractor"])
                vnodes_y_true = files2vnodes(true_files, feature_extractor, rules, client)
                mispreds_noise = files2mispreds(noisy_files, feature_extractor, rules,
                                                client, log)

                # compute the prediction rate and precision score on the artificial
                # noisy dataset
                diff_mispreds = get_diff_mispreds(mispreds_noise, start_changes)
                changes_count = len(start_changes)
                n_rules[repo] = len(rules.rules)
                rules_id = [(i, r.stats.conf) for i, r in enumerate(rules.rules)
                            if r.stats.conf > confidence_threshold
                            and r.stats.support > support_threshold]
                rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
                for i in range(len(rules_id)):
                    filtered_mispreds = {k: m for k, m in diff_mispreds.items()
                                         if any(r[0] == m.rule for r in rules_id[:i + 1])}
                    style_fixes = get_style_fixes(filtered_mispreds, vnodes_y_true,
                                                  true_files, noisy_files, feature_extractor)
                    prediction_rate, precision = compute_metrics(
                        changes_count=changes_count,
                        predictions_count=len(filtered_mispreds),
                        true_positive=len(style_fixes))
                    prediction_rates[repo].append(round(prediction_rate, 3))
                    precisions[repo].append(round(precision, 3))
                print("prediction rate x:", prediction_rates[repo])
                print("precision y:", precisions[repo])

                # compute other statistics and quality metrics for the model's evaluation
                repo_names.append(repo)
                n_mistakes[repo] = len(true_files)
                prec_max_prediction_rate[repo] = precisions[repo][-1]
                max_prediction_rate[repo] = max(prediction_rates[repo])
                n_rules_filtered[repo] = len(rules_id)

                # compute the confidence and prediction rate limit for a given
                # precision threshold
                for i, (prediction_rate, prec) in enumerate(
                        zip(prediction_rates[repo], precisions[repo])):
                    if prec >= precision_threshold:
                        accepted_rules[repo].append((i, rules_id[i][1], prediction_rate))
                last_accepted_rule[repo] = min(accepted_rules[repo], key=itemgetter(1))
                confidence_threshold_exp[repo] = (last_accepted_rule[repo][0],
                                                  last_accepted_rule[repo][1])
    finally:
        client._channel.close()

    # compute the index of the last accepted rule according to the maximum
    # confidence threshold
    limit_conf_id = {}
    max_confidence_threshold_exp = max(confidence_threshold_exp.values(), key=itemgetter(1))
    for repo, rules in accepted_rules.items():
        for rule in rules:
            if rule[1] < max_confidence_threshold_exp[1]:
                break
            limit_conf_id[repo] = rule[0]

    # compile the curves showing the evolution of the prediction rate and precision score
    path_to_figure = os.path.join(dir_output, "pr_curves.png")
    plot_curve(repo_names, prediction_rates, precisions, precision_threshold,
               limit_conf_id, path_to_figure)

    # compile the markdown template for the report through jinja2
    loader = jinja2.FileSystemLoader(
        (os.path.join(os.path.dirname(__file__), "..", "templates"),),
        followlinks=True)
    env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True,
                             keep_trailing_newline=True)
    env.globals.update(range=range)
    template = loader.load(env, "noisy_quality_report.md.jinja2")
    report = template.render(repos=repo_names, n_mistakes=n_mistakes,
                             prec_max_prediction_rate=prec_max_prediction_rate,
                             confidence_threshold_exp=round(max_confidence_threshold_exp[1], 2),
                             max_prediction_rate=max_prediction_rate,
                             confidence_threshold=confidence_threshold,
                             support_threshold=support_threshold,
                             n_rules=n_rules,
                             n_rules_filtered=n_rules_filtered,
                             path_to_figure=path_to_figure)

    # write the quality report
    repo_pathrt = os.path.join(dir_output, "report_noise.md")
    with open(repo_pathrt, "w", encoding="utf-8") as f:
        f.write(report)
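The two metrics collected above reduce to simple ratios. A sketch of what `compute_metrics` plausibly returns, assuming the prediction rate is the share of injected mistakes that received any prediction and precision is the share of predictions that fix a real mistake; the actual implementation may differ:

from typing import Tuple


def compute_metrics_sketch(changes_count: int, predictions_count: int,
                           true_positive: int) -> Tuple[float, float]:
    # Hypothetical helper, not the project's compute_metrics. Guards against
    # empty denominators; both metrics live in [0, 1].
    prediction_rate = predictions_count / changes_count if changes_count else 0.0
    precision = true_positive / predictions_count if predictions_count else 0.0
    return prediction_rate, precision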