def analyze_files(analyzer_type: Type[FormatAnalyzer], config: dict, model_path: str,
                  language: str, bblfsh_addr: str, input_pattern: str,
                  log: logging.Logger) -> List[Comment]:
    """
    Run the model, record the fixes for each file and return them.

    :param analyzer_type: Analyzer class to instantiate and run.
    :param config: Configuration dict forwarded to the analyzer.
    :param model_path: Path of the saved FormatModel to load.
    :param language: Language to analyze; must be present in the loaded model.
    :param bblfsh_addr: Address of the Babelfish server.
    :param input_pattern: Glob pattern (recursive) selecting the files to analyze.
    :param log: Logger used to report progress.
    :return: List of `Comment`-s produced by the analyzer.
    :raises NotFittedError: If the model contains no rules for `language`.
    """
    class FakePointer:
        """Stub for a ReferencePointer — the analyzer only calls `to_pb()`."""

        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = bblfsh.BblfshClient(bblfsh_addr)
    files = parse_files(
        filepaths=glob.glob(input_pattern, recursive=True),
        line_length_limit=rules.origin_config["line_length_limit"],
        overall_size_limit=rules.origin_config["overall_size_limit"],
        client=client,
        language=language,
        log=log)
    # Lazy %-style arguments: the logging module formats only if the record
    # is actually emitted, instead of always building the string eagerly.
    log.info("Model parameters: %s", rules.origin_config)
    log.info("Rules stats: %s", rules)
    log.info("Number of files: %d", len(files))
    return analyzer_type(model, input_pattern, config).analyze(
        FakePointer(), None, data_service=FakeDataService(client, files, []))
def files2mispreds(filepaths: Iterable[str], feature_extractor: FeatureExtractor,
                   rules: Rules, client: BblfshClient,
                   log: logging.Logger) -> Iterable[Misprediction]:
    """
    Return the model's `Misprediction`-s on a list of files.

    :param filepaths: List of files to get `Misprediction`-s from.
    :param feature_extractor: FeatureExtractor to use.
    :param rules: Rules to use for prediction.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param log: Logger.
    :return: List of `Misprediction`-s extracted from a given list of files.
    """
    origin_config = rules.origin_config
    parsed = parse_files(filepaths=filepaths,
                         line_length_limit=origin_config["line_length_limit"],
                         overall_size_limit=origin_config["overall_size_limit"],
                         client=client,
                         language=feature_extractor.language)
    # Unpack the full feature-extraction result; the auxiliary vnode data is
    # needed later for the UAST stability check.
    extracted = feature_extractor.extract_features(parsed)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = extracted
    y_pred, rule_winners, _, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=feature_extractor)
    y_pred = rules.fill_missing_predictions(y_pred, y)
    # Drop predictions that would break the UAST when applied.
    stability_checker = UASTStabilityChecker(feature_extractor=feature_extractor)
    y, y_pred, vnodes_y, rule_winners, safe_preds = stability_checker.check(
        y=y,
        y_pred=y_pred,
        vnodes_y=vnodes_y,
        vnodes=vnodes,
        files=list(parsed),
        stub=client._stub,
        vnode_parents=vnode_parents,
        node_parents=node_parents,
        rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    return get_mispreds(y, y_pred, vnodes_y, rule_winners)
def test_parse_files(self):
    """parse_files should drop over-limit files and report them via log.debug."""
    class Log:
        """Records whether ``debug`` was called at least once."""

        def debug(self, *args, **kwargs):
            nonlocal logged
            logged = True

    logged = False
    with NamedTemporaryFile(prefix="one", suffix=".js") as tmp1, \
            NamedTemporaryFile(prefix="two", suffix=".js") as tmp2:
        tmp1.write(b"hello")
        tmp1.seek(0)
        tmp2.write(b"world" * 100)  # 500 bytes — exceeds the 80-char line limit
        tmp2.seek(0)
        # Create the client *before* the try block: if the constructor raises,
        # the finally clause would otherwise fail with a NameError on
        # ``bblfsh_client`` and mask the original error.
        bblfsh_client = BblfshClient("0.0.0.0:9432")
        try:
            filtered = parse_files(filepaths=[tmp1.name, tmp2.name],
                                   line_length_limit=80,
                                   overall_size_limit=5 << 20,
                                   client=bblfsh_client,
                                   language="javascript",
                                   log=Log())
            self.assertEqual(len(filtered), 1)
            self.assertEqual(filtered[0].content, b"hello")
            self.assertTrue(logged)
        finally:
            bblfsh_client._channel.close()
def files2vnodes(filepaths: Iterable[str], feature_extractor: FeatureExtractor,
                 rules: Rules, client: BblfshClient) -> Iterable[VirtualNode]:
    """
    Return the `VirtualNode`-s extracted from a list of files.

    :param filepaths: List of files to get `Misprediction`-s and `VirtualNode`-s from.
    :param feature_extractor: FeatureExtractor to use.
    :param rules: Rules to use for prediction.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :return: List of `VirtualNode`-s extracted from a given list of files.
    """
    origin_config = rules.origin_config
    parsed = parse_files(filepaths=filepaths,
                         line_length_limit=origin_config["line_length_limit"],
                         overall_size_limit=origin_config["overall_size_limit"],
                         client=client,
                         language=feature_extractor.language)
    # Only the labeled vnodes are needed; discard features and auxiliary data.
    _, _, vnodes_data = feature_extractor.extract_features(parsed)
    return vnodes_data[0]
def test(self):
    """End-to-end check of analyze_files on the find_chrome.js fixture pair."""
    fixes = []  # type: List[FileFix]
    # Close the client's channel in ``finally`` so the test does not leak a
    # gRPC connection (analyze_files creates its own client internally, so
    # this one is only needed for the parse_files call below).
    bblfsh_client = BblfshClient(self.bblfsh_endpoint)
    try:
        basedir = os.path.dirname(__file__)
        base_files = parse_files(
            filepaths=[os.path.join(basedir, "find_chrome_base.js")],
            line_length_limit=500,
            overall_size_limit=5 << 20,
            client=bblfsh_client,
            language="javascript")
        # Pretend the "base" revision lives at the "head" path so the analyzer
        # diffs base vs. head contents of the same file.
        base_files[0].path = os.path.join(basedir, "find_chrome_head.js")

        class Runner(FormatAnalyzerSpy):
            def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                        data_service: DataService, **data) -> List[Comment]:
                """
                We run the analysis on the single pair of files: `find_chrome.js`.
                """
                class FakeStub:
                    def GetFiles(self, *args, **kwargs):
                        return base_files

                class FakeDataService:
                    def get_data(self):
                        return FakeStub()

                fixes.extend(
                    self.run(ptr_from, data_service, FakeDataService()))
                return []

        log = logging.getLogger(type(self).__name__)
        analyze_files(
            Runner,
            {},
            os.path.join(basedir, "style.format.analyzer.FormatAnalyzer_1.asdf"),
            "javascript",
            self.bblfsh_endpoint,
            os.path.join(basedir, "find_chrome_head.js"),
            log)
    finally:
        bblfsh_client._channel.close()
    self.assertEqual(len(fixes), 1)
    self.assertEqual(len(fixes[0].line_fixes), 1)
    fix = fixes[0].line_fixes[0]  # type: LineFix
    self.assertEqual(fix.line_number, 22)
    self.assertEqual(
        fix.suggested_code,
        "const execFileSync = require('child_process').execFileSync;")
def train(training_dir: str, ref: ReferencePointer, output_path: str, language: str,
          bblfsh: str, config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to repository for training.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training or \
                   json-like object with a config.
    :param log: Logger used to report during training.
    :return: Trained FormatModel.
    """
    bblfsh_client = BblfshClient(bblfsh)
    # Normalize `config` to a dict: a string is treated as a YAML file path,
    # None means "use defaults", and a dict is passed through unchanged.
    if config is None:
        config = {}
    elif isinstance(config, str):
        with open(config) as fh:
            config = safe_load(fh)
    config = FormatAnalyzer._load_config(config)
    # Hoist the repeated per-language lookup out of the parse_files call.
    lang_train_config = config["train"][language]
    filepaths = glob.glob(os.path.join(training_dir, "**", "*.js"), recursive=True)
    model = FormatAnalyzer.train(
        ref,
        config,
        FakeDataService(
            bblfsh_client=bblfsh_client,
            files=parse_files(filepaths=filepaths,
                              line_length_limit=lang_train_config["line_length_limit"],
                              overall_size_limit=lang_train_config["overall_size_limit"],
                              client=bblfsh_client,
                              language=language,
                              log=log),
            changes=None))
    model.save(output_path)
    return model