def test_positions(self):
    """Check that parsed node start positions never decrease between neighbors."""
    js_path = Path(__file__).parent / "browser-policy-content.js"
    with open(str(js_path), mode="rb") as fh:
        code = fh.read()
    uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
        filename="", language="javascript", contents=code).uast
    converter = BytesToUnicodeConverter(code)
    file = UnicodeFile(content=converter.convert_content(),
                       uast=converter.convert_uast(uast),
                       language="javascript", path="test.js")
    annotated_data = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_data)
    nodes, _ = file_to_old_parse_file_format(annotated_data)
    # Adjacent pairs must be ordered by start line and by start offset.
    for index, (left, right) in enumerate(zip(nodes, nodes[1:])):
        self.assertLessEqual(
            left.start.line, right.start.line,
            "Start line position decrease for %d, %d nodes" % (index, index + 1))
        self.assertLessEqual(
            left.start.offset, right.start.offset,
            "Start offset position decrease for %d, %d nodes" % (index, index + 1))
def test_extract_features_all_lines(self):
    """Extract features for every line of the fixture and validate the X/y output."""
    file = UnicodeFile(content=self.contents, uast=self.uast,
                       path="test.js", language="javascript")
    all_lines = list(range(1, self.contents.count("\n") + 1))
    result = self.extractor.extract_features([file, file], [all_lines] * 2)
    self.check_X_y(*result)
def generate_local_test(mcs, case_name, uast, contents):
    """Build a test method that applies one case's predictions and checks the generated code."""
    fe_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
    feature_extractor = FeatureExtractor(
        language="javascript", label_composites=label_composites,
        **fe_config["feature_extractor"])
    file = UnicodeFile(content=contents, uast=uast, path="", language="")
    _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
    offsets, y_pred, result = cases[case_name]

    def _test(self):
        y_cur = deepcopy(self.y)
        # Map each predicted offset to its labeled vnode and override the label.
        for offset, yi in zip(offsets, y_pred):
            match_idx = None
            for match_idx, vnode in enumerate(vnodes_y):  # noqa: B007
                if offset == vnode.start.offset:
                    break
            y_cur[match_idx] = yi
        code_generator = CodeGenerator(self.feature_extractor)
        pred_vnodes = code_generator.apply_predicted_y(
            self.vnodes, self.vnodes_y, list(range(len(self.vnodes_y))),
            FakeRules(y_cur))
        self.assertEqual(code_generator.generate(pred_vnodes), result)

    return _test
def setUpClass(cls):
    """Extract features from the benchmark fixture with minimal windows for feature tests."""
    overrides = {
        "train": {
            "javascript": {
                "feature_extractor": {
                    "left_siblings_window": 1,
                    "right_siblings_window": 1,
                    "parents_depth": 1,
                    "node_features": ["start_line", "reserved", "roles"],
                },
            },
        },
    }
    config = FormatAnalyzer._load_config(merge_dicts(get_config(), overrides))["train"]
    base = Path(__file__).parent
    with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    file = UnicodeFile(content=contents, uast=uast, path="", language="")
    cls.fe = FeatureExtractor(language="javascript",
                              **config["javascript"]["feature_extractor"])
    cls.fe.extract_features([file, file])
    cls.class_representations = cls.fe.composite_class_representations
    cls.n_classes = len(cls.fe.labels_to_class_sequences)
    # Pre-fetch one feature of each kind for the individual tests.
    cls.ordinal = cls.return_node_feature(FeatureId.start_line)
    cls.categorical = cls.return_node_feature(FeatureId.reserved)
    cls.bag = cls.return_node_feature(FeatureId.roles)
def test_multiple_files(self):
    """Run the stability checker over two files, each with one modified label."""
    data = [
        ("var a = 0", {1: (CLS_NOOP,)}),
        ("var b = 123", {4: (CLS_NOOP,)}),
    ]
    files = []
    for i, (code, _) in enumerate(data):
        uast, errors = parse_uast(self.stub, code, filename="",
                                  language=self.language, unicode=True)
        if errors:
            self.fail("Could not parse the testing code.")
        files.append(UnicodeFile(content=code, uast=uast,
                                 path="test_file_%d" % i, language="javascript"))
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
        self.fe.extract_features(files)
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    for _, modif in data:
        for index in modif:
            y_pred[index] = self._to_label(modif[index])
    checker = UASTStabilityChecker(self.fe)
    _, _, _, _, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents,
        node_parents, rule_winners, grouped_quote_predictions={})
    self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                  quote_indices: Optional[Tuple[int, ...]] = None,
                  bad_indices: Optional[FrozenSet[int]] = None) -> None:
    """Apply label modifications to the code and verify the stability checker result.

    :param code: JavaScript source to parse and label.
    :param modifs: labeled-node index -> replacement class sequence.
    :param quote_indices: vnode indices whose quote predictions are grouped.
    :param bad_indices: expected unsafe indices; defaults to the modified ones.
    """
    uast, errors = parse_uast(self.stub, code, filename="",
                              language=self.language, unicode=True)
    if errors:
        self.fail("Could not parse the testing code.")
    file = UnicodeFile(content=code, uast=uast, path="test_file",
                       language="javascript")
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
        self.fe.extract_features([file])
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    for index, classes in modifs.items():
        y_pred[index] = self._to_label(classes)
    checker = UASTStabilityChecker(self.fe)
    quotes = self._grouped_predictions_mapping(vnodes, quote_indices)
    new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, [file], self.stub, vnode_parents,
        node_parents, rule_winners, grouped_quote_predictions=quotes)
    bad = modifs.keys() if bad_indices is None else bad_indices
    self.assertEqual(set(range(y.shape[0])) - set(safe_preds), bad)
    # Unsafe predictions must be dropped from every returned array.
    self.assertEqual(len(y) - len(bad), len(new_y))
    self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
    self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
    self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))
def test_extract_features(self):
    """Feature extraction over two identical files must succeed and pass validation."""
    file = UnicodeFile(content=self.contents, uast=self.uast,
                       path="test.js", language="javascript")
    res = self.extractor.extract_features([file, file])
    self.assertIsNotNone(res, "Failed to parse files.")
    self.check_X_y(*res)
def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
    """Parse the code and return the class sequence of every labeled vnode."""
    uast = client.parse(filename="", language="javascript",
                        contents=code.encode()).uast
    extractor = FeatureExtractor(language="javascript", **config)
    file = UnicodeFile(content=code, uast=uast, path="", language="javascript")
    result = extractor.extract_features([file])
    if result is None:
        self.fail("Could not parse test code.")
    _, _, (vnodes_y, _, _, _) = result
    return [vnode.y for vnode in vnodes_y]
def test_extract_features_exact_match(self):
    """Concatenating all vnode values must reproduce the input file verbatim."""
    file = UnicodeFile(content=self.contents, uast=self.uast,
                       path="test.js", language="javascript")
    _, _, (vnodes_y, vnodes, _, _) = self.extractor.extract_features([file])
    reconstructed = "".join(vnode.value for vnode in vnodes)
    self.assertEqual(reconstructed, self.contents)
def return_features() -> Response:
    """Featurize the given code.

    Expects a JSON body with "code", "babelfish_address" and "language" keys
    and returns the extracted features, predictions and rule metadata as JSON.

    :return: Flask JSON response.
    :raises NotFittedError: if the model has no rules for the requested language.
    """
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    # NOTE(review): the client is never closed/disconnected — confirm whether
    # this BblfshClient version exposes a close API worth calling here.
    client = BblfshClient(babelfish_address)
    parse_response = client.parse(filename="", contents=code.encode(), language=language)
    if parse_response.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    # BUG FIX: the file language was hard-coded to "javascript" even though the
    # requested language is validated against the model above; use `language`.
    file = UnicodeFile(content=code, uast=parse_response.uast,
                       language=language, path="path")
    # BUG FIX: copy the feature extractor config instead of mutating the dict
    # stored inside the loaded model's origin_config.
    config = dict(rules.origin_config["feature_extractor"])
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    # `predict` returns the subset of rules actually applied; keep it under a
    # distinct name instead of shadowing the model rules.
    y_pred, rule_winners, applied_rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file],
        stub=client._stub, vnode_parents=vnode_parents,
        node_parents=node_parents, rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    # Mark every prediction that the stability checker rejected.
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(applied_rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(applied_rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable,
                                   labeled_indices=labeled_indices), vnodes)),
        "config": _mapping_to_jsonable(applied_rules.origin_config)})
def test_extract_features_some_lines(self):
    """Features for the first half of the lines must be a prefix of the full run."""
    file = UnicodeFile(content=self.contents, uast=self.uast,
                       path="test.js", language="javascript")
    files = [file]
    half = self.contents.count("\n") // 2
    # NOTE(review): the line-filter list is duplicated (* 2) although only one
    # file is passed; presumably extract_features zips files with filters and
    # ignores the extra entry — confirm.
    X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents) = self.extractor.extract_features(
        files, [list(range(1, half + 1))] * 2)
    self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents))
    X2_csr, y2, (vn2_y, vn2, _, _) = self.extractor.extract_features(files)
    X1, X2 = X1_csr.toarray(), X2_csr.toarray()
    self.assertTrue((X1 == X2[:len(X1)]).all())
    self.assertTrue((y1 == y2[:len(y1)]).all())
    self.assertTrue(vn1_y == vn2_y[:len(vn1_y)])
    self.assertLess(len(y1), len(y2))
def setUpClass(cls):
    """Load the JS benchmark fixture and build a configured FeatureExtractor."""
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    cls.files = [UnicodeFile(content=contents, uast=uast, path="",
                             language="javascript")]
    config = FormatAnalyzer._load_config(get_config())["train"]
    cls.extractor = FeatureExtractor(
        language="javascript", **config["javascript"]["feature_extractor"])
def convert_file(file: File) -> UnicodeFile:
    """
    Convert lookout `File` to `UnicodeFile` with converted content and uast.

    path and language fields are the same for result and provided `File` instance.

    :param file: lookout File to convert.
    :return: New UnicodeFile instance.
    """
    converter = BytesToUnicodeConverter(file.content)
    unicode_content = converter.convert_content()
    unicode_uast = converter.convert_uast(file.uast)
    return UnicodeFile(content=unicode_content, uast=unicode_uast,
                       path=file.path, language=file.language)
def setUpClass(cls):
    """Extract features from the small benchmark once for all tests in the class."""
    cls.maxDiff = None
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    fe_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
    cls.feature_extractor = FeatureExtractor(
        language="javascript", label_composites=label_composites,
        **fe_config["feature_extractor"])
    cls.file = UnicodeFile(content=contents, uast=uast, path="", language="")
    cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents) = \
        cls.feature_extractor.extract_features([cls.file])
def test_generate_new_line(self):
    """Check generate_new_line output for a set of prediction scenarios."""
    self.maxDiff = None
    expected_res = {
        "nothing changed": [],
        "remove new line in the end of 4th line": None,
        "indentation in the beginning": [
            " import { makeToast } from '../../common/app/Toasts/redux';"],
        "remove indentation in the 4th line till the end": [
            " return Object.keys(flash)", " }"],
        "new line between 6th and 7th regular code lines": [
            "\n return messages.map(message => ({"],
        "new line in the middle of the 7th code line with indentation increase": [
            " return messages\n .map(message => ({", " })"],
        "new line in the middle of the 7th code line with indentation decrease": [
            " return messages\n .map(message => ({", " })"],
        "new line in the middle of the 7th code line without indentation increase": [
            " return messages\n .map(message => ({"],
        "change quotes": [
            'import { makeToast } from "../../common/app/Toasts/redux";'],
        "remove indentation decrease 11th line": [" }));"],
        "change indentation decrease to indentation increase 11th line": [" }));"],
        "change indentation decrease to indentation increase 11th line but keep the rest": [
            " }));", "})"],
    }
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    fe_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
    for case, expected in expected_res.items():
        offsets, y_pred, _ = cases[case]
        # Extraction is redone per case on fresh objects to keep cases isolated.
        feature_extractor = FeatureExtractor(
            language="javascript", label_composites=label_composites,
            **fe_config["feature_extractor"])
        file = UnicodeFile(content=contents, uast=uast, path="", language="")
        y_tuple = feature_extractor.extract_features([file])
        X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = y_tuple
        y_cur = deepcopy(y)
        # Override the label of the vnode matching each predicted offset.
        for offset, yi in zip(offsets, y_pred):
            matched = None
            for matched, vnode in enumerate(vnodes_y):  # noqa: B007
                if offset == vnode.start.offset:
                    break
            y_cur[matched] = yi
        code_generator = CodeGenerator(feature_extractor)
        pred_vnodes = code_generator.apply_predicted_y(
            vnodes, vnodes_y, list(range(len(vnodes_y))), FakeRules(y_cur))
        res = []
        for gln in FormatAnalyzer._group_line_nodes(
                y, y_cur, vnodes_y, pred_vnodes, [1] * len(y)):
            _, (_, _, _, line_vnodes, _) = gln
            res.append(code_generator.generate_new_line(line_vnodes))
        if expected is not None:
            # None means some lines get deleted; that case is not handled yet.
            self.assertEqual(res, expected, case)