def test_babelfish_unicode(self):
    """Unicode and byte parsing modes must agree on errors and stay transformable."""
    raw = b"console.log('\xc3\x80');"
    code = raw.decode()
    uast_uni, errors_uni = parse_uast(self.data_service.get_bblfsh(), code, "test.js",
                                      unicode=True)
    uast, errors = parse_uast(self.data_service.get_bblfsh(), code, "test.js",
                              unicode=False)
    self.assertIsInstance(uast, bblfsh.Node)
    self.assertIsInstance(uast_uni, bblfsh.Node)
    self.assertEqual(errors_uni, errors)
    check_uast_transformation(self, raw, uast, uast_uni)
def test_babelfish(self):
    """A trivial JS snippet must parse into a UAST without any errors."""
    tree, parse_errors = parse_uast(self.data_service.get_bblfsh(), "console.log('hi');",
                                    "hi.js", unicode=False)
    self.assertIsInstance(tree, bblfsh.Node)
    self.assertEqual(len(parse_errors), 0, str(parse_errors))
def test_multiple_files(self):
    """The stability checker must filter predictions spanning several files at once."""
    data = [
        ("var a = 0", {1: (CLS_NOOP,)}),
        ("var b = 123", {4: (CLS_NOOP,)}),
    ]
    files = []
    for file_index, (code, _) in enumerate(data):
        uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                                  unicode=True)
        if errors:
            self.fail("Could not parse the testing code.")
        files.append(UnicodeFile(content=code, uast=uast,
                                 path="test_file_%d" % file_index, language="javascript"))
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = self.fe.extract_features(files)
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    # Overwrite the predictions at the requested sample indices for every file.
    for _, modifications in data:
        for sample_index, classes in modifications.items():
            y_pred[sample_index] = self._to_label(classes)
    checker = UASTStabilityChecker(self.fe)
    new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents, node_parents,
        rule_winners, grouped_quote_predictions={})
    self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                  quote_indices: Optional[Tuple[int, ...]] = None,
                  bad_indices: Optional[FrozenSet[int]] = None) -> None:
    """
    Apply label modifications to the given code and verify the checker's verdict.

    :param code: JavaScript snippet to parse and check.
    :param modifs: Mapping from sample index to the class sequence to predict instead.
    :param quote_indices: Vnode indices to group together as quote predictions.
    :param bad_indices: Expected unsafe prediction indices; defaults to the modified ones.
    """
    uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                              unicode=True)
    if errors:
        self.fail("Could not parse the testing code.")
    sample_file = UnicodeFile(content=code, uast=uast, path="test_file",
                              language="javascript")
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
        self.fe.extract_features([sample_file])
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    for index, classes in modifs.items():
        y_pred[index] = self._to_label(classes)
    checker = UASTStabilityChecker(self.fe)
    grouped_quote_predictions = self._grouped_predictions_mapping(vnodes, quote_indices)
    new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, [sample_file], self.stub, vnode_parents, node_parents,
        rule_winners, grouped_quote_predictions=grouped_quote_predictions)
    bad = modifs.keys() if bad_indices is None else bad_indices
    # Everything outside `safe_preds` must be exactly the expected unsafe set.
    self.assertEqual(set(range(y.shape[0])) - set(safe_preds), bad)
    # All filtered outputs shrink by exactly the number of unsafe predictions.
    for original, filtered in ((y, new_y), (y_pred, new_y_pred),
                               (vnodes_y, new_vnodes_y), (rule_winners, new_rule_winners)):
        self.assertEqual(len(original) - len(bad), len(filtered))
def _parse_code(
        parent: bblfsh.Node, content: str, stub: "bblfsh.aliases.ProtocolServiceStub",
        parsing_cache: MutableMapping[int, Optional[Tuple[bblfsh.Node, int, int]]],
        language: str, node_parents: Mapping[int, bblfsh.Node], path: str,
) -> Optional[Tuple[bblfsh.Node, int, int]]:
    """
    Walk up from `parent` until Babelfish manages to parse an ancestor's span.

    Every node visited on the way (a cache miss) is memoized in `parsing_cache` with
    the final outcome, so later queries starting below the same ancestors do not
    re-parse. Returns None when the root is reached without a successful parse.

    :param parent: Node to start from; the search climbs towards the root on failure.
    :param content: Content of the file.
    :param stub: Babelfish GRPC service stub.
    :param parsing_cache: Cache to avoid the recomputation of the results for already \
                          seen nodes.
    :param language: Language to use for Babelfish.
    :param node_parents: Parents mapping of the input UASTs.
    :param path: Path of the file being parsed, used for logging only.
    :return: Optional tuple of the parsed UAST and the corresponding starting and ending \
             offsets.
    """
    visited = []
    node = parent
    result = None
    while node is not None:
        key = id(node)
        if key in parsing_cache:
            result = parsing_cache[key]
            break
        visited.append(node)
        begin = node.start_position.offset
        stop = node.end_position.offset
        subtree, parse_errors = parse_uast(stub, content[begin:stop], filename="",
                                           language=language)
        if not parse_errors:
            result = subtree, begin, stop
            break
        node = node_parents.get(key)
    else:
        # Climbed past the root without a single successful parse.
        _log.warning(
            "skipped file %s, due to errors in parsing the whole content", path)
    # Record the outcome (success, cached value or None) for every node we visited.
    for seen in visited:
        parsing_cache[id(seen)] = result
    return result
def _parse_code(self, vnode: VirtualNode, parent: bblfsh.Node, content: str,
                stub: "bblfsh.aliases.ProtocolServiceStub",
                node_parents: Mapping[int, bblfsh.Node], path: str,
                ) -> Optional[Tuple[bblfsh.Node, int, int]]:
    """
    Find a parent node that Babelfish can parse and parse it.

    Iterates over the parents of the current virtual node until it is parsable and returns
    the parsed UAST or None if it reaches the root without finding a parsable parent.

    The cache (`self._parsing_cache`) will be used to avoid recomputations for parents that
    have already been considered.

    :param vnode: Vnode that is modified. Used to check that we retrieve the correct parent.
    :param parent: First virtual node to try to parse. Will go up in the tree if it fails.
    :param content: Content of the file.
    :param stub: Babelfish GRPC service stub.
    :param node_parents: Parents mapping of the input UASTs.
    :param path: Path of the file being parsed.
    :return: tuple of the parsed UAST and the corresponding starting and ending offsets. \
             None if Babelfish failed to parse the whole file.
    """
    descendants = []
    current_ancestor = parent
    while current_ancestor is not None:
        # Cache hit: reuse the outcome recorded for this exact node object.
        if id(current_ancestor) in self._parsing_cache:
            result = self._parsing_cache[id(current_ancestor)]
            break
        descendants.append(current_ancestor)
        start, end = (current_ancestor.start_position.offset,
                      current_ancestor.end_position.offset)
        # Only attempt ancestors whose span covers the modified vnode.
        # NOTE(review): the comparison is asymmetric (`<=` on start but strict `>` on end),
        # so an ancestor ending exactly at vnode.end.offset is skipped - confirm intended.
        if start <= vnode.start.offset and end > vnode.end.offset:
            uast, errors = parse_uast(stub, content[start:end], filename="", unicode=True,
                                      language=self._feature_extractor.language)
            if not errors:
                result = uast, start, end
                break
        current_ancestor = node_parents.get(id(current_ancestor), None)
    else:
        # Climbed past the root without a successful parse: give up on this file.
        result = None
        self._log.warning("skipped file %s, due to errors in parsing the whole content",
                          path)
    # Memoize the outcome for every node visited during this climb.
    for descendant in descendants:
        self._parsing_cache[id(descendant)] = result
    return result
def _check_file(
        self, y: numpy.ndarray, y_pred: numpy.ndarray, vnodes_y: Sequence[VirtualNode],
        vnodes: Sequence[VirtualNode], file: File, stub: "bblfsh.aliases.ProtocolServiceStub",
        vnode_parents: Mapping[int, bblfsh.Node], node_parents: Mapping[int, bblfsh.Node],
        rule_winners: numpy.ndarray, grouped_quote_predictions: QuotedNodeTripleMapping,
) -> _check_return_type:
    """
    Filter the predictions for one file that modify its UAST beyond changing positions.

    :param y: Numpy 1-dimensional array of ground truth labels.
    :param y_pred: Numpy 1-dimensional array of predicted labels.
    :param vnodes_y: Sequence of the labeled `VirtualNode`-s corresponding to the samples.
    :param vnodes: Sequence of all the `VirtualNode`-s of the file.
    :param file: File with content, uast and path.
    :param stub: Babelfish GRPC service stub.
    :param vnode_parents: `VirtualNode`-s' parents mapping.
    :param node_parents: Parents mapping of the input UASTs.
    :param rule_winners: Numpy array of the index of the winning rule for each sample.
    :param grouped_quote_predictions: Quotes predictions (handled as inseparable triples).
    :return: Filtered y, y_pred, vnodes_y, rule_winners and the safe prediction indices.
    """
    # TODO(warenlg): Add current algorithm description.
    # TODO(vmarkovtsev): Apply ML to not parse all the parents.
    self._parsing_cache = {}
    unsafe_preds = []
    # NOTE(review): assumes `file.content` is bytes; the tests in this file construct
    # `UnicodeFile(content=<str>)` - confirm the caller encodes before reaching here.
    file_content = file.content.decode("utf-8", "replace")
    vnodes_i = 0
    # Only predictions that actually change the label need UAST validation.
    changes = numpy.where((y_pred != -1) & (y != y_pred))[0]
    start_offset_to_vnodes = {}
    end_offset_to_vnodes = {}
    for i, vnode in enumerate(vnodes):
        if vnode.start.offset not in start_offset_to_vnodes:
            # NOOP always included
            start_offset_to_vnodes[vnode.start.offset] = i
    for i, vnode in enumerate(vnodes[::-1]):
        if vnode.end.offset not in end_offset_to_vnodes:
            # NOOP always included that is why we have reverse order in this loop
            end_offset_to_vnodes[vnode.end.offset] = len(vnodes) - i
    for i in changes:
        vnode_y = vnodes_y[i]
        # Advance the pointer into `vnodes` until it reaches the current labeled node.
        while vnode_y is not vnodes[vnodes_i]:
            vnodes_i += 1
            if vnodes_i >= len(vnodes):
                raise AssertionError("vnodes_y and vnodes are not consistent.")
        if id(vnode_y) in grouped_quote_predictions:
            # quote types are special case: both quotes are checked as one triple
            group = grouped_quote_predictions[id(vnode_y)]
            if group is None:
                # already handled with the previous vnode
                continue
            vnode1, vnode2, vnode3 = group
            content_before = file_content[vnode1.start.offset:vnode3.end.offset]
            content_after = (self._feature_extractor.label_to_str(y_pred[i]) + vnode2.value +
                             self._feature_extractor.label_to_str(y_pred[i + 1]))
            # NOTE(review): these parse calls do not pass unicode=True unlike _parse_code;
            # confirm the offsets/content conventions match.
            parsed_before, errors = parse_uast(stub, content_before, filename="",
                                               language=self._feature_extractor.language)
            if not errors:
                parsed_after, errors = parse_uast(stub, content_after, filename="",
                                                  language=self._feature_extractor.language)
                if not self.check_uasts_equivalent(parsed_before, parsed_after):
                    unsafe_preds.append(i)
                    unsafe_preds.append(i + 1)  # Second quote
            continue
        # Bug fix: `_parse_code` takes the modified vnode as its first (non-self) argument;
        # it was previously omitted, shifting every following argument by one position.
        parsed_before = self._parse_code(vnode_y, vnode_parents[id(vnode_y)], file_content,
                                         stub, node_parents, vnode_y.path)
        if parsed_before is None:
            continue
        parent_before, start, end = parsed_before
        vnode_start_index = start_offset_to_vnodes[start]
        vnode_end_index = end_offset_to_vnodes[end]
        # The reparsed span must contain the vnode whose label is being changed.
        assert vnode_start_index <= vnodes_i < vnode_end_index
        try:
            content_after = self._code_generator.generate_one_change(
                vnodes[vnode_start_index:vnode_end_index],
                vnodes_i - vnode_start_index, y_pred[i])
        except CodeGenerationBaseError as e:
            self._log.debug("Code generator can't generate code: %s", repr(e.args))
            unsafe_preds.append(i)
            continue
        parent_after, errors_after = parse_uast(
            stub, content_after, filename="", language=self._feature_extractor.language)
        if errors_after:
            unsafe_preds.append(i)
            continue
        if not self.check_uasts_equivalent(parent_before, parent_after):
            if self._debug:
                self._log.debug(
                    "Bad prediction\nfile:%s\nDiff:\n%s\n\n", vnode_y.path,
                    "\n".join(line for line in difflib.unified_diff(
                        file_content[start:end].splitlines(), content_after.splitlines(),
                        fromfile="original", tofile="suggested", lineterm="")))
            unsafe_preds.append(i)
    self._log.info("%d filtered out of %d with changes", len(unsafe_preds), changes.shape[0])
    unsafe_preds = frozenset(unsafe_preds)
    safe_preds = numpy.array([i for i in range(len(y)) if i not in unsafe_preds])
    vnodes_y = [vn for i, vn in enumerate(list(vnodes_y)) if i not in unsafe_preds]
    return y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[safe_preds], safe_preds
def filter_uast_breaking_preds(
        y: numpy.ndarray, y_pred: numpy.ndarray, vnodes_y: Sequence[VirtualNode],
        vnodes: Sequence[VirtualNode], files: Mapping[str, File],
        feature_extractor: FeatureExtractor, stub: "bblfsh.aliases.ProtocolServiceStub",
        vnode_parents: Mapping[int, bblfsh.Node], node_parents: Mapping[int, bblfsh.Node],
        rule_winners: numpy.ndarray, grouped_quote_predictions: QuotedNodeTripleMapping,
) -> Tuple[numpy.ndarray, numpy.ndarray, Sequence[VirtualNode], numpy.ndarray,
           Sequence[int]]:
    """
    Filter the model's predictions that modify the UAST apart from changing positions.

    :param y: Numpy 1-dimensional array of labels.
    :param y_pred: Numpy 1-dimensional array of predicted labels by the model.
    :param vnodes_y: Sequence of the labeled `VirtualNode`-s corresponding to labeled samples.
    :param vnodes: Sequence of all the `VirtualNode`-s corresponding to the input.
    :param files: Dictionary of File-s with content, uast and path.
    :param feature_extractor: FeatureExtractor used to extract features.
    :param stub: Babelfish GRPC service stub.
    :param vnode_parents: `VirtualNode`-s' parents mapping as the LCA of the closest
                          left and right babelfish nodes.
    :param node_parents: Parents mapping of the input UASTs.
    :param rule_winners: Numpy array of the index of the winning rule for each sample.
    :param grouped_quote_predictions: Quotes predictions (handled differently from the rest).
    :return: Tuple of the filtered y, y_pred, vnodes_y, rule_winners and the list of
             prediction indices that are considered valid, i.e. that are not breaking
             the UAST.
    """
    safe_preds = []
    current_path = None  # type: Optional[str]
    parsing_cache = {}  # type: Dict[int, Optional[Tuple[bblfsh.Node, int, int]]]
    file_content = None  # type: Optional[str]
    cur_i = 0
    for i, (gt, pred, vn_y) in enumerate(zip(y, y_pred, vnodes_y)):
        # Samples are grouped by file: reset the per-file state on a path change.
        if vn_y.path != current_path:
            parsing_cache = {}
            current_path = vn_y.path
            file_content = files[vn_y.path].content.decode("utf-8", "replace")
        # Advance the pointer into `vnodes` until it reaches the current labeled node.
        while vn_y is not vnodes[cur_i]:
            cur_i += 1
            if cur_i >= len(vnodes):
                raise AssertionError("vnodes_y and vnodes are not consistent.")
        # quote types are special cased
        if id(vn_y) in grouped_quote_predictions:
            # NOTE(review): this `pred_string` assignment is never read in this branch
            # (it is recomputed below after the gt == pred check) - dead store.
            pred_string = feature_extractor.label_to_str(pred)
            group = grouped_quote_predictions[id(vn_y)]
            # already handled with the previous vnode
            if group is None:
                continue
            vnode1, vnode2, vnode3 = group
            content_before = file_content[vnode1.start.offset:vnode3.end.offset]
            content_after = (feature_extractor.label_to_str(y_pred[i]) + vnode2.value
                             + feature_extractor.label_to_str(y_pred[i + 1]))
            parsed_before, errors = parse_uast(
                stub, content_before, filename="", language=feature_extractor.language)
            if not errors:
                parsed_after, errors = parse_uast(
                    stub, content_after, filename="", language=feature_extractor.language)
                # Both quote predictions are accepted or rejected together.
                if check_uasts_are_equal(parsed_before, parsed_after):
                    safe_preds.extend((i, i + 1))
            continue
        # A prediction equal to the ground truth cannot break the UAST.
        if gt == pred:
            safe_preds.append(i)
            continue
        pred_string = feature_extractor.label_to_str(pred)
        parsed_before = _parse_code(vnode_parents[id(vn_y)], file_content, stub,
                                    parsing_cache, feature_extractor.language,
                                    node_parents, vn_y.path)
        if parsed_before is None:
            continue
        parent_before, start, end = parsed_before
        # when the input node value is NOOP i.e. an empty string, the replacement is
        # restricted to the first occurrence
        output_pred = "".join(n.value for n in vnodes[cur_i:cur_i + 2]).replace(
            vn_y.value, pred_string, 1)
        diff_pred_offset = len(pred_string) - len(vn_y.value)
        try:
            # to handle mixed indentations, we include the `VirtualNode` following the
            # predicted one in the output predicted string, and start the rest of the
            # sequence one `VirtualNode` further to avoid its repetitions
            start_next_vnode = vn_y.start.offset + len(vn_y.value) \
                + len(vnodes[cur_i + 1].value)
            content_after = (file_content[:vn_y.start.offset] + output_pred
                             + file_content[start_next_vnode:])
        # in case the prediction to check corresponds to the last label of a file
        except IndexError:
            content_after = file_content[:vn_y.start.offset] \
                + output_pred
        # Re-slice the parseable parent's span, shifted by the prediction's length change.
        content_after = content_after[start:end + diff_pred_offset]
        parent_after, errors_after = parse_uast(
            stub, content_after, filename="", language=feature_extractor.language)
        if not errors_after:
            if check_uasts_are_equal(parent_before, parent_after):
                safe_preds.append(i)
    _log.info("Non UAST breaking predictions: %d selected out of %d",
              len(safe_preds), y_pred.shape[0])
    vnodes_y = [vn for i, vn in enumerate(list(vnodes_y)) if i in safe_preds]
    return y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[
        safe_preds], safe_preds