Example No. 1
    def test_babelfish_unicode(self):
        content = b"console.log('\xc3\x80');"

        uast_uni, errors_uni = parse_uast(self.data_service.get_bblfsh(),
                                          content.decode(),
                                          "test.js",
                                          unicode=True)
        uast, errors = parse_uast(self.data_service.get_bblfsh(),
                                  content.decode(),
                                  "test.js",
                                  unicode=False)
        self.assertIsInstance(uast, bblfsh.Node)
        self.assertIsInstance(uast_uni, bblfsh.Node)
        self.assertEqual(errors_uni, errors)
        check_uast_transformation(self, content, uast, uast_uni)
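
The test above exercises both offset conventions over content that contains a multi-byte character. A minimal standalone sketch (plain Python, no Babelfish required) of why the unicode flag matters: byte offsets and unicode code-point offsets diverge as soon as the source contains non-ASCII characters.

# 'À' is a single code point but two UTF-8 bytes, so positions measured in the
# raw byte string and in the decoded string no longer coincide.
content = b"console.log('\xc3\x80');"
text = content.decode()
assert len(content) == 18  # offsets counted in bytes
assert len(text) == 17     # offsets counted in unicode code points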
Example No. 2
    def test_babelfish(self):
        uast, errors = parse_uast(self.data_service.get_bblfsh(),
                                  "console.log('hi');",
                                  "hi.js",
                                  unicode=False)
        self.assertIsInstance(uast, bblfsh.Node)
        self.assertEqual(len(errors), 0, str(errors))
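
parse_uast itself is not shown in these examples; under the classic Babelfish v1 Python client it presumably reduces to a call like the following sketch. The endpoint address and the response handling are assumptions, not part of the example.

import bblfsh

# assumed local bblfshd endpoint; parse() returns a response carrying the
# UAST root (a bblfsh.Node) and the list of parse errors
client = bblfsh.BblfshClient("0.0.0.0:9432")
response = client.parse(filename="hi.js", language="javascript",
                        contents="console.log('hi');")
uast, errors = response.uast, response.errors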
Example No. 3
    def test_multiple_files(self):
        data = [
            ("var a = 0",
             {1: (CLS_NOOP,)}),
            ("var b = 123",
             {4: (CLS_NOOP,)}),
        ]
        files = []
        for i, (code, _) in enumerate(data):
            uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                                      unicode=True)
            if errors:
                self.fail("Could not parse the testing code.")
            files.append(UnicodeFile(content=code, uast=uast, path="test_file_%d" % i,
                                     language="javascript"))
        X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = self.fe.extract_features(files)
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        for _, modif in data:
            for i in modif:
                y_pred[i] = self._to_label(modif[i])
        checker = UASTStabilityChecker(self.fe)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents,
            node_parents, rule_winners, grouped_quote_predictions={})
        self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
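
The y_pred mutation is the core of this test: copy the ground-truth labels, overwrite a few indices, and assert that exactly the broken ones get filtered out. A hypothetical miniature with plain numpy (the labels and indices are made up):

import numpy

y = numpy.arange(10)
y_pred = y.copy()
for index, label in {1: 100, 4: 200}.items():
    y_pred[index] = label  # simulate two "bad" predictions
# the checker later inspects exactly these positions
assert list(numpy.where(y != y_pred)[0]) == [1, 4]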
Example No. 4
    def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                      quote_indices: Optional[Tuple[int, ...]] = None,
                      bad_indices: Optional[FrozenSet[int]] = None) -> None:
        uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                                  unicode=True)
        if errors:
            self.fail("Could not parse the testing code.")
        file = UnicodeFile(content=code, uast=uast, path="test_file", language="javascript")
        X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = self.fe.extract_features([file])
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        for index, classes in modifs.items():
            y_pred[index] = self._to_label(classes)
        checker = UASTStabilityChecker(self.fe)
        grouped_quote_predictions = self._grouped_predictions_mapping(vnodes, quote_indices)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y, y_pred, vnodes_y, vnodes, [file], self.stub, vnode_parents,
            node_parents, rule_winners, grouped_quote_predictions=grouped_quote_predictions)
        bad_preds = set(range(y.shape[0])) - set(safe_preds)
        bad = modifs.keys() if bad_indices is None else bad_indices
        self.assertEqual(bad_preds, bad)
        self.assertEqual(len(y) - len(bad), len(new_y))
        self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
        self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
        self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))
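
A hypothetical invocation mirroring the data of Example No. 3 (the snippet and the index are assumptions): flag the prediction at index 1 as a NOOP replacement and expect exactly that index to be rejected as UAST-breaking.

self.edit_and_test("var a = 0", {1: (CLS_NOOP,)})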
Example No. 5
def _parse_code(
    parent: bblfsh.Node,
    content: str,
    stub: "bblfsh.aliases.ProtocolServiceStub",
    parsing_cache: MutableMapping[int, Optional[Tuple[bblfsh.Node, int, int]]],
    language: str,
    node_parents: Mapping[int, bblfsh.Node],
    path: str,
) -> Optional[Tuple[bblfsh.Node, int, int]]:
    """
    Find a parent node that Babelfish can parse and parse it.

    Iterates over the parents of the current virtual node until one is parseable and returns
    the parsed UAST, or None if the root is reached without finding a parseable parent.

    The cache will be used to avoid recomputations for parents that have already been considered.

    :param parent: First node to try to parse. Will go up the tree if parsing fails.
    :param content: Content of the file.
    :param stub: Babelfish GRPC service stub.
    :param parsing_cache: Cache to avoid the recomputation of the results for already seen nodes.
    :param language: Language to use for Babelfish.
    :param node_parents: Parents mapping of the input UASTs.
    :param path: Path of the file being parsed.
    :return: Optional tuple of the parsed UAST and the corresponding starting and ending offsets.
    """
    descendants = []
    current_ancestor = parent
    while current_ancestor is not None:
        if id(current_ancestor) in parsing_cache:
            result = parsing_cache[id(current_ancestor)]
            break
        descendants.append(current_ancestor)
        start, end = (current_ancestor.start_position.offset,
                      current_ancestor.end_position.offset)
        uast, errors = parse_uast(stub,
                                  content[start:end],
                                  filename="",
                                  language=language)
        if not errors:
            result = uast, start, end
            break
        current_ancestor = node_parents.get(id(current_ancestor), None)
    else:
        result = None
        _log.warning(
            "skipped file %s, due to errors in parsing the whole content",
            path)
    for descendant in descendants:
        parsing_cache[id(descendant)] = result
    return result
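
Note the while ... else construct above: the else branch runs only when the loop exhausts all ancestors without hitting break, i.e. when even the UAST root fails to parse. A minimal standalone illustration of the pattern:

def first_even(numbers):
    i = 0
    while i < len(numbers):
        if numbers[i] % 2 == 0:
            result = numbers[i]
            break  # found one: the else clause is skipped
        i += 1
    else:
        result = None  # the loop ended normally, like falling off the UAST root
    return result

assert first_even([1, 3, 4]) == 4
assert first_even([1, 3, 5]) is None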
Example No. 6
    def _parse_code(self, vnode: VirtualNode, parent: bblfsh.Node, content: str,
                    stub: "bblfsh.aliases.ProtocolServiceStub",
                    node_parents: Mapping[int, bblfsh.Node], path: str,
                    ) -> Optional[Tuple[bblfsh.Node, int, int]]:
        """
        Find a parent node that Babelfish can parse and parse it.

        Iterates over the parents of the current virtual node until one is parsable and returns
        the parsed UAST, or None if the root is reached without finding a parsable parent.

        The cache will be used to avoid recomputations for parents that have already been
        considered.

        :param vnode: Vnode that is modified. Used to check that we retrieve the correct parent.
        :param parent: First node to try to parse. Will go up the tree if parsing fails.
        :param content: Content of the file.
        :param stub: Babelfish GRPC service stub.
        :param node_parents: Parents mapping of the input UASTs.
        :param path: Path of the file being parsed.
        :return: tuple of the parsed UAST and the corresponding starting and ending offsets. \
                 None if Babelfish failed to parse the whole file.
        """
        descendants = []
        current_ancestor = parent
        while current_ancestor is not None:
            if id(current_ancestor) in self._parsing_cache:
                result = self._parsing_cache[id(current_ancestor)]
                break
            descendants.append(current_ancestor)
            start, end = (current_ancestor.start_position.offset,
                          current_ancestor.end_position.offset)
            if start <= vnode.start.offset and end > vnode.end.offset:
                uast, errors = parse_uast(stub, content[start:end], filename="", unicode=True,
                                          language=self._feature_extractor.language)
                if not errors:
                    result = uast, start, end
                    break
            current_ancestor = node_parents.get(id(current_ancestor), None)
        else:
            result = None
            self._log.warning("skipped file %s, due to errors in parsing the whole content", path)
        for descendant in descendants:
            self._parsing_cache[id(descendant)] = result
        return result
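
Both variants key the cache with id(node) rather than the node itself. A plausible reason, an assumption not stated in the code, is that protobuf UAST nodes are mutable and unhashable, so object identity is the only stable dictionary key. A tiny sketch of the pattern:

class Node:  # hypothetical stand-in for a bblfsh.Node
    pass

cache = {}
node = Node()
cache[id(node)] = ("parsed uast", 0, 42)  # identity-keyed: valid while node is alive
assert cache[id(node)] == ("parsed uast", 0, 42)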
Example No. 7
    def _check_file(
            self, y: numpy.ndarray, y_pred: numpy.ndarray, vnodes_y: Sequence[VirtualNode],
            vnodes: Sequence[VirtualNode], file: File, stub: "bblfsh.aliases.ProtocolServiceStub",
            vnode_parents: Mapping[int, bblfsh.Node], node_parents: Mapping[int, bblfsh.Node],
            rule_winners: numpy.ndarray, grouped_quote_predictions: QuotedNodeTripleMapping,
    ) -> _check_return_type:
        # TODO(warenlg): Add current algorithm description.
        # TODO(vmarkovtsev): Apply ML to not parse all the parents.
        self._parsing_cache = {}
        unsafe_preds = []
        file_content = file.content.decode("utf-8", "replace")
        vnodes_i = 0
        changes = numpy.where((y_pred != -1) & (y != y_pred))[0]
        start_offset_to_vnodes = {}
        end_offset_to_vnodes = {}
        for i, vnode in enumerate(vnodes):
            if vnode.start.offset not in start_offset_to_vnodes:
                # NOOPs are always included: keep the first vnode at each start offset
                start_offset_to_vnodes[vnode.start.offset] = i
        for i, vnode in enumerate(vnodes[::-1]):
            if vnode.end.offset not in end_offset_to_vnodes:
                # the reverse order keeps the last vnode at each end offset,
                # so zero-width NOOPs are included here as well
                end_offset_to_vnodes[vnode.end.offset] = len(vnodes) - i
        for i in changes:
            vnode_y = vnodes_y[i]
            while vnode_y is not vnodes[vnodes_i]:
                vnodes_i += 1
                if vnodes_i >= len(vnodes):
                    raise AssertionError("vnodes_y and vnodes are not consistent.")
            if id(vnode_y) in grouped_quote_predictions:
                # quote types are a special case
                group = grouped_quote_predictions[id(vnode_y)]
                if group is None:
                    # already handled with the previous vnode
                    continue
                vnode1, vnode2, vnode3 = group
                content_before = file_content[vnode1.start.offset:vnode3.end.offset]
                content_after = (self._feature_extractor.label_to_str(y_pred[i]) + vnode2.value +
                                 self._feature_extractor.label_to_str(y_pred[i + 1]))
                parsed_before, errors = parse_uast(stub, content_before, filename="",
                                                   language=self._feature_extractor.language)
                if not errors:
                    parsed_after, errors = parse_uast(stub, content_after, filename="",
                                                      language=self._feature_extractor.language)
                    if not self.check_uasts_equivalent(parsed_before, parsed_after):
                        unsafe_preds.append(i)
                        unsafe_preds.append(i + 1)  # Second quote
                continue

            parsed_before = self._parse_code(vnode_y, vnode_parents[id(vnode_y)], file_content,
                                             stub, node_parents, vnode_y.path)
            if parsed_before is None:
                continue
            parent_before, start, end = parsed_before
            vnode_start_index = start_offset_to_vnodes[start]
            vnode_end_index = end_offset_to_vnodes[end]

            assert vnode_start_index <= vnodes_i < vnode_end_index
            try:
                content_after = self._code_generator.generate_one_change(
                    vnodes[vnode_start_index:vnode_end_index],
                    vnodes_i - vnode_start_index, y_pred[i])
            except CodeGenerationBaseError as e:
                self._log.debug("Code generator can't generate code: %s", repr(e.args))
                unsafe_preds.append(i)
                continue
            parent_after, errors_after = parse_uast(
                stub, content_after, filename="", language=self._feature_extractor.language)
            if errors_after:
                unsafe_preds.append(i)
                continue
            if not self.check_uasts_equivalent(parent_before, parent_after):
                if self._debug:
                    self._log.debug(
                        "Bad prediction\nfile:%s\nDiff:\n%s\n\n", vnode_y.path,
                        "\n".join(line for line in difflib.unified_diff(
                            file_content[start:end].splitlines(), content_after.splitlines(),
                            fromfile="original", tofile="suggested", lineterm="")))
                unsafe_preds.append(i)
        self._log.info("%d filtered out of %d with changes", len(unsafe_preds), changes.shape[0])
        unsafe_preds = frozenset(unsafe_preds)
        safe_preds = numpy.array([i for i in range(len(y)) if i not in unsafe_preds])
        vnodes_y = [vn for i, vn in enumerate(list(vnodes_y)) if i not in unsafe_preds]
        return y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[safe_preds], safe_preds
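
The two offset maps built at the top of _check_file are worth a standalone sketch: the first vnode starting at a given offset and one past the last vnode ending at a given offset delimit the exact slice of vnodes that covers a re-parsed span, with zero-width NOOP vnodes kept at the boundaries. The names below are hypothetical.

class V:  # hypothetical stand-in exposing only plain integer offsets
    def __init__(self, start, end):
        self.start, self.end = start, end

vnodes = [V(0, 3), V(3, 3), V(3, 8)]  # V(3, 3) is a zero-width NOOP
start_map, end_map = {}, {}
for i, v in enumerate(vnodes):
    start_map.setdefault(v.start, i)  # first vnode starting at each offset
for i, v in enumerate(vnodes[::-1]):
    end_map.setdefault(v.end, len(vnodes) - i)  # one past the last vnode ending there
# the span (3, 8) maps to the slice [NOOP, V(3, 8)]: the NOOP is included
assert vnodes[start_map[3]:end_map[8]] == [vnodes[1], vnodes[2]]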
Example No. 8
def filter_uast_breaking_preds(
    y: numpy.ndarray,
    y_pred: numpy.ndarray,
    vnodes_y: Sequence[VirtualNode],
    vnodes: Sequence[VirtualNode],
    files: Mapping[str, File],
    feature_extractor: FeatureExtractor,
    stub: "bblfsh.aliases.ProtocolServiceStub",
    vnode_parents: Mapping[int, bblfsh.Node],
    node_parents: Mapping[int, bblfsh.Node],
    rule_winners: numpy.ndarray,
    grouped_quote_predictions: QuotedNodeTripleMapping,
) -> Tuple[numpy.ndarray, numpy.ndarray, Sequence[VirtualNode], numpy.ndarray,
           Sequence[int]]:
    """
    Filter the model's predictions that modify the UAST apart from changing positions.

    :param y: Numpy 1-dimensional array of labels.
    :param y_pred: Numpy 1-dimensional array of predicted labels by the model.
    :param vnodes_y: Sequence of the labeled `VirtualNode`-s corresponding to labeled samples.
    :param vnodes: Sequence of all the `VirtualNode`-s corresponding to the input.
    :param files: Dictionary of File-s with content, uast and path.
    :param feature_extractor: FeatureExtractor used to extract features.
    :param stub: Babelfish GRPC service stub.
    :param vnode_parents: `VirtualNode`-s' parents mapping as the LCA of the closest
                           left and right Babelfish nodes.
    :param node_parents: Parents mapping of the input UASTs.
    :param rule_winners: Numpy array of the index of the winning rule for each sample.
    :param grouped_quote_predictions: Quotes predictions (handled differently from the rest).
    :return: Tuple of the filtered labels, predicted labels, labeled `VirtualNode`-s and rule
             winners, plus the indices of the predictions considered valid, i.e. those that do
             not break the UAST.
    """
    safe_preds = []
    current_path = None  # type: Optional[str]
    parsing_cache = {}  # type: Dict[int, Optional[Tuple[bblfsh.Node, int, int]]]
    file_content = None  # type: Optional[str]
    cur_i = 0
    for i, (gt, pred, vn_y) in enumerate(zip(y, y_pred, vnodes_y)):
        if vn_y.path != current_path:
            parsing_cache = {}
            current_path = vn_y.path
            file_content = files[vn_y.path].content.decode("utf-8", "replace")
        while vn_y is not vnodes[cur_i]:
            cur_i += 1
            if cur_i >= len(vnodes):
                raise AssertionError("vnodes_y and vnodes are not consistent.")
        # quote types are special-cased
        if id(vn_y) in grouped_quote_predictions:
            pred_string = feature_extractor.label_to_str(pred)
            group = grouped_quote_predictions[id(vn_y)]
            # already handled with the previous vnode
            if group is None:
                continue
            vnode1, vnode2, vnode3 = group
            content_before = file_content[vnode1.start.offset:vnode3.end.offset]
            content_after = (feature_extractor.label_to_str(y_pred[i]) +
                             vnode2.value +
                             feature_extractor.label_to_str(y_pred[i + 1]))
            parsed_before, errors = parse_uast(
                stub,
                content_before,
                filename="",
                language=feature_extractor.language)
            if not errors:
                parsed_after, errors = parse_uast(
                    stub,
                    content_after,
                    filename="",
                    language=feature_extractor.language)
                if check_uasts_are_equal(parsed_before, parsed_after):
                    safe_preds.extend((i, i + 1))
            continue
        if gt == pred:
            safe_preds.append(i)
            continue
        pred_string = feature_extractor.label_to_str(pred)
        parsed_before = _parse_code(vnode_parents[id(vn_y)], file_content,
                                    stub, parsing_cache,
                                    feature_extractor.language, node_parents,
                                    vn_y.path)
        if parsed_before is None:
            continue
        parent_before, start, end = parsed_before
        # when the input node value is a NOOP, i.e. an empty string, the replacement is
        # restricted to the first occurrence
        output_pred = "".join(n.value
                              for n in vnodes[cur_i:cur_i + 2]).replace(
                                  vn_y.value, pred_string, 1)
        diff_pred_offset = len(pred_string) - len(vn_y.value)
        try:
            # to handle mixed indentations, we include the `VirtualNode` following the
            # predicted one in the output string and start the rest of the sequence one
            # `VirtualNode` further to avoid repeating it
            start_next_vnode = (vn_y.start.offset + len(vn_y.value) +
                                len(vnodes[cur_i + 1].value))
            content_after = (file_content[:vn_y.start.offset] + output_pred +
                             file_content[start_next_vnode:])
        # in case the prediction to check corresponds to the last label of a file
        except IndexError:
            content_after = file_content[:vn_y.start.offset] + output_pred
        content_after = content_after[start:end + diff_pred_offset]
        parent_after, errors_after = parse_uast(
            stub,
            content_after,
            filename="",
            language=feature_extractor.language)
        if not errors_after:
            if check_uasts_are_equal(parent_before, parent_after):
                safe_preds.append(i)
    _log.info("Non UAST breaking predictions: %d selected out of %d",
              len(safe_preds), y_pred.shape[0])
    vnodes_y = [vn for i, vn in enumerate(list(vnodes_y)) if i in safe_preds]
    return (y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[safe_preds],
            safe_preds)
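
The splice in the second half of the function is the subtle part: the predicted string replaces the original vnode value inside the full file content, and the previously parsed window [start, end) is widened by the length difference so that it still covers the same code. A simplified sketch of that offset arithmetic, with all values made up:

file_content = "var a = 0;\nvar b = 1;\n"
vn_value, pred_string = " ", "  "   # predict two spaces instead of one
vn_start = 14                       # offset of the changed vnode in the file
content_after = (file_content[:vn_start] + pred_string +
                 file_content[vn_start + len(vn_value):])
diff_pred_offset = len(pred_string) - len(vn_value)
start, end = 11, 22                 # window that parsed successfully before
window_after = content_after[start:end + diff_pred_offset]
assert window_after == "var  b = 1;\n"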