def _split_vnodes_by_lines(vnodes: List[VirtualNode]) -> Iterator:
    """
    Cut multi-line VirtualNode-s into pieces that each stay on a single line.

    A node is passed through untouched when it already carries a label (`y` is not None) or
    when its value has at most one line. Otherwise it is cut repeatedly; a line break found at
    the very beginning of a value is glued to the text of the line that follows it.

    :param vnodes: VirtualNode-s to process.
    :return: Iterator over the resulting VirtualNode-s.
    """
    # Reversed copy used as a stack, so `pop()` returns the nodes in their original order.
    pending = vnodes[::-1]
    while pending:
        current = pending.pop()
        pieces = current.value.splitlines()
        if current.y is not None or len(pieces) <= 1:
            yield current
            continue
        origin = current.start
        if pieces[0] != "":
            # Cut right before the first line break; the break itself stays in the tail.
            head = pieces[0]
            cut = Position(offset=origin.offset + len(head), line=origin.line,
                           col=origin.col + len(head))
        else:
            # The value opens with a line break: keep the break together with the next line.
            # At least two pieces exist here because of the guard above.
            following = pieces[1]
            head = current.value.splitlines(keepends=True)[0] + following
            cut = Position(offset=origin.offset + len(head), line=origin.line + 1,
                           col=1 + len(following))
        tail = current.value[len(head):]
        if tail:
            # The remainder may still span several lines, so it goes back on the stack.
            pending.append(
                VirtualNode(value=tail, start=cut, end=current.end, node=current.node))
        yield VirtualNode(value=head, start=origin, end=cut, node=current.node)
def _add_noops(self, vnodes: Sequence[VirtualNode], path: str, index_labels: bool = False,
               ) -> List[VirtualNode]:
    """
    Add CLS_NOOP nodes in between tokens without labeled nodes to allow for insertions.

    A zero-width noop node is inserted:

    - before the first node, if the first node is unlabeled (`y is None`);
    - between two consecutive unlabeled nodes, unless the first of them is an accumulated \
      indentation node;
    - after the last node, if the last node is unlabeled.

    :param vnodes: The sequence of `VirtualNode`-s to augment with noop nodes.
    :param path: path to file.
    :param index_labels: Whether to index labels to define output classes or not. \
                         It is not used by the body of this method.
    :return: The augmented `VirtualNode`-s sequence.
    """
    augmented_vnodes = []
    noop_label = (CLASS_INDEX[CLS_NOOP],)
    if not len(vnodes):
        return augmented_vnodes
    if vnodes[0].y is None:
        augmented_vnodes.append(VirtualNode(value="", start=Position(0, 1, 1),
                                            end=Position(0, 1, 1), y=noop_label, path=path))
    for vnode, next_vnode in zip(vnodes, islice(vnodes, 1, None)):
        augmented_vnodes.append(vnode)
        if vnode.y is None and not vnode.is_accumulated_indentation and next_vnode.y is None:
            augmented_vnodes.append(VirtualNode(value="", start=vnode.end, end=vnode.end,
                                                y=noop_label, path=path))
    # The pairwise loop appends every node except the last one. Take it from the sequence
    # itself rather than from the loop variable: with a single input node the loop body never
    # runs and the loop variable would be unbound.
    augmented_vnodes.append(vnodes[-1])
    if augmented_vnodes[-1].y is None:
        augmented_vnodes.append(VirtualNode(value="", start=vnodes[-1].end, end=vnodes[-1].end,
                                            y=noop_label, path=path))
    return augmented_vnodes
def test_revert_indentation_change(self):
    """Run `CodeGenerator.revert_indentation_change()` over a table of scenarios."""
    # Each scenario: (node value, names of the classes forming the node label,
    #                 expected return value or the exception type expected to be raised).
    scenarios = [
        ("\n ", (cls.CLS_NEWLINE, cls.CLS_SPACE_INC, cls.CLS_SPACE_INC), "\n "),
        ("\n ", (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC), "\n "),
        ("\n\t ", (cls.CLS_NEWLINE, cls.CLS_TAB_INC, cls.CLS_SPACE_INC), "\n"),
        ("\n ", (cls.CLS_NEWLINE, cls.CLS_TAB_INC, cls.CLS_TAB_INC), InapplicableIndentation),
        (" ", (cls.CLS_SPACE, cls.CLS_SPACE_INC, cls.CLS_SPACE_INC), ValueError),
    ]
    for text, class_names, expected in scenarios:
        begin = Position(0, 1, 1)
        finish = Position(len(text), 1, len(text) + 1)
        label = tuple(cls.CLASS_INDEX[name] for name in class_names)
        vnode = VirtualNode(text, begin, finish, y=label)
        if not isinstance(expected, str):
            # Anything that is not a string is an exception type.
            with self.assertRaises(expected):
                CodeGenerator.revert_indentation_change(vnode)
            continue
        self.assertEqual(CodeGenerator.revert_indentation_change(vnode), expected)
def test_compute_labels_mappings(self):
    """
    Check the label mappings built by `_compute_labels_mappings()`.

    With `cutoff_label_support` set to 2, only the label `(1,)`, which occurs twice in the
    input, is expected to end up in both mappings.
    """
    start, stop = Position(1, 1, 1), Position(10, 2, 1)
    repeated = VirtualNode("", start, stop, y=(1,))
    vnodes = [
        repeated,
        repeated,  # the very same object twice
        VirtualNode("", start, stop),
        VirtualNode("", start, stop, y=(2,)),
        VirtualNode("", start, stop, y=(3,)),
    ]
    extractor = self.extractor
    extractor.cutoff_label_support = 2
    extractor._compute_labels_mappings(vnodes)
    self.assertEqual(extractor.labels_to_class_sequences, [(1,)])
    self.assertEqual(extractor.class_sequences_to_labels, {(1,): 0})
def _to_position(raw_lines_data, _lines_start_offset, offset):
    """
    Build a `Position` with 1-based line and column numbers for the given offset.

    :param raw_lines_data: Indexable collection of line strings; presumably the lines keep \
                           their line break characters (to be confirmed against the caller).
    :param _lines_start_offset: Array that is compared element-wise with `offset`; presumably \
                                the start offset of every line (to be confirmed).
    :param offset: Offset to convert.
    :return: `Position(offset, line, col)`.
    """
    # Index of the element right before the first one that is greater than `offset`.
    index = numpy.argmax(_lines_start_offset > offset) - 1
    column = offset - _lines_start_offset[index]
    text = raw_lines_data[index]
    if len(text) == column and text.splitlines()[0] != text:
        # The offset is right past the line break of this line: report the beginning of the
        # next line instead.
        index, column = index + 1, 0
    return Position(offset, index + 1, column + 1)
def _class_seq_to_vnodes(value, start, end, current_class_seq, path):
    """
    Generate one or two labeled `VirtualNode`-s for a value and its class sequence.

    If the class sequence contains `NEWLINE_INDEX` somewhere after its first element, the
    value and the sequence are both cut at the first `NEWLINE_INDEX`: the part before it
    becomes one node and the rest becomes another node. In every other case a single node
    covering the whole value is generated.

    :param value: Text of the node(s).
    :param start: Start position of `value`.
    :param end: End position of `value`.
    :param current_class_seq: Sequence of class indices that labels `value`.
    :param path: Path to the file, passed to the created nodes.
    :return: Generator of `VirtualNode`-s.
    """
    if NEWLINE_INDEX in current_class_seq and current_class_seq[0] != NEWLINE_INDEX:
        # Something precedes the newline: it goes into a separate node.
        split_at = current_class_seq.index(NEWLINE_INDEX)
        boundary = Position(start.offset + split_at, start.line, start.col + split_at)
        head_label = tuple(current_class_seq[:split_at])
        tail_label = tuple(current_class_seq[split_at:])
        yield VirtualNode(value=value[:split_at], start=start, end=boundary,
                          y=head_label, path=path)
        yield VirtualNode(value=value[split_at:], start=boundary, end=end,
                          y=tail_label, path=path)
        return
    yield VirtualNode(value=value, start=start, end=end,
                      y=tuple(current_class_seq), path=path)
def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
        Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
    """
    Parse a file into a sequence of `VirtualNode`-s and a mapping from VirtualNode to parent.

    Given the source text and the corresponding UAST this function compiles the list of
    `VirtualNode`-s and the parents mapping. That list of nodes equals to the original
    source text bit-to-bit after `"".join(n.value for n in nodes)`.
    `parents` map from `id(node)` to its parent `bblfsh.Node`.

    :param contents: source file text
    :param root: UAST root node
    :param path: path to the file, used for debugging
    :return: list of `VirtualNode`-s and the parents.
    """
    # build the line mapping
    lines = contents.splitlines(keepends=True)
    # Check if there is a newline in the end of file. Yes, you can just check
    # lines[-1][-1] == "\n" but if someone decide to use weird '\u2028' unicode character for
    # new line this condition gives wrong result.
    # An empty file has no lines at all, so there is no last line to inspect.
    eof_new_line = bool(lines) and lines[-1].splitlines()[0] != lines[-1]
    if eof_new_line or not lines:
        # We add last line as empty one because it actually exists, but .splitlines() does not
        # return it.
        lines.append("")
    # line_offsets[i] is the offset at which line i starts; the extra last element is a
    # value past the end of the text.
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line)
    line_offsets[-1] = pos + 1

    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            node = self.node_fixtures[node.internal_type](node)
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        # Keep nodes with a token, and childless nodes that cover a non-empty span.
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    # The sentinel sits at the end of the text so that the trailing gap is processed too.
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)

    # scan `node_tokens` and fill the gaps with imaginary nodes
    result = []
    pos = 0
    parser = self.tokens.PARSER
    searchsorted = numpy.searchsorted
    for node in node_tokens:
        if node.start_position.offset < pos:
            # The node starts before the end of the previously emitted one: skip it.
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                positions = []
                for suboff in (match.start(), match.end()):
                    offset = pos + suboff
                    line = searchsorted(line_offsets, offset, side="right")
                    col = offset - line_offsets[line - 1] + 1
                    positions.append(Position(offset, line, col))
                # The imaginary token is exactly the text matched inside the gap.
                token = match.group()
                sumlen += len(token)
                result.append(VirtualNode(token, *positions, path=path))
            # The matches must cover the whole gap, otherwise text would be lost.
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
        pos = node.end_position.offset
    return result, parents
def _classify_vnodes(self, nodes: Iterable[VirtualNode], path: str) -> Iterable[VirtualNode]:
    """
    Fill "y" attribute in the VirtualNode-s extracted from _parse_file().

    It is the index of the corresponding class to predict.
    We detect indentation changes so several whitespace nodes are merged together.

    This is a generator. Nodes which have `node.node` set and non-whitespace nodes are yielded
    as they are (a lone single or double quote additionally gets its label). Whitespace-only
    nodes are replaced with newly created nodes:

    - whitespace without line breaks: one labeled node per character;
    - whitespace with line breaks: one CLS_NEWLINE node per line break (including the \
      whitespace that precedes it), followed by nodes describing how the whitespace after \
      the last line break relates to the indentation tracked so far.

    :param nodes: sequence of VirtualNodes.
    :param path: path to file.
    :return: generator of VirtualNodes, the number of yielded nodes may differ from the \
             number of input nodes.
    """
    # Whitespace characters of the indentation tracked so far.
    indentation = []
    for node in nodes:
        if node.node is not None:
            # The node has an attached `node` object: pass it through unchanged.
            yield node
            continue
        if not node.value.isspace():
            # Not pure whitespace: only lone quote characters get a label here.
            if node.value == "'":
                node.y = (CLASS_INDEX[CLS_SINGLE_QUOTE],)
            elif node.value == '"':
                node.y = (CLASS_INDEX[CLS_DOUBLE_QUOTE],)
            yield node
            continue
        lines = node.value.splitlines(keepends=True)
        if lines[-1].splitlines()[0] != lines[-1]:
            # We add last line as empty one to mimic .split("\n") behaviour
            lines.append("")
        if len(lines) == 1:
            # only tabs and spaces are possible
            for i, char in enumerate(node.value):
                if char == "\t":
                    cls = (CLASS_INDEX[CLS_TAB],)
                else:
                    cls = (CLASS_INDEX[CLS_SPACE],)
                offset, lineno, col = node.start
                # One node per whitespace character.
                yield VirtualNode(
                    char,
                    Position(offset + i, lineno, col + i),
                    Position(offset + i + 1, lineno, col + i + 1),
                    y=cls, path=path)
            continue
        # Offset of the current line relative to the start of the node.
        line_offset = 0
        for i, line in enumerate(lines[:-1]):
            # `line` contains trailing whitespaces, we add it to the newline node
            start_offset = node.start.offset + line_offset
            # Only the first line starts at the column of the node; the others start at 1.
            start_col = node.start.col if i == 0 else 1
            lineno = node.start.line + i
            yield VirtualNode(
                line,
                Position(start_offset, lineno, start_col),
                Position(start_offset + len(line), lineno + 1, 1),
                y=(CLASS_INDEX[CLS_NEWLINE],), path=path)
            line_offset += len(line)
        # Whitespace after the last line break, i.e. the indentation of the new line.
        line = lines[-1].splitlines()[0] if lines[-1] else ""
        my_indent = list(line)
        # Position at which that whitespace starts, computed back from the end of the node.
        offset, lineno, col = node.end
        offset -= len(line)
        col -= len(line)
        try:
            # Remove one occurrence of every tracked indentation character from `my_indent`.
            # ValueError means that the new whitespace lacks at least one of them.
            for ws in indentation:
                my_indent.remove(ws)
        except ValueError:
            if my_indent:
                # mixed tabs and spaces, do not classify
                yield VirtualNode(
                    line,
                    Position(offset, lineno, col),
                    node.end,
                    path=path)
                continue
            # indentation decreases
            # One zero-width node per tracked character beyond the length of the new
            # whitespace.
            for char in indentation[len(line):]:
                if char == "\t":
                    cls = (CLASS_INDEX[CLS_TAB_DEC],)
                else:
                    cls = (CLASS_INDEX[CLS_SPACE_DEC],)
                yield VirtualNode(
                    "",
                    Position(offset, lineno, col),
                    Position(offset, lineno, col),
                    y=cls, path=path)
            indentation = indentation[:len(line)]
            if indentation:
                # The remaining indentation is emitted as one unlabeled node flagged as
                # accumulated indentation.
                yield VirtualNode(
                    "".join(indentation),
                    Position(offset, lineno, col),
                    node.end,
                    is_accumulated_indentation=True, path=path)
        else:
            # indentation is stable or increases
            # `my_indent` now holds only the characters added on top of the tracked
            # indentation: one labeled node for each of them.
            for i, char in enumerate(my_indent):
                if char == "\t":
                    cls = (CLASS_INDEX[CLS_TAB_INC],)
                else:
                    cls = (CLASS_INDEX[CLS_SPACE_INC],)
                yield VirtualNode(
                    char,
                    Position(offset + i, lineno, col + i),
                    Position(offset + i + 1, lineno, col + i + 1),
                    y=cls, path=path)
            offset += len(my_indent)
            col += len(my_indent)
            if indentation:
                # The previously tracked indentation is emitted as one unlabeled node
                # positioned after the nodes of the added characters.
                yield VirtualNode(
                    "".join(indentation),
                    Position(offset, lineno, col),
                    Position(offset + len(indentation), lineno, col + len(indentation)),
                    is_accumulated_indentation=True, path=path)
            # Remember the added characters for the following lines.
            for char in my_indent:
                indentation.append(char)
def test_template(self):
    """
    Render the comment text for one line fix and compare it with the expected text.

    `functools.partial` is replaced for the duration of the rendering with a stand-in that
    returns canned description functions, and is restored afterwards.
    """
    class RulesStub:
        rules = {34: "<rule # 34>"}

    class ModelStub:
        def __getitem__(self, item):
            return RulesStub()

    class HeadFileStub:
        content = b"<first code line>\n<second code line>\n<third code line>"

    def describe_rule_stub(rule, *_, **__):
        return rule

    def change_description_stub(*_, **__):
        return "<change description>"

    def partial_stub(func, *_, **__):
        # Only the rule description helper gets the echoing stand-in.
        if func == descriptions.describe_rule:
            return describe_rule_stub
        return change_description_stub

    template_path = os.path.join(os.path.dirname(__file__), "..", "templates",
                                 "comment.jinja2")
    analyzer = FormatAnalyzer(
        config={
            "report_code_lines": True,
            "report_triggered_rules": True,
            "comment_template": template_path,
        },
        model=ModelStub(), url="")
    original_partial = functools.partial
    vnode = VirtualNode(start=Position(10, 2, 1), end=Position(12, 3, 1), value="!", y=(1,))
    vnode.applied_rule = RulesStub.rules[34]
    line_fix = LineFix(line_number=2, suggested_code="<new code line>",
                       fixed_vnodes=[vnode], confidence=100)
    file_fix = FileFix(error="", line_fixes=[line_fix], language="<language>",
                       base_file=None, feature_extractor=None, file_vnodes=[],
                       head_file=HeadFileStub, y=None, y_pred_pure=None)
    expected = """format: style mismatch: ```<language> 1|<first code line> 2|<second code line> 3|<third code line> ``` ```suggestion <new code line> ``` <change description> Triggered rule ``` <rule # 34> ``` """
    try:
        functools.partial = partial_stub
        rendered = analyzer.render_comment_text(file_fix, 0)
        self.assertEqual(rendered, expected)
    finally:
        functools.partial = original_partial
def test_apply_new_indentation(self):
    """
    Run `CodeGenerator.apply_new_indentation()` over a table of scenarios.

    Afterwards check the text of the warning that is logged for one more node.
    """
    def to_label(class_names):
        return tuple(cls.CLASS_INDEX[name] for name in class_names)

    # Each scenario: (node value, expected returned tuple or expected exception type,
    #                 class names for `y_old`, class names for `y`, last indentation).
    scenarios = [
        ("\n ", ("\n", " "),
         (cls.CLS_NEWLINE, cls.CLS_SPACE_INC, cls.CLS_SPACE_INC),
         (cls.CLS_NEWLINE, ), ""),
        ("\n ", ("\n", " "),
         (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC),
         (cls.CLS_NEWLINE, ), ""),
        ("\n\t ", ("\n", ""),
         (cls.CLS_NEWLINE, cls.CLS_TAB_INC, cls.CLS_SPACE_INC),
         (cls.CLS_NEWLINE, ), ""),
        ("\n ", InapplicableIndentation,
         (cls.CLS_NEWLINE, cls.CLS_TAB_INC, cls.CLS_TAB_INC),
         (cls.CLS_NEWLINE, ), ""),
        ("\n ", ValueError,
         (cls.CLS_NEWLINE, cls.CLS_SPACE, cls.CLS_SPACE_INC, cls.CLS_SPACE_INC),
         (cls.CLS_NEWLINE, ), ""),
        ("\n\t ", InapplicableIndentation,
         (cls.CLS_NEWLINE, cls.CLS_SPACE_INC, cls.CLS_SPACE_INC),
         (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC), ""),
        ("\n\t ", ValueError,
         (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC),
         (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC, cls.CLS_SPACE, cls.CLS_SPACE_DEC), ""),
        ("\n\n ", ("\n", " "),
         (cls.CLS_NEWLINE, cls.CLS_NEWLINE, cls.CLS_SPACE_DEC),
         (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC), ""),
        ("", ("\n", " "), (cls.CLS_NOOP, ), (cls.CLS_NEWLINE, ), " "),
        ("", ("\n\n", ""), (cls.CLS_NOOP, ), (cls.CLS_NEWLINE, cls.CLS_NEWLINE), ""),
    ]
    for text, expected, old_names, new_names, last_indent in scenarios:
        # The end position is derived from the number of classes in `y`.
        finish = Position(len(new_names), 1, len(new_names) + 1)
        vnode = VirtualNode(text, Position(0, 1, 1), finish, y=to_label(new_names))
        vnode.y_old = to_label(old_names)
        if not isinstance(expected, tuple):
            # Anything that is not a tuple is an exception type.
            with self.assertRaises(expected):
                CodeGenerator.apply_new_indentation(vnode, last_indent)
            continue
        self.assertEqual(CodeGenerator.apply_new_indentation(vnode, last_indent), expected)

    logged = None

    def capture_warning(*args):
        # Keep the first positional argument of the latest warning call.
        nonlocal logged
        logged = args[0]

    new_label = to_label(
        (cls.CLS_NEWLINE, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC, cls.CLS_SPACE_DEC))
    old_label = to_label((cls.CLS_NEWLINE, cls.CLS_SPACE_DEC))
    prefix = "There is no indentation characters left to decrease for vnode"
    try:
        backup_warning = CodeGenerator._log.warning
        CodeGenerator._log.warning = capture_warning
        vnode = VirtualNode("\n ", Position(0, 1, 1), Position(3, 1, 4), y=new_label)
        vnode.y_old = old_label
        CodeGenerator.apply_new_indentation(vnode, "")
        self.assertEqual(logged[:len(prefix)], prefix)
    finally:
        CodeGenerator._log.warning = backup_warning