def get_start_pos_of_prefix(self):
    previous_leaf = self.get_previous_leaf()
    if previous_leaf is None:
        lines = split_lines(self.prefix)
        # + 1 is needed because split_lines always returns at least [''].
        return self.line - len(lines) + 1, 0  # It's the first leaf.
    return previous_leaf.end_pos
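# A minimal sketch of why the "+ 1" above is needed; it assumes
# marso.utils.split_lines behaves like parso's (it always returns at
# least [''], and a trailing newline yields a final empty entry).
from marso.utils import split_lines

assert split_lines('') == ['']              # empty prefix -> one entry
assert split_lines('\n\n') == ['', '', '']  # two newlines -> three entries
# A leaf on line 5 whose prefix contains two newlines therefore gets a
# prefix start position of line 5 - 3 + 1 == 3.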
def close(self):
    self._base_node.finish()

    # Add an endmarker.
    try:
        last_leaf = self._module.get_last_leaf()
    except IndexError:
        end_pos = [1, 0]
    else:
        last_leaf = _skip_dedent_error_leaves(last_leaf)
        end_pos = list(last_leaf.end_pos)

    lines = split_lines(self.prefix)
    assert len(lines) > 0
    if len(lines) == 1:
        if lines[0].startswith(BOM_UTF8_STRING) and end_pos == [1, 0]:
            end_pos[1] -= 1
        end_pos[1] += len(lines[0])
    else:
        end_pos[0] += len(lines) - 1
        end_pos[1] = len(lines[-1])

    endmarker = EndMarker('', tuple(end_pos),
                          self.prefix + self._prefix_remainder)
    endmarker.parent = self._module
    self._module.children.append(endmarker)
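# Worked example for the endmarker arithmetic above (comments only;
# positions are (line, column) with 1-based lines): if the last leaf ends
# at (4, 7) and the prefix is '  # tail' (a single split line), the
# endmarker lands at (4, 7 + 8) == (4, 15); a prefix of '\n\n' (three
# split lines) instead lands it at (4 + 2, len('')) == (6, 0).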
@property
def end_pos(self):
    lines = split_lines(self.value)
    end_pos_line = self.line + len(lines) - 1
    # Check for multiline token.
    if self.line == end_pos_line:
        end_pos_column = self.column + len(lines[-1])
    else:
        end_pos_column = len(lines[-1])
    return end_pos_line, end_pos_column
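# A worked example for end_pos above: a leaf with value "'''a\nb'''"
# starting at (3, 0) splits into ["'''a", "b'''"], so its end_pos is
# (3 + 2 - 1, len("b'''")) == (4, 4). A single-line leaf 'foo' at (1, 4)
# instead ends at (1, 4 + 3) == (1, 7).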
def _get_debug_error_message(module, old_lines, new_lines):
    current_lines = split_lines(module.get_code(), keepends=True)
    current_diff = difflib.unified_diff(new_lines, current_lines)
    old_new_diff = difflib.unified_diff(old_lines, new_lines)
    import marso
    return (
        "There's an issue with the diff parser. Please "
        "report (marso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
        % (marso.__version__, ''.join(old_new_diff), ''.join(current_diff))
    )
def test_open_string_literal(each_version, code):
    """
    Testing mostly if removing the last newline works.
    """
    lines = split_lines(code, keepends=True)
    end_pos = (len(lines), len(lines[-1]))
    module = parse(code, version=each_version)
    assert module.get_code() == code
    assert module.end_pos == end_pos == module.children[1].end_pos
def initialize(self, code):
    logging.debug('differ: initialize')
    try:
        del cache.parser_cache[self.grammar._hashed][None]
    except KeyError:
        pass

    self.lines = split_lines(code, keepends=True)
    self.module = parse(code, diff_cache=True, cache=True)
    assert code == self.module.get_code()
    _assert_valid_graph(self.module)
    return self.module
def get_start_pos_of_prefix(self): """ Basically calls :py:meth:`marso.tree.NodeOrLeaf.get_start_pos_of_prefix`. """ # TODO it is really ugly that we have to override it. Maybe change # indent error leafs somehow? No idea how, though. previous_leaf = self.get_previous_leaf() if previous_leaf is not None and previous_leaf.type == 'error_leaf' \ and previous_leaf.token_type in ('INDENT', 'DEDENT', 'ERROR_DEDENT'): previous_leaf = previous_leaf.get_previous_leaf() if previous_leaf is None: # It's the first leaf. lines = split_lines(self.prefix) # + 1 is needed because split_lines always returns at least ['']. return self.line - len(lines) + 1, 0 # It's the first leaf. return previous_leaf.end_pos
def _assert_valid_graph(node):
    """
    Checks if the parent/children relationship is correct.

    This is a check that only runs during debugging/testing.
    """
    try:
        children = node.children
    except AttributeError:
        # Ignoring INDENT is necessary, because indent/dedent tokens don't
        # contain a value/prefix; they are only around because of the
        # tokenizer.
        if node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS:
            assert not node.value
            assert not node.prefix
            return

        # Calculate the content between two start positions.
        previous_leaf = _get_previous_leaf_if_indentation(
            node.get_previous_leaf())
        if previous_leaf is None:
            content = node.prefix
            previous_start_pos = 1, 0
        else:
            assert previous_leaf.end_pos <= node.start_pos, \
                (previous_leaf, node)

            content = previous_leaf.value + node.prefix
            previous_start_pos = previous_leaf.start_pos

        if '\n' in content or '\r' in content:
            splitted = split_lines(content)
            line = previous_start_pos[0] + len(splitted) - 1
            actual = line, len(splitted[-1])
        else:
            actual = previous_start_pos[0], previous_start_pos[1] + len(content)
            if content.startswith(BOM_UTF8_STRING) \
                    and node.get_start_pos_of_prefix() == (1, 0):
                # Remove the byte order mark.
                actual = actual[0], actual[1] - 1

        assert node.start_pos == actual, (node.start_pos, actual)
    else:
        for child in children:
            assert child.parent == node, (node, child)
            _assert_valid_graph(child)
def get_last_line(self, suffix):
    line = 0
    if self._children_groups:
        children_group = self._children_groups[-1]
        last_leaf = _get_previous_leaf_if_indentation(
            children_group.last_line_offset_leaf)

        line = last_leaf.end_pos[0] + children_group.line_offset

        # Newlines end on the next line, which means that they would cover
        # the next line. That line is not fully parsed at this point.
        if _ends_with_newline(last_leaf, suffix):
            line -= 1
    line += len(split_lines(suffix)) - 1

    if suffix and not suffix.endswith('\n') and not suffix.endswith('\r'):
        # This is the end of a file (that doesn't end with a newline).
        line += 1

    if self._node_children:
        return max(line, self._node_children[-1].get_last_line(suffix))
    return line
def parse(self, code, copies=0, parsers=0, expect_error_leaves=False):
    logging.debug('differ: parse copies=%s parsers=%s', copies, parsers)
    lines = split_lines(code, keepends=True)
    diff_parser = DiffParser(
        self.grammar._pgen_grammar,
        self.grammar._tokenizer,
        self.module,
    )
    new_module = diff_parser.update(self.lines, lines)
    self.lines = lines
    assert code == new_module.get_code()

    _assert_valid_graph(new_module)

    without_diff_parser_module = parse(code)
    _assert_nodes_are_equal(new_module, without_diff_parser_module)

    error_node = _check_error_leaves_nodes(new_module)
    assert expect_error_leaves == (error_node is not None), error_node
    if parsers is not ANY:
        assert diff_parser._parser_count == parsers
    if copies is not ANY:
        assert diff_parser._copy_count == copies
    return new_module
def tokenize(code, version_info, start_pos=(1, 0)):
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
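# Illustrative usage of tokenize() above (a hedged sketch; the import path
# assumes marso mirrors parso's parse_version_string helper):
from marso.utils import parse_version_string

for token in tokenize('x = 1\n', parse_version_string('3.8')):
    print(token.type, repr(token.string), token.start_pos)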
@property
def end_pos(self):
    lines = split_lines(self.string)
    if len(lines) > 1:
        return self.start_pos[0] + len(lines) - 1, 0
    else:
        return self.start_pos[0], self.start_pos[1] + len(self.string)
def insert_line_into_code(code, index, line):
    lines = split_lines(code, keepends=True)
    lines.insert(index, line)
    return ''.join(lines)
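# Quick usage sketch for the helper above; the expected value follows
# directly from split_lines(keepends=True) keeping line endings intact.
code = 'a = 1\nb = 2\n'
assert insert_line_into_code(code, 1, 'c = 3\n') == 'a = 1\nc = 3\nb = 2\n'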
def test_split_lines(string, expected_result, keepends):
    assert split_lines(string, keepends=keepends) == expected_result
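# A hedged sketch of how the test above is typically parametrized; the
# exact cases in the suite may differ, but these follow split_lines'
# behavior of yielding a final '' after a trailing newline.
import pytest

@pytest.mark.parametrize(
    ('string', 'expected_result', 'keepends'), [
        ('asd\r\n', ['asd', ''], False),
        ('asd\r\n', ['asd\r\n', ''], True),
        ('asd\rasd', ['asd', 'asd'], False),
        ('', [''], False),
    ])
def test_split_lines_examples(string, expected_result, keepends):
    assert split_lines(string, keepends=keepends) == expected_result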
def _copy_nodes(self, working_stack, nodes, until_line, line_offset,
                prefix='', is_nested=False):
    new_nodes = []
    added_indents = []

    nodes = list(self._get_matching_indent_nodes(
        nodes,
        is_new_suite=is_nested,
    ))

    new_prefix = ''
    for node in nodes:
        if node.start_pos[0] > until_line:
            break

        if node.type == 'endmarker':
            break

        if node.type == 'error_leaf' and node.token_type in (
                'DEDENT', 'ERROR_DEDENT'):
            break

        # TODO this check might take a bit of time for large files. We
        #      might want to change this to do more intelligent guessing
        #      or binary search.
        if _get_last_line(node) > until_line:
            # We can split up functions and classes later.
            if _func_or_class_has_suite(node):
                new_nodes.append(node)
            break

        try:
            c = node.children
        except AttributeError:
            pass
        else:
            # This case basically appears with error recovery of one line
            # suites like `def foo(): bar.-`. In this case we might not
            # include a newline in the statement and we need to take care
            # of that.
            n = node
            if n.type == 'decorated':
                n = n.children[-1]
            if n.type in ('async_funcdef', 'async_stmt'):
                n = n.children[-1]
            if n.type in ('classdef', 'funcdef'):
                suite_node = n.children[-1]
            else:
                suite_node = c[-1]

            if suite_node.type in ('error_leaf', 'error_node'):
                break

        new_nodes.append(node)

    # Pop error nodes at the end from the list.
    if new_nodes:
        while new_nodes:
            last_node = new_nodes[-1]
            if (last_node.type in ('error_leaf', 'error_node')
                    or _is_flow_node(new_nodes[-1])):
                # Error leafs/nodes don't have a defined start/end. Error
                # nodes might not end with a newline (e.g. if there's an
                # open `(`). Therefore ignore all of them unless they are
                # succeeded by a valid parser state.
                # If we copy flows at the end, they might be continued
                # after the copy limit (in the new parser).
                # In this while loop we try to remove until we find a
                # newline.
                new_prefix = ''
                new_nodes.pop()
                while new_nodes:
                    last_node = new_nodes[-1]
                    if last_node.get_last_leaf().type == 'newline':
                        break
                    new_nodes.pop()
                continue
            if len(new_nodes) > 1 and new_nodes[-2].type == 'error_node':
                # The problem here is that Marso error recovery sometimes
                # influences nodes before this node.
                # Since the new last node is an error node this will get
                # cleaned up in the next while iteration.
                new_nodes.pop()
                continue
            break

    if not new_nodes:
        return [], working_stack, prefix, added_indents

    tos = working_stack[-1]
    last_node = new_nodes[-1]
    had_valid_suite_last = False
    # Pop incomplete suites from the list.
    if _func_or_class_has_suite(last_node):
        suite = last_node
        while suite.type != 'suite':
            suite = suite.children[-1]

        indent = _get_suite_indentation(suite)
        added_indents.append(indent)

        suite_tos = _NodesTreeNode(suite,
                                   indentation=_get_indentation(last_node))
        # Don't need to pass line_offset here, it's already done by the
        # parent.
        suite_nodes, new_working_stack, new_prefix, ai = self._copy_nodes(
            working_stack + [suite_tos], suite.children, until_line,
            line_offset, is_nested=True,
        )
        added_indents += ai
        if len(suite_nodes) < 2:
            # A suite with only a newline is not valid.
            new_nodes.pop()
            new_prefix = ''
        else:
            assert new_nodes
            tos.add_child_node(suite_tos)
            working_stack = new_working_stack
            had_valid_suite_last = True

    if new_nodes:
        if not _ends_with_newline(new_nodes[-1].get_last_leaf()) \
                and not had_valid_suite_last:
            p = new_nodes[-1].get_next_leaf().prefix
            # We are not allowed to remove the newline at the end of the
            # line, otherwise it's going to be missing. This happens e.g.
            # if a bracket before it moves newlines into prefixes.
            new_prefix = split_lines(p, keepends=True)[0]

        if had_valid_suite_last:
            last = new_nodes[-1]
            if last.type == 'decorated':
                last = last.children[-1]
            if last.type in ('async_funcdef', 'async_stmt'):
                last = last.children[-1]
            last_line_offset_leaf = last.children[-2].get_last_leaf()
            assert last_line_offset_leaf == ':'
        else:
            last_line_offset_leaf = new_nodes[-1].get_last_leaf()
        tos.add_tree_nodes(
            prefix, new_nodes, line_offset, last_line_offset_leaf,
        )
        prefix = new_prefix
        self._prefix_remainder = ''

    return new_nodes, working_stack, prefix, added_indents
def _parse(self, code=None, error_recovery=True, path=None,
           start_symbol=None, cache=False, diff_cache=False,
           cache_path=None, file_io=None, start_pos=(1, 0)):
    """
    Wanted python3.5 * operator and keyword-only arguments. Therefore
    just wrap it all.
    start_pos here is just a parameter internally used. Might be public
    sometime in the future.
    """
    if code is None and path is None and file_io is None:
        raise TypeError("Please provide either code or a path.")

    if start_symbol is None:
        start_symbol = self._start_nonterminal

    if error_recovery and start_symbol != 'file_input':
        raise NotImplementedError("This is currently not implemented.")

    if file_io is None:
        if code is None:
            file_io = FileIO(path)
        else:
            file_io = KnownContentFileIO(path, code)

    if cache and file_io.path is not None:
        module_node = load_module(self._hashed, file_io, cache_path=cache_path)
        if module_node is not None:
            return module_node

    if code is None:
        code = file_io.read()
    code = python_bytes_to_unicode(code)

    lines = split_lines(code, keepends=True)
    if diff_cache and self._diff_parser is not None:
        try:
            module_cache_item = parser_cache[self._hashed][file_io.path]
        except KeyError:
            pass
        else:
            module_node = module_cache_item.node
            old_lines = module_cache_item.lines
            if old_lines == lines:
                return module_node

            new_node = self._diff_parser(
                self._pgen_grammar, self._tokenizer, module_node
            ).update(old_lines=old_lines, new_lines=lines)
            save_module(self._hashed, file_io, new_node, lines,
                        # Never pickle in pypy, it's slow as hell.
                        pickling=cache and not is_pypy,
                        cache_path=cache_path)
            return new_node

    tokens = self._tokenizer(lines, start_pos=start_pos)

    p = self._parser(
        self._pgen_grammar,
        error_recovery=error_recovery,
        start_nonterminal=start_symbol,
    )
    root_node = p.parse(tokens=tokens)

    if cache or diff_cache:
        save_module(self._hashed, file_io, root_node, lines,
                    # Never pickle in pypy, it's slow as hell.
                    pickling=cache and not is_pypy,
                    cache_path=cache_path)
    return root_node
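# A brief sketch of how the diff-cache path above is exercised; hedged:
# the public entry points are assumed to mirror parso's load_grammar() and
# Grammar.parse().
import marso

grammar = marso.load_grammar()
module = grammar.parse('x = 1\n', diff_cache=True, cache=True)
# A second parse of slightly changed source should hit parser_cache and
# only re-parse the changed lines via self._diff_parser.
module2 = grammar.parse('x = 1\ny = 2\n', diff_cache=True, cache=True)
assert module2.get_code() == 'x = 1\ny = 2\n'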