def __init__(self, codestr: str, astdict: AstDict) -> None:
    """
    Keep the source text and its AST dictionary, tokenize the source and
    build the helpers that later work from the tokenized lines.

    :param codestr: source code text; it is tokenized as UTF-8 bytes.
    :param astdict: dictionary form of the AST, stored as-is.
    """
    self._astdict = astdict

    # The tokenizer wants a readline callable that yields bytes
    source_stream = BytesIO(codestr.encode('utf-8'))
    self._tokens: List[Token] = [
        Token(*raw_token)
        for raw_token in tokenize.tokenize(source_stream.readline)
    ]

    # Both helpers are fed the same per-line token layout
    lines_of_tokens = create_tokenized_lines(codestr, self._tokens)
    self.noops_sync = NoopExtractor(codestr, lines_of_tokens)
    self.pos_sync = LocationFixer(codestr, lines_of_tokens)
    self.codestr = codestr

    # Placeholder for a mapping of nodes to end positions
    self._node2endpos = None

    # Global and Nonlocal nodes are both handled by the same visitor
    promote = self._promote_names
    self.visit_Global = promote
    self.visit_Nonlocal = promote
class AstImprover:
    """
    Rewrite, in place, the dictionary form of a Python AST.

    Each visited node gets its positions normalized, its internal
    ``_fields`` / ``_attributes`` keys removed, and, depending on its
    ``ast_type``, a type-specific transformation (literal conversion,
    attribute chain flattening, argument list merging...).
    """

    def __init__(self, codestr: str, astdict: AstDict) -> None:
        """
        :param codestr: source code text; it is tokenized as UTF-8 bytes.
        :param astdict: dictionary form of the AST, visited by parse().
        """
        self._astdict = astdict

        # Tokenize and create the noop extractor and the position fixer
        self._tokens: List[Token] = [
            Token(*i) for i in
            tokenize.tokenize(BytesIO(codestr.encode('utf-8')).readline)
        ]
        token_lines = create_tokenized_lines(codestr, self._tokens)
        self.noops_sync = NoopExtractor(codestr, token_lines)
        self.pos_sync = LocationFixer(codestr, token_lines)
        self.codestr = codestr

        # Placeholder for a mapping of nodes to end positions
        self._node2endpos = None

        # Global and Nonlocal nodes share the same visitor
        self.visit_Global = self.visit_Nonlocal = self._promote_names

    def parse(self) -> VisitResult:
        """Visit the stored AST dictionary as the root node and return it."""
        return self.visit(self._astdict, root=True)

    def _normalize_position(self, node):
        """
        Adjust the column keys of ``node`` in place, if present.

        ``col_offset`` is incremented by one and both columns are clamped
        to a minimum of 1; ``end_col_offset`` is only clamped.
        """
        # Python AST gives a 0 based column for the starting col, bblfsh uses
        # 1-based
        if "col_offset" in node:
            node["col_offset"] = max(node["col_offset"] + 1, 1)

        if "end_col_offset" in node:
            node["end_col_offset"] = max(node["end_col_offset"], 1)

    def _remove_internal(self, node):
        """Drop the ``_fields`` and ``_attributes`` keys from ``node``."""
        node.pop('_fields', None)
        node.pop('_attributes', None)

    def visit(self, node: Node, root: bool = False) -> VisitResult:
        """
        Dispatch ``node`` to its ``visit_<type>`` method (``visit_other``
        when there is none), then sync noops and positions and normalize
        the result.

        The type is ``node["ast_type"]`` for dictionaries and the class name
        for anything else. Position normalization and internal key removal
        only apply to dictionary results, since ``visit_str`` returns a
        plain string.

        :param node: node dictionary or raw value to visit.
        :param root: forwarded to the noop extractor.
        :return: whatever the selected visitor returned.
        """
        if isinstance(node, dict):
            node_type = node["ast_type"]
            # the ctx property always has a "Load"/"Store"/etc dictionary
            # that can be perfectly converted to a string value since they
            # don't hold anything more than the name. A ctx that is already
            # a string (node visited before) is left untouched.
            ctx = node.get("ctx")
            if isinstance(ctx, dict):
                node["ctx"] = ctx["ast_type"]
        else:
            node_type = node.__class__.__name__

        meth = getattr(self, "visit_" + node_type, self.visit_other)
        visit_result = meth(node)
        self.noops_sync.add_noops(node, root)
        self.pos_sync.sync_node_pos(visit_result)

        if isinstance(visit_result, dict):
            if not self.codestr:
                # empty files are the only case where 0-indexes are allowed
                for key in ("col_offset", "end_col_offset",
                            "lineno", "end_lineno"):
                    visit_result[key] = 0
            else:
                self._normalize_position(visit_result)
            self._remove_internal(visit_result)

        return visit_result

    def visit_str(self, node: Node) -> str:
        """
        This visits str fields inside nodes (which are represented as keys
        in the node dictionary), not Str AST nodes
        """
        return str(node)

    def visit_Bytes(self, node: Node) -> VisitResult:
        """
        Replace the bytes in ``node["s"]`` by text and record how it was
        obtained in ``node["encoding"]``: ``'utf8'`` when the bytes decode
        cleanly, ``'base64'`` otherwise.
        """
        try:
            s = node["s"].decode()
            encoding = 'utf8'
        except UnicodeDecodeError:
            # try with base64
            s = encode(node["s"], 'base64').decode().strip()
            encoding = 'base64'

        node.update({"s": s, "encoding": encoding})
        return node

    def _promote_names(self, node: Node) -> VisitResult:
        """Convert the strings in ``node["names"]`` to visited Name nodes."""
        # Python AST by default stores global and nonlocal variable names
        # in a "names" array of strings. That breaks the structure of
        # everything else in the AST (dictionaries, properties or list of
        # objects) so we convert those names to Name objects
        node["names"] = [
            self.visit({"ast_type": "Name", "id": name,
                        "lineno": node["lineno"]})
            for name in node["names"]
        ]
        return node

    def visit_NameConstant(self, node: Node) -> Node:
        """
        Turn True/False constants into ``BoolLiteral`` nodes and None into a
        ``NoneLiteral`` node; anything else is typed as ``NameConstant``.
        """
        if "value" in node:
            repr_val = repr(node["value"])
            if repr_val in ("True", "False"):
                node.update({
                    "LiteralValue": "True" if node["value"] else "False",
                    "ast_type": "BoolLiteral",
                })
                return node
            if repr_val == "None":
                return self.visit_NoneType(node)

        node["ast_type"] = "NameConstant"
        return node

    def visit_Num(self, node: Node) -> Node:
        """Replace a complex ``node["n"]`` by a ``{"real", "imag"}`` dict."""
        # complex objects are not json-serializable
        number = node["n"]
        if isinstance(number, complex):
            node["n"] = {"real": number.real, "imag": number.imag}
        return node

    def visit_NoneType(self, node: Node) -> Node:
        """
        Mark ``node`` as a ``NoneLiteral``; a falsy ``node`` (e.g. a bare
        ``None``) is replaced by a new dictionary.
        """
        ret = node if node else {}
        ret.update({"LiteralValue": "None", "ast_type": "NoneLiteral"})
        return ret

    def visit_Attribute(self, node: Node) -> Node:
        """
        Flatten a chain of nested ``value`` nodes into a single
        ``QualifiedIdentifier`` node with an ``identifiers`` list ordered
        from the innermost value to this attribute.

        Only dictionary values are followed: a literal stored under
        ``"value"`` (e.g. the ``True`` of a NameConstant) stays on its node
        and ends the chain. A node without a ``value`` is returned as is.
        """
        # Single deep copy: every link popped below belongs to this copy,
        # so the original sub-tree is never touched.
        value = deepcopy(node.get("value"))
        if not value:
            return node

        ids: List[Node] = []
        while value:
            nested = value.get("value")
            if isinstance(nested, dict) and nested:
                # Detach the inner link so each identifier is a flat node
                del value["value"]
            else:
                nested = None
            ids.append(self.visit(value))
            value = nested
        # Links were collected outermost first
        ids.reverse()

        # Append a copy of this node at the end, and change the type of the
        # original
        node.pop("value", None)
        node_copy = deepcopy(node)
        self._remove_internal(node_copy)
        self._normalize_position(node_copy)
        ids.append(node_copy)

        node["ast_type"] = "QualifiedIdentifier"
        # Copy the position of the first element
        # NOTE(review): ids[0] already went through visit(), so these values
        # are normalized and visit() will normalize this node again; confirm
        # that the position fixer overrides them.
        first = ids[0]
        for key in ("lineno", "end_lineno", "col_offset", "end_col_offset"):
            if first.get(key):
                node[key] = first[key]

        node.pop("attr", None)
        node["identifiers"] = ids
        return node

    def visit_arguments(self, node: Node) -> Node:
        """
        Convert the very odd Python's argument node organization (several
        different lists for each type and each type's default arguments that
        you have to right-match) into a more common in other languages
        single list of types arguments with default values as children of
        their arg. Also convert Python2's "Name" types inside the arguments
        to "arg" nodes.

        The merged list replaces ``node["args"]`` and is ordered: normal
        arguments, keyword-only arguments, kwarg, vararg.
        """

        def match_default_args(args: List[Node],
                               defaults: List[Node]) -> List[Node]:
            # Defaults are right-aligned with their arguments
            if defaults:
                lendiff = len(args) - len(defaults)
                for arg, default in zip(args[lendiff:], defaults):
                    # A None entry means "no default" (required keyword-only
                    # argument), which is different from a default of None
                    if default is None:
                        continue
                    arg["default"] = self.visit(default)
            return args

        def name2arg(arg: Node):
            # Normalize Python2 and 3 argument types
            if arg["ast_type"] == "Name":
                arg["ast_type"] = "arg"
                if arg.get("id"):
                    arg["@token"] = arg.pop("id")

        norm_args: List[Node] = []

        normal_args = deepcopy(node.get("args"))
        if normal_args:
            match_default_args(normal_args, deepcopy(node.get("defaults")))
            norm_args.extend(self.visit(arg) for arg in normal_args)

        kwonly_args = deepcopy(node.get("kwonlyargs"))
        if kwonly_args:
            match_default_args(kwonly_args,
                               deepcopy(node.get("kw_defaults")))
            for arg in kwonly_args:
                arg["ast_type"] = "kwonly_arg"
            norm_args.extend(self.visit(arg) for arg in kwonly_args)

        kwarg = deepcopy(node.get("kwarg"))
        if kwarg:
            kwarg["ast_type"] = "kwarg"
            norm_args.append(self.visit(kwarg))

        vararg = deepcopy(node.get("vararg"))
        if vararg:
            vararg["ast_type"] = "vararg"
            norm_args.append(self.visit(vararg))

        for k in ('defaults', 'kw_defaults', 'args', 'kwonlyargs', 'kwarg',
                  'vararg'):
            node.pop(k, None)

        for n in norm_args:
            if "arg" in n:
                n["@token"] = n.pop("arg")
            name2arg(n)

        node["args"] = norm_args
        return node

    def visit_other(self, node: Node) -> VisitResult:
        """
        Fallback visitor: visit every non-empty child listed in
        ``node["_fields"]`` and store the result back in ``node``.
        """
        fields = node.get("_fields", [])
        if not fields:
            return node

        # The handler only depends on the node type, so look it up once
        meth = getattr(self, "visit_" + node["ast_type"],
                       self.visit_other_field)
        for field in fields:
            child = node.get(field)
            if child:
                node[field] = meth(child)
        return node

    def visit_other_field(self, node: Node) -> VisitResult:
        """
        Visit a field value: dictionaries are visited, lists and tuples are
        visited element by element (returning a list), anything else is
        returned unchanged.
        """
        if isinstance(node, dict):
            return self.visit(node)
        if isinstance(node, (list, tuple)):
            return [self.visit(x) for x in node]
        # string attribute
        return node