def optimize_file(data):
    """Re-parse one file with type removal enabled and refresh ``data`` in place.

    The file named by ``data["filename"]`` is parsed through
    ``parse_file_server`` (parser ``"typescript"``, ``remove_types=True``,
    ``data["source_files"]`` passed as ``deps``). The new tree is compared
    with the tree stored in ``data["ast"]`` and the dependencies are
    recomputed with ``optimize_deps``.

    Args:
        data: mapping providing ``"filename"``, ``"source_files"``, ``"ast"``
            and ``"dependencies"``. ``"dependencies"`` and ``"ast"`` are
            overwritten and ``"num_diffs"`` is stored.

    Returns:
        The same ``data`` object that was passed in.
    """
    filename = data["filename"]

    # Time only the parse; the duration is handed to optimize_deps below.
    started_at = time.time()
    ast_json = parse_file_server(
        filename,
        parser_name="typescript",
        data={
            "remove_types": True,
            "deps": data["source_files"],
        },
    )
    base_time = time.time() - started_at
    assert ast_json is not None

    # Count the differences between the stored tree and the fresh parse.
    reference_tree = AstNode.fromJson(data["ast"], fields=["gold", "target"])
    fresh_tree = AstNode.fromJson(ast_json, fields=["gold", "target"])
    num_diffs = reference_tree.num_tree_diffs(fresh_tree)

    data["dependencies"] = optimize_deps(
        filename,
        data["source_files"],
        data["dependencies"],
        ast_json,
        base_time,
    )
    data["ast"] = ast_json
    data["num_diffs"] = num_diffs
    return data
def ternary_expr(self,
                 left: AstNode,
                 right: AstNode,
                 depth=0,
                 target_type=None,
                 parent=None):
    """Build a ``(<generated expr>) ? left : right`` conditional subtree.

    Args:
        left: node that is deep-copied into the branch after ``?``.
        right: node that is deep-copied into the branch after ``:``.
        depth: depth forwarded to ``gen_bin_expr`` for the condition.
        target_type: constant type forwarded to ``gen_bin_expr``.
        parent: parent given to the new conditional root.

    Returns:
        The new root node (a childless copy of ``self.cond_expr`` that
        received the five children built here).
    """
    cond_root = self.cond_expr.copyNoChildren(parent=parent)

    # "(" <generated expression> ")" used as the condition.
    condition = self.paren_expr.copyNoChildren(parent=cond_root)
    opening = self.open_paren_token.copyNoChildren(parent=condition)
    generated = self.gen_bin_expr(depth=depth,
                                  target_type=target_type,
                                  parent=condition)
    closing = self.close_paren_token.copyNoChildren(parent=condition)
    condition.children = [opening, generated, closing]

    when_true = left.deepCopy(parent=cond_root)
    when_false = right.deepCopy(parent=cond_root)
    question = self.question_token.copyNoChildren(parent=cond_root)
    colon = self.colon_token.copyNoChildren(parent=cond_root)
    cond_root.children = [condition, question, when_true, colon, when_false]

    # Pick randomly which branch keeps its target values; the other branch
    # gets its ids and fields wiped. Per the original author's note this
    # keeps the number and order of nodes to predict the same for the
    # original and the modified tree.
    wiped_branch = random.choice([when_true, when_false])
    for descendant in wiped_branch.forEachNode():
        descendant.id = -1
        descendant.fields.clear()
    return cond_root
def find_params(function_def: AstNode):
    """Return the ``SyntaxList`` that directly follows the ``OpenParenToken``.

    Starting at ``function_def.down_first()``, the search follows
    ``right()`` until a node of type ``"OpenParenToken"`` is reached; the
    node to its right must be a ``"SyntaxList"`` and is returned.

    Args:
        function_def: node whose first child starts the search.

    Returns:
        The ``SyntaxList`` node found right of the opening parenthesis.
    """
    cursor = function_def.down_first()
    assert cursor is not None

    while cursor.type != "OpenParenToken":
        cursor = cursor.right()

    assert cursor.has_right() and cursor.right().type == "SyntaxList"
    return cursor.right()
def matches(self, node: AstNode):
    """Tell whether ``node`` is the ``SyntaxList`` of an object literal.

    The node qualifies when it is a ``"SyntaxList"``, ``node.left()`` is a
    ``"FirstPunctuation"`` node, and ``up()`` of that node is an
    ``"ObjectLiteralExpression"``.

    Returns:
        ``True`` if all three conditions hold, ``False`` otherwise.
    """
    if node.type != "SyntaxList":
        return False

    opener = node.left()
    if opener is None or opener.type != "FirstPunctuation":
        return False

    owner = opener.up()
    return owner is not None and owner.type == "ObjectLiteralExpression"
def apply(self, tree_id, node: AstNode):
    """Swap ``node`` for a ``SyntaxList`` holding its non-comma children shuffled.

    NOTE: this method is deliberately disabled by the ``assert False`` below
    ("unsound implementation"); the remaining code only runs when
    assertions are stripped.

    Args:
        tree_id: identifier of the tree; combined with ``node.id`` to form
            the key stored in ``self.applied_positions``.
        node: node to replace; must satisfy ``self.matches`` and have an
            id other than ``-1``.
    """
    assert self.matches(node)
    assert node.id != -1
    position = (tree_id, node.id)
    assert position not in self.applied_positions
    assert False, "unsound implementation"

    # TODO: the current implementation assumes that structural modifications
    # do not change the prediction order. As a result, reordering structural
    # changes will result in wrong evaluation. To fix this, we would need to
    # compute a permutation that reorders the predictions in the original
    # order and apply it during evaluation.
    shuffled = [entry for entry in node.children if entry.type != "CommaToken"]
    random.shuffle(shuffled)

    replacement = AstNode(idx=-1, type="SyntaxList", value=self.null_id)
    for entry in shuffled:
        # Separate consecutive entries with a fresh comma token.
        if replacement.children:
            replacement.add_child(AstNode(idx=-1, type="CommaToken", value=","))
        replacement.add_child(entry)

    # Remember where the change was applied so that it can be reverted later.
    self.applied_positions[position] = (node, replacement)
    node.swapNodes(replacement)
def apply(self, tree_id, node: AstNode):
    """Swap ``node`` for an array-indexing expression built from copies of it.

    A size between 1 and 4 and an index below that size are drawn at random
    and passed to ``self.gen_array_expression``; the result replaces
    ``node`` via ``swapNodes``.

    Args:
        tree_id: identifier of the tree; combined with ``node.id`` to form
            the key stored in ``self.applied_positions``.
        node: node to replace; its id must not be ``-1``. ``self.matches``
            is not checked here.
    """
    assert node.id != -1
    position = (tree_id, node.id)
    assert position not in self.applied_positions

    num_elements = random.randint(1, 4)
    selected_index = random.randint(0, num_elements - 1)
    replacement = self.gen_array_expression(node, num_elements, selected_index)

    # Remember where the change was applied so that it can be reverted later.
    self.applied_positions[position] = (node, replacement)
    node.swapNodes(replacement)
def apply(self, tree_id, node: AstNode):
    """Swap ``node`` for a ternary expression that has ``node`` on both branches.

    The condition depth is drawn uniformly from 0..3 and handed to
    ``self.expr_gen.ternary_expr`` together with ``node`` as both the left
    and the right operand.

    Args:
        tree_id: identifier of the tree; combined with ``node.id`` to form
            the key stored in ``self.applied_positions``.
        node: node to replace; its id must not be ``-1``. ``self.matches``
            is not checked here.

    Returns:
        The generated expression that replaced ``node``.
    """
    assert node.id != -1
    position = (tree_id, node.id)
    assert position not in self.applied_positions

    condition_depth = random.randint(0, 3)
    replacement = self.expr_gen.ternary_expr(node, node, condition_depth)

    # Remember where the change was applied so that it can be reverted later.
    self.applied_positions[position] = (node, replacement)
    node.swapNodes(replacement)
    return replacement
def gen_constant(self, target_type=None, parent=None):
    """Create a random constant node.

    Args:
        target_type: key into ``self.constant_types_to_node_type``; when
            ``None`` one of ``self.constant_types`` is chosen at random.
        parent: parent passed to the new node.

    Returns:
        A new ``AstNode`` whose ``idx`` and ``origin`` are both
        ``PositionIDs.ADVERSARIAL_CONSTANT`` and whose type and value are
        drawn at random (the value from
        ``self.value_index.per_type_values`` for the chosen node type).
    """
    chosen_type = target_type
    if chosen_type is None:
        chosen_type = random.choice(self.constant_types)

    candidate_node_types = self.constant_types_to_node_type[chosen_type]
    node_type = random.choice(candidate_node_types)
    node_value = random.choice(self.value_index.per_type_values[node_type])

    marker = PositionIDs.ADVERSARIAL_CONSTANT
    constant = AstNode(idx=marker,
                       parent=parent,
                       type=node_type,
                       value=node_value)
    constant.origin = marker
    return constant
def apply(self, tree_id, node: AstNode):
    """Append one or two generated expressions to the argument list under ``node``.

    The argument ``SyntaxList`` is located after the ``OpenParenToken``
    child of ``node``, deep-copied, extended, and then swapped in for the
    original list.

    Args:
        tree_id: identifier of the tree; combined with the argument list's
            id to form the key stored in ``self.applied_positions``.
        node: node satisfying ``self.matches``.
    """
    assert self.matches(node)

    def find_args(function_def: AstNode):
        # Follow right() from the first child until "(" is reached; the
        # node after it must be the SyntaxList.
        cursor = function_def.down_first()
        assert cursor is not None
        while cursor.type != "OpenParenToken":
            cursor = cursor.right()
        assert cursor.has_right() and cursor.right().type == "SyntaxList"
        return cursor.right()

    args = find_args(node)
    extended_args = args.deepCopy(parent=args.parent)

    extra_count = random.randint(1, 2)
    for _ in range(extra_count):
        # A comma is only needed when something already precedes the new entry.
        if extended_args.children:
            extended_args.add_child(
                AstNode(idx=-1, type="CommaToken", value=","))
        generated = self.expr_gen.gen_bin_expr(depth=random.randint(0, 2))
        extended_args.add_child(generated)

    assert args.id != -1
    position = (tree_id, args.id)
    assert position not in self.applied_positions

    # Remember where the change was applied so that it can be reverted later.
    self.applied_positions[position] = (args, extended_args)
    args.swapNodes(extended_args)
def test_single_node(self):
    """A one-entry JSON list yields a single root without value, children or parent."""
    node = AstNode.fromJson([{"id": 0, "type": "Root"}])

    self.assertEqual(len(node), 1)
    self.assertEqual(node.type, "Root")
    # Identity check: assertEqual(x, None) would also pass for any object
    # whose __eq__ claims equality with None.
    self.assertIsNone(node.value)
    self.assertEqual(len(node.children), 0)
    self.assertIsNone(node.parent)
def matches(self, node: AstNode):
    """Decide whether ``node`` is a candidate for the ternary wrapping.

    Returns ``True`` when any of the following holds, ``False`` otherwise:

    * ``AdversarialNodeReplacement.is_constant(node)`` is true;
    * ``node`` is a ``PropertyAccessExpression``;
    * ``node`` is child 0 of a ``PropertyAccessExpression``;
    * ``node`` is a child other than child 1 of a ``BinaryExpression``;
    * ``node`` is a ``BinaryExpression`` whose parent is not an
      ``ExpressionStatement``;
    * ``node`` is child 2 of a ``PropertyAssignment``;
    * the node to the left is a ``ReturnKeyword`` or ``FirstAssignment``.
    """
    if AdversarialNodeReplacement.is_constant(node):
        return True
    if node.type == "PropertyAccessExpression":
        return True

    parent = node.up()
    if parent is not None:
        pos = node.pos_in_parent()

        # x.y -> ((...) ? x : x).y
        if pos == 0 and parent.type == "PropertyAccessExpression":
            return True

        # x + y -> ((...) ? x : x) + y
        if pos != 1 and parent.type == "BinaryExpression":
            return True

        if (node.type == "BinaryExpression"
                and parent.type != "ExpressionStatement"):
            return True

        # { y : x }
        if pos == 2 and parent.type == "PropertyAssignment":
            return True

    # return x -> return (...) ? x : x
    if not node.has_left():
        return False
    return node.left().type in ["ReturnKeyword", "FirstAssignment"]
def get_scope_type(node: AstNode):
    """Look up the target type recorded for a class-like scope node.

    For a ``ClassExpression`` the type is read at the node's own id.
    Otherwise the children are scanned from ``down_first()`` to the right
    until a ``ClassKeyword`` or ``InterfaceKeyword`` is found (or there is
    no further node to the right), and the type is read at that node's id.

    ``sample`` is not a parameter; it is taken from the enclosing scope,
    which is not visible in this block.

    Returns:
        ``[raw_type]``, or ``[raw_type, last_word]`` when ``raw_type``
        contains ``"typeof"`` (``last_word`` being the text after the last
        space).
    """
    if node.type != "ClassExpression":
        node = node.down_first()
        # Stop at the keyword, or at the last node if no keyword exists.
        while (node.type not in ["ClassKeyword", "InterfaceKeyword"]
               and node.has_right()):
            node = node.right()

    raw_type = sample.target[node.id]

    if "typeof" not in raw_type:
        return [raw_type]

    # converts "'typeof Foo'" to ["'typeof Foo'", "Foo"]
    return [raw_type, raw_type.split(" ")[-1]]
def apply(self, tree_id, node: AstNode):
    """Append one to three new ``Parameter`` nodes to the list under ``node``.

    The parameter ``SyntaxList`` is located after the ``OpenParenToken``
    child of ``node``, deep-copied, extended with parameters named
    ``param0``, ``param1``, ... and then swapped in for the original list.

    Args:
        tree_id: identifier of the tree; combined with the parameter
            list's id to form the key stored in ``self.applied_positions``.
        node: node satisfying ``self.matches``.
    """
    assert self.matches(node)

    def find_params(function_def: AstNode):
        # Follow right() from the first child until "(" is reached; the
        # node after it must be the SyntaxList.
        cursor = function_def.down_first()
        assert cursor is not None
        while cursor.type != "OpenParenToken":
            cursor = cursor.right()
        assert cursor.has_right() and cursor.right().type == "SyntaxList"
        return cursor.right()

    params = find_params(node)
    extended_params = params.deepCopy(parent=params.parent)

    extra_count = random.randint(1, 3)
    for index in range(extra_count):
        new_param = AstNode(idx=-1, type="Parameter", value=self.null_id)
        # TODO: enable adversarial modifications of the added identifier
        name_node = AstNode(idx=-1,
                            type="Identifier",
                            value="param{}".format(index))
        new_param.add_child(name_node)

        # A comma is only needed when something already precedes the new entry.
        if extended_params.children:
            extended_params.add_child(
                AstNode(idx=-1, type="CommaToken", value=","))
        extended_params.add_child(new_param)

    assert params.id != -1
    position = (tree_id, params.id)
    assert position not in self.applied_positions

    # Remember where the change was applied so that it can be reverted later.
    self.applied_positions[position] = (params, extended_params)
    params.swapNodes(extended_params)
def apply(self, tree_id, node: AstNode):
    """Insert a generated ``<expr>;`` statement directly before or after ``node``.

    Args:
        tree_id: identifier of the tree; combined with ``node.id`` for the
            ``self.applied_positions`` check.
        node: node satisfying ``self.matches`` with an id other than ``-1``;
            the new statement becomes a child of ``node.parent``.
    """
    assert self.matches(node)

    # Whether the statement goes after (True) or before (False) ``node``.
    is_after = random.choice([True, False])

    statement = AstNode(idx=-1, type="ExpressionStatement", value=self.null_id)
    generated = self.expr_gen.gen_bin_expr(depth=random.randint(0, 2))
    semicolon = AstNode(idx=-1, type="SemicolonToken", value=";")
    statement.add_child(generated)
    statement.add_child(semicolon)

    assert node.id != -1
    position = (tree_id, node.id)
    # NOTE(review): the key is only checked here, never stored by this
    # method; the change is recorded in ``applied_modifications`` instead.
    assert position not in self.applied_positions

    # Remember where the change was applied so that it can be reverted later.
    self.applied_modifications.append((node, statement, is_after))

    insert_at = node.pos_in_parent() + int(is_after)
    node.parent.add_child(statement, pos=insert_at)
def test_single_child(self):
    """A root listing one child id is linked to that child in both directions."""
    node = AstNode.fromJson([
        {
            "id": 0,
            "type": "Root",
            "children": [1]
        },
        {
            "id": 1,
            "type": "Identifier",
            "value": "x"
        },
    ])

    self.assertEqual(len(node), 2)
    self.assertEqual(node.type, "Root")
    # Identity check: assertEqual(x, None) would also pass for any object
    # whose __eq__ claims equality with None.
    self.assertIsNone(node.value)
    self.assertEqual(len(node.children), 1)
    self.assertIsNone(node.parent)

    child = node.children[0]
    self.assertEqual(child.type, "Identifier")
    self.assertEqual(child.value, "x")
    self.assertEqual(child.parent, node)
def gen_bin_expr(self, depth=0, target_type=None, parent=None):
    """Generate a random binary-expression tree, or a constant at depth 0.

    Args:
        depth: ``0`` delegates to ``self.gen_constant``. For a positive
            depth a binary node is created whose two operands are generated
            recursively with a depth drawn from ``0..depth - 1``.
        target_type: constant type used for every leaf; when ``None`` and
            ``depth > 0`` one of ``self.constant_types`` is chosen once and
            shared by the whole subtree.
        parent: parent passed to the created node.

    Returns:
        The node returned by ``gen_constant`` (depth 0) or a new node of
        type ``self.bin_expr_type`` with the children
        ``[operand, operator, operand]``.
    """
    if depth == 0:
        return self.gen_constant(target_type=target_type, parent=parent)

    if target_type is None:
        target_type = random.choice(self.constant_types)

    bin_op = AstNode(idx=-1,
                     parent=parent,
                     type=self.bin_expr_type,
                     value=self.null_id)
    operator_type, operator_value = random.choice(self.value_index.bin_ops)

    # depth > 0 is guaranteed here, so only the recursive form is needed
    # (the former second ``depth == 0`` branch could never run).
    # Children are built left to right to keep the random draw order.
    bin_op.children = [
        self.gen_bin_expr(random.randint(0, depth - 1),
                          target_type=target_type,
                          parent=bin_op),
        AstNode(
            idx=-1,
            parent=bin_op,
            type=operator_type,
            value=operator_value,
        ),
        self.gen_bin_expr(random.randint(0, depth - 1),
                          target_type=target_type,
                          parent=bin_op),
    ]
    return bin_op
def __init__(self, value_index: NodeValueIndexStr):
    """Store the value index and prepare the lookup tables and template nodes.

    Args:
        value_index: kept as ``self.value_index``.
    """
    self.value_index = value_index

    # Constant type name -> node types that can represent such a constant.
    self.constant_types_to_node_type = {
        "string": ["StringLiteral", "TemplateExpression", "FirstTemplateToken"],
        "boolean": ["TrueKeyword", "FalseKeyword"],
        "number": ["FirstLiteralToken"],
    }
    self.constant_types = list(self.constant_types_to_node_type)

    self.null_id = "<null>"
    self.bin_expr_type = "BinaryExpression"

    def template(node_type, value):
        # Every template node is created with idx -1.
        return AstNode(idx=-1, type=node_type, value=value)

    self.cond_expr = template("ConditionalExpression", self.null_id)
    self.paren_expr = template("ParenthesizedExpression", self.null_id)
    self.open_paren_token = template("OpenParenToken", "(")
    self.close_paren_token = template("CloseParenToken", ")")
    self.question_token = template("QuestionToken", "?")
    self.colon_token = template("ColonToken", ":")
def gen_array_expression(self, node, num_elem, select_idx):
    """Build ``[copy, copy, ...][select_idx]`` out of deep copies of ``node``.

    Args:
        node: node that is deep-copied ``num_elem`` times.
        num_elem: number of copies placed in the array literal.
        select_idx: index used in the element access; must satisfy
            ``0 <= select_idx < num_elem``. Only the copy at this position
            keeps its ids and fields.

    Returns:
        The new ``ElementAccessExpression`` node.
    """
    assert 0 <= select_idx < num_elem

    def fresh(node_type, value):
        return AstNode(idx=-1, type=node_type, value=value)

    elem_access = fresh("ElementAccessExpression", self.null_id)
    array_lit = fresh("ArrayLiteralExpression", self.null_id)
    syntax_list = fresh("SyntaxList", self.null_id)

    # NOTE(review): no CommaToken is placed between the elements here —
    # confirm whether that is intended.
    for position in range(num_elem):
        element = node.deepCopy()
        syntax_list.add_child(element)
        if position == select_idx:
            continue
        # Wipe every copy except the selected one. Per the original
        # author's note this keeps the number and order of nodes to predict
        # the same for the original and the modified tree.
        for descendant in element.forEachNode():
            descendant.id = -1
            descendant.fields.clear()

    # "[" <elements> "]"
    array_lit.add_child(fresh("OpenBracketToken", "["))
    array_lit.add_child(syntax_list)
    array_lit.add_child(fresh("CloseBracketToken", "]"))

    # <array literal> "[" <select_idx> "]"
    elem_access.add_child(array_lit)
    elem_access.add_child(fresh("OpenBracketToken", "["))
    # NOTE(review): the literal's value is the int itself, not a string —
    # confirm that consumers accept a non-string value.
    elem_access.add_child(fresh("FirstLiteralToken", select_idx))
    elem_access.add_child(fresh("CloseBracketToken", "]"))
    return elem_access
class ExpressionGenerator:
    """Generates random constant, binary and ternary expression subtrees."""

    def __init__(self, value_index: NodeValueIndexStr):
        """Store the value index and prepare the lookup tables and template nodes.

        Args:
            value_index: kept as ``self.value_index``; ``per_type_values``
                and ``bin_ops`` are read from it by the generators.
        """
        self.value_index = value_index
        # Constant type name -> node types that can represent such a constant.
        self.constant_types_to_node_type = {
            "string":
            ["StringLiteral", "TemplateExpression", "FirstTemplateToken"],
            "boolean": ["TrueKeyword", "FalseKeyword"],
            "number": ["FirstLiteralToken"],
        }
        self.constant_types = list(self.constant_types_to_node_type.keys())
        self.null_id = "<null>"
        self.bin_expr_type = "BinaryExpression"

        # Template nodes; ``ternary_expr`` copies them without children.
        self.cond_expr = AstNode(idx=-1,
                                 type="ConditionalExpression",
                                 value=self.null_id)
        self.paren_expr = AstNode(idx=-1,
                                  type="ParenthesizedExpression",
                                  value=self.null_id)
        self.open_paren_token = AstNode(idx=-1,
                                        type="OpenParenToken",
                                        value="(")
        self.close_paren_token = AstNode(idx=-1,
                                         type="CloseParenToken",
                                         value=")")
        self.question_token = AstNode(idx=-1, type="QuestionToken", value="?")
        self.colon_token = AstNode(idx=-1, type="ColonToken", value=":")

    def gen_constant(self, target_type=None, parent=None):
        """Create a random constant node.

        Args:
            target_type: key into ``self.constant_types_to_node_type``;
                when ``None`` one of ``self.constant_types`` is chosen at
                random.
            parent: parent passed to the new node.

        Returns:
            A new ``AstNode`` whose ``idx`` and ``origin`` are both
            ``PositionIDs.ADVERSARIAL_CONSTANT``.
        """
        if target_type is None:
            target_type = random.choice(self.constant_types)

        node_type = random.choice(
            self.constant_types_to_node_type[target_type])
        node_value = random.choice(self.value_index.per_type_values[node_type])

        astnode = AstNode(
            idx=PositionIDs.ADVERSARIAL_CONSTANT,
            parent=parent,
            type=node_type,
            value=node_value,
        )
        astnode.origin = PositionIDs.ADVERSARIAL_CONSTANT
        return astnode

    def gen_bin_expr(self, depth=0, target_type=None, parent=None):
        """Generate a random binary-expression tree, or a constant at depth 0.

        Args:
            depth: ``0`` delegates to ``gen_constant``. For a positive
                depth a binary node is created whose two operands are
                generated recursively with a depth drawn from
                ``0..depth - 1``.
            target_type: constant type used for every leaf; when ``None``
                and ``depth > 0`` one type is chosen once and shared by the
                whole subtree.
            parent: parent passed to the created node.

        Returns:
            The constant node (depth 0) or a new node of type
            ``self.bin_expr_type`` with the children
            ``[operand, operator, operand]``.
        """
        if depth == 0:
            return self.gen_constant(target_type=target_type, parent=parent)

        if target_type is None:
            target_type = random.choice(self.constant_types)

        bin_op = AstNode(idx=-1,
                         parent=parent,
                         type=self.bin_expr_type,
                         value=self.null_id)
        bin_operand_type, bin_operand_value = random.choice(
            self.value_index.bin_ops)

        # depth > 0 is guaranteed here, so only the recursive form is
        # needed (the former second ``depth == 0`` branch could never run).
        bin_op.children = [
            self.gen_bin_expr(random.randint(0, depth - 1),
                              target_type=target_type,
                              parent=bin_op),
            AstNode(
                idx=-1,
                parent=bin_op,
                type=bin_operand_type,
                value=bin_operand_value,
            ),
            self.gen_bin_expr(random.randint(0, depth - 1),
                              target_type=target_type,
                              parent=bin_op),
        ]
        return bin_op

    def ternary_expr(self,
                     left: AstNode,
                     right: AstNode,
                     depth=0,
                     target_type=None,
                     parent=None):
        """Build a ``(<generated expr>) ? left : right`` conditional subtree.

        Args:
            left: node that is deep-copied into the branch after ``?``.
            right: node that is deep-copied into the branch after ``:``.
            depth: depth forwarded to ``gen_bin_expr`` for the condition.
            target_type: constant type forwarded to ``gen_bin_expr``.
            parent: parent given to the new conditional root.

        Returns:
            The new conditional root node.
        """
        root = self.cond_expr.copyNoChildren(parent=parent)

        # "(" <generated expression> ")" used as the condition.
        paren_expr = self.paren_expr.copyNoChildren(parent=root)
        paren_expr.children = [
            self.open_paren_token.copyNoChildren(parent=paren_expr),
            self.gen_bin_expr(depth=depth,
                              target_type=target_type,
                              parent=paren_expr),
            self.close_paren_token.copyNoChildren(parent=paren_expr),
        ]

        left = left.deepCopy(parent=root)
        right = right.deepCopy(parent=root)
        root.children = [
            paren_expr,
            self.question_token.copyNoChildren(parent=root),
            left,
            self.colon_token.copyNoChildren(parent=root),
            right,
        ]

        # Pick randomly which of left/right keeps its target values; the
        # other branch gets its ids and fields wiped. Per the original
        # author's note this keeps the number and order of nodes to predict
        # the same for the original and the modified tree.
        clear_node = random.choice([left, right])
        for node in clear_node.forEachNode():
            node.id = -1
            node.fields.clear()
        return root
def apply(self, tree_id, node: AstNode):
    """Insert a random ``{ key: value, ... };`` statement next to ``node``.

    Between one and five ``PropertyAssignment`` entries are generated. Each
    key and each value is either a generated constant or an ``Identifier``
    whose value is drawn from the identifiers found under ``node.parent``.

    Args:
        tree_id: identifier of the tree; combined with ``node.id`` for the
            ``self.applied_positions`` check.
        node: node satisfying ``self.matches`` with an id other than ``-1``;
            the new statement becomes a child of ``node.parent``, directly
            before or after ``node``.
    """
    assert self.matches(node)

    # Whether the statement goes after (True) or before (False) ``node``.
    is_after = random.choice([True, False])

    # Identifier values under the parent, reused as keys/values below.
    values = list({
        n.value
        for n in node.parent.forEachNode() if n.type == "Identifier"
    })

    def random_operand():
        # The short-circuit keeps random.random() from being called when
        # there are no identifiers to choose from.
        if not values or random.random() > 0.5:
            return self.expr_gen.gen_constant()
        return AstNode(idx=-1, type="Identifier", value=random.choice(values))

    root = AstNode(idx=-1, type="ExpressionStatement", value=self.null_id)
    obj = AstNode(idx=-1, type="ObjectLiteralExpression", value=self.null_id)
    root.add_child(obj)

    obj.add_child(AstNode(idx=-1, type="FirstPunctuation", value="{"))
    block = AstNode(idx=-1, type="SyntaxList", value=self.null_id)
    obj.add_child(block)

    for i in range(random.randint(1, 5)):
        if i != 0:
            block.add_child(AstNode(idx=-1, type="CommaToken", value=","))

        prop = AstNode(idx=-1, type="PropertyAssignment", value=self.null_id)
        prop.add_child(random_operand())
        prop.add_child(AstNode(idx=-1, type="ColonToken", value=":"))
        prop.add_child(random_operand())
        block.add_child(prop)

    # The closing brace is a sibling of the opening brace, so it is added
    # to the object literal (it used to be appended to the SyntaxList).
    obj.add_child(AstNode(idx=-1, type="CloseBraceToken", value="}"))

    root.add_child(AstNode(idx=-1, type="SemicolonToken", value=";"))

    assert node.id != -1
    key = (tree_id, node.id)
    # NOTE(review): the key is only checked here, never stored by this
    # method; the change is recorded in ``applied_modifications`` instead.
    assert key not in self.applied_positions

    # Remember where the change was applied so that it can be reverted later.
    self.applied_modifications.append((node, root, is_after))
    node.parent.add_child(root, pos=node.pos_in_parent() + int(is_after))